def run(self, fileStore):
    seqFile1 = fileStore.readGlobalFile(self.seqFileID1)
    seqFile2 = fileStore.readGlobalFile(self.seqFileID2)
    if self.blastOptions.compressFiles:
        seqFile1 = decompressFastaFile(seqFile1, fileStore.getLocalTempFile())
        seqFile2 = decompressFastaFile(seqFile2, fileStore.getLocalTempFile())
    blastResultsFile = fileStore.getLocalTempFile()
    runLastz(seqFile1, seqFile2, blastResultsFile,
             lastzArguments=self.blastOptions.lastzArguments,
             gpuLastz=self.blastOptions.gpuLastz)
    if self.blastOptions.realign:
        realignResultsFile = fileStore.getLocalTempFile()
        runCactusRealign(seqFile1, seqFile2,
                         inputAlignmentsFile=blastResultsFile,
                         outputAlignmentsFile=realignResultsFile,
                         realignArguments=self.blastOptions.realignArguments)
        blastResultsFile = realignResultsFile
    resultsFile = fileStore.getLocalTempFile()
    cactus_call(parameters=["cactus_blast_convertCoordinates",
                            blastResultsFile,
                            resultsFile,
                            str(self.blastOptions.roundsOfCoordinateConversion)])
    logger.info("Ran the blast okay")
    return fileStore.writeGlobalFile(resultsFile)
def maskJobOverride(job, config_node, mask_file_path, mask_file_id, min_length):
    """ return a hijacked config file that does just one preprocessing job: mask each
    fasta sequence with the given bed file. if the mask file is a PAF file, a BED is
    extracted from it using coverage gaps of at least min_length. """
    # this was unzipped upstream
    if mask_file_path.endswith('.gz'):
        mask_file_path = mask_file_path[:-3]
    if mask_file_path.endswith('.paf'):
        # convert the PAF to BED
        paf_file = job.fileStore.readGlobalFile(mask_file_id)
        bed_file = job.fileStore.getLocalTempFile()
        if not min_length:
            min_length = 1
        cactus_call(parameters=['pafcoverage', paf_file, '-g', '-m', str(min_length)],
                    outfile=bed_file)
        mask_file_id = job.fileStore.writeGlobalFile(bed_file)
    # rewrite the config
    for node in config_node.findall("preprocessor"):
        config_node.remove(node)
    mask_node = ET.SubElement(config_node, 'preprocessor')
    mask_node.attrib['preprocessJob'] = 'maskFile'
    mask_node.attrib['inputBedID'] = mask_file_id
    return config_node
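# For reference, after maskJobOverride runs, the <preprocessor> section of the
# config collapses to a single element along these lines (the file-store ID
# shown is made up for illustration):
#
#   <preprocessor preprocessJob="maskFile" inputBedID="files/no-job/file-abc123/mask.bed"/>
#
# so the downstream preprocessor performs exactly one masking pass and nothing else.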
def run(self, fileStore):
    # This runs Bob's covered intervals program, which combines the lastz
    # alignment info into intervals of the query.
    alignments = fileStore.readGlobalFile(self.alignmentsID)
    query = fileStore.readGlobalFile(self.queryID)
    maskInfo = fileStore.getLocalTempFile()
    cactus_call(infile=alignments, outfile=maskInfo,
                parameters=["cactus_covered_intervals",
                            "--queryoffsets",
                            "--origin=one",
                            "M=%s" % (int(self.repeatMaskOptions.period * 2))])
    # the previous lastz command outputs a file of intervals (denoted with indices) to softmask.
    # we finish by applying these intervals to the input file, to produce the final, softmasked output.
    args = ["--origin=one"]
    if self.repeatMaskOptions.unmaskOutput:
        args.append("--unmask")
    args.append(maskInfo)
    maskedQuery = fileStore.getLocalTempFile()
    cactus_call(infile=query, outfile=maskedQuery,
                parameters=["cactus_fasta_softmask_intervals.py"] + args)
    return fileStore.writeGlobalFile(maskedQuery)
def alignFastaFragments(self, fileStore, targetFiles, fragments):
    """ Align each query fragment against all the target chunks, stopping early
    to avoid exponential blowup if too many alignments are found. """
    target = fileStore.getLocalTempFile()
    catFiles(targetFiles, target)
    lastZSequenceHandling = ['%s[multiple][nameparse=darkspace]' % os.path.basename(target),
                             '%s[nameparse=darkspace]' % os.path.basename(fragments)]
    if self.repeatMaskOptions.unmaskInput:
        lastZSequenceHandling = ['%s[multiple,unmask][nameparse=darkspace]' % os.path.basename(target),
                                 '%s[unmask][nameparse=darkspace]' % os.path.basename(fragments)]
    alignment = fileStore.getLocalTempFile()
    # Each time a fragment aligns to a base in the sequence, that base's match
    # count is incremented. The plus three for the period parameter is a fudge
    # to ensure sufficient alignments are found.
    cactus_call(outfile=alignment,
                parameters=["cPecanLastz"] + lastZSequenceHandling +
                           self.repeatMaskOptions.lastzOpts.split() +
                           ["--querydepth=keep,nowarn:%i" % (self.repeatMaskOptions.period + 3),
                            "--format=general:name1,zstart1,end1,name2,zstart2+,end2+",
                            "--markend"])
    return alignment
def run(self, fileStore):
    # Align each fragment against a chunk of the input sequence. Each time a
    # fragment aligns to a base in the sequence, that base's match count is
    # incremented. The plus three for the period parameter is a fudge to
    # ensure sufficient alignments are found.
    fragments = fileStore.readGlobalFile(self.fragmentsID)
    targetFiles = [fileStore.readGlobalFile(fileID) for fileID in self.targetIDs]
    target = fileStore.getLocalTempFile()
    catFiles(targetFiles, target)
    lastZSequenceHandling = ['%s[multiple][nameparse=darkspace]' % os.path.basename(target),
                             '%s[nameparse=darkspace]' % os.path.basename(fragments)]
    if self.repeatMaskOptions.unmaskInput:
        lastZSequenceHandling = ['%s[multiple,unmask][nameparse=darkspace]' % os.path.basename(target),
                                 '%s[unmask][nameparse=darkspace]' % os.path.basename(fragments)]
    alignment = fileStore.getLocalTempFile()
    cactus_call(outfile=alignment,
                parameters=["cPecanLastz"] + lastZSequenceHandling +
                           self.repeatMaskOptions.lastzOpts.split() +
                           ["--querydepth=keep,nowarn:%i" % (self.repeatMaskOptions.period + 3),
                            "--format=general:name1,zstart1,end1,name2,zstart2+,end2+",
                            "--markend"])
    return fileStore.writeGlobalFile(alignment)
def testInvariants(self):
    (seqs, _) = getCactusInputs_encode(random.uniform(0, 2))
    # Chimp encode input has duplicate header names.
    seqs = [i for i in seqs if 'chimp' not in i]
    seqs = random.sample(seqs, 2)
    cigarPath = getTempFile()
    cactus_call(parameters=["cPecanLastz",
                            "--format=cigar",
                            "%s[multiple]" % seqs[0],
                            "%s[multiple]" % seqs[1]],
                outfile=cigarPath)
    bed = cactus_call(parameters=["cactus_coverage", seqs[1], cigarPath],
                      check_output=True)
    prevChrom = None
    prevStart = None
    prevEnd = None
    # Check that everything is sorted and there are no overlaps
    for line in bed.split("\n"):
        line = line.strip()  # was a no-op: str.strip() returns a new string
        if line == "":
            continue
        fields = line.split()
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        self.assertTrue(end - start >= 1)
        if chrom == prevChrom:
            self.assertTrue(start > prevStart)
            self.assertTrue(start >= prevEnd)
        # remember this interval so the sortedness check actually compares
        # against the previous line rather than the initial None values
        prevChrom, prevStart, prevEnd = chrom, start, end
    os.remove(cigarPath)
def run(self, fileStore):
    blastResultsFile = fileStore.getLocalTempFile()
    seqFile = fileStore.readGlobalFile(self.seqFileID)
    runSelfLastz(seqFile, blastResultsFile,
                 lastzArguments=self.blastOptions.lastzArguments,
                 gpuLastz=self.blastOptions.gpuLastz)
    if self.blastOptions.realign:
        realignResultsFile = fileStore.getLocalTempFile()
        runCactusSelfRealign(seqFile,
                             inputAlignmentsFile=blastResultsFile,
                             outputAlignmentsFile=realignResultsFile,
                             realignArguments=self.blastOptions.realignArguments)
        blastResultsFile = realignResultsFile
    resultsFile = fileStore.getLocalTempFile()
    cactus_call(parameters=["cactus_blast_convertCoordinates",
                            blastResultsFile,
                            resultsFile,
                            str(self.blastOptions.roundsOfCoordinateConversion)])
    if self.blastOptions.compressFiles:
        #TODO: This throws away the compressed file
        seqFile = compressFastaFile(seqFile)
    logger.info("Ran the self blast okay")
    return fileStore.writeGlobalFile(resultsFile)
def paf_to_lastz(job, paf_file, sort_secondaries=True, mask_bed_id=None):
    """
    Makes lastz output using paf2lastz, splitting the input paf_file into two
    output files: one for the primary alignments and one for the secondaries.

    If sort_secondaries is True, the function returns two file IDs instead of one.
    """
    work_dir = job.fileStore.getLocalTempDir()
    paf_path = os.path.join(work_dir, "alignments.paf")
    lastz_path = os.path.join(work_dir, "alignments.cigar")
    secondary_lastz_path = os.path.join(work_dir, "secondary_alignments.cigar")
    job.fileStore.readGlobalFile(paf_file, paf_path)

    cmd = ['paf2lastz', paf_path, '-q']
    if sort_secondaries:
        cmd += ['-s', secondary_lastz_path]
    if mask_bed_id:
        mask_bed_path = os.path.join(work_dir, "mask.bed")
        job.fileStore.readGlobalFile(mask_bed_id, mask_bed_path)
        # pipe pafmask's output into paf2lastz, which now reads from stdin
        cmd[1] = '-'
        cmd = [['pafmask', paf_path, mask_bed_path], cmd]

    cactus_call(parameters=cmd, outfile=lastz_path)

    lastz_id = job.fileStore.writeGlobalFile(lastz_path)
    if sort_secondaries:
        secondary_id = job.fileStore.writeGlobalFile(secondary_lastz_path)
        return [lastz_id, secondary_id]
    else:
        return lastz_id
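# A minimal sketch of driving paf_to_lastz as a standalone Toil workflow. This
# is not how the pipeline invokes it; the job-store path and file names below
# are illustrative assumptions:
#
#   from toil.common import Toil
#   from toil.job import Job
#
#   options = Job.Runner.getDefaultOptions("./jobstore")
#   with Toil(options) as toil:
#       paf_id = toil.importFile("file://" + os.path.abspath("alignments.paf"))
#       primary_id, secondary_id = toil.start(Job.wrapJobFn(paf_to_lastz, paf_id))
#       toil.exportFile(primary_id, "file://" + os.path.abspath("alignments.cigar"))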
def run(self, fileStore):
    outChunkID = None
    if self.prepOptions.preprocessJob == "checkUniqueHeaders":
        inChunk = fileStore.readGlobalFile(self.inChunkID)
        seqPaths = [fileStore.readGlobalFile(fileID) for fileID in self.seqIDs]
        seqString = " ".join(seqPaths)
        args = [inChunk]
        if self.prepOptions.checkAssemblyHub:
            args += ["--checkAssemblyHub"]
        cactus_call(stdin_string=seqString,
                    parameters=["cactus_checkUniqueHeaders.py"] + args)
        outChunkID = self.inChunkID
    elif self.prepOptions.preprocessJob == "lastzRepeatMask":
        repeatMaskOptions = RepeatMaskOptions(proportionSampled=self.prepOptions.proportionToSample,
                                              minPeriod=self.prepOptions.minPeriod)
        outChunkID = self.addChild(LastzRepeatMaskJob(repeatMaskOptions=repeatMaskOptions,
                                                      queryID=self.inChunkID,
                                                      targetIDs=self.seqIDs)).rv()
    elif self.prepOptions.preprocessJob == "none":
        outChunkID = self.inChunkID
    return outChunkID
def toil_call_blast(job, options, seq_file, project, event, cigar_name, dep_names, *dep_fa_ids):
    work_dir = job.fileStore.getLocalTempDir()
    # serialize the seqfile so cactus-blast can use it
    seq_file_path = os.path.join(work_dir, 'seqfile.txt')
    with open(seq_file_path, 'w') as sf:
        sf.write(str(seq_file))
    # read the fasta files
    assert len(dep_names) == len(dep_fa_ids)
    fa_paths = [os.path.join(work_dir, "{}.pp.fa".format(name)) for name in dep_names]
    for fa_path, fa_id in zip(fa_paths, dep_fa_ids):
        job.fileStore.readGlobalFile(fa_id, fa_path)
    cactus_call(parameters=['cactus-blast',
                            os.path.join(work_dir, 'js'),
                            seq_file_path,
                            os.path.join(work_dir, os.path.basename(cigar_name)),
                            '--root', event,
                            '--pathOverrides'] + fa_paths +
                           ['--pathOverrideNames'] + dep_names +
                           ['--workDir', work_dir,
                            '--maxCores', str(int(job.cores)),
                            '--maxDisk', bytes2humanN(job.disk),
                            '--maxMemory', bytes2humanN(job.memory)] +
                           options.cactusOptions.strip().split(' '))
    # scrape the output files out of the workdir
    out_nameids = []
    for out_file in [f for f in os.listdir(work_dir) if os.path.isfile(os.path.join(work_dir, f))]:
        if out_file.startswith(os.path.basename(cigar_name)):
            out_nameids.append((os.path.basename(out_file),
                                job.fileStore.writeGlobalFile(os.path.join(work_dir, out_file))))
    return out_nameids
def get_mask_bed_from_fasta(job, event, fa_id, fa_path, min_length):
    """ make a bed file from one fasta"""
    work_dir = job.fileStore.getLocalTempDir()
    bed_path = os.path.join(work_dir, os.path.basename(fa_path) + '.mask.bed')
    fa_path = os.path.join(work_dir, os.path.basename(fa_path))
    is_gz = fa_path.endswith(".gz")
    job.fileStore.readGlobalFile(fa_id, fa_path, mutable=is_gz)
    if is_gz:
        cactus_call(parameters=['gzip', '-fd', fa_path])
        fa_path = fa_path[:-3]
    with open(bed_path, 'w') as bed_file, open(fa_path, 'r') as fa_file:
        for seq_record in SeqIO.parse(fa_file, 'fasta'):
            first_mask = None
            for i, c in enumerate(seq_record.seq):
                is_mask = c.islower() or c in ['n', 'N']
                if (is_mask is False or i == len(seq_record.seq) - 1) and \
                   first_mask is not None and i - first_mask >= min_length:
                    # we're one past an interval: write it
                    bed_file.write('{}\t{}\t{}\n'.format('id={}|{}'.format(event, seq_record.id),
                                                         first_mask, i))
                    first_mask = None
                elif is_mask is True and first_mask is None:
                    # we're starting a new interval: remember start position
                    first_mask = i
    return job.fileStore.writeGlobalFile(bed_path)
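# A worked example of the interval scan above, on toy data (not from the
# pipeline): for event "e" and a record ">chr1" whose sequence is
# "ACGnnnACGTacgtACG", with min_length=3, the two masked runs are at 0-based
# positions 3-5 ("nnn") and 10-13 ("acgt"), so the BED lines written are:
#
#   id=e|chr1	3	6
#   id=e|chr1	10	14
#
# i.e. half-open intervals keyed by "id=<event>|<sequence name>".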
def get_mask_bed_from_fasta(job, event, fa_id, fa_path, min_length, work_dir=None):
    """ make a bed file from one fasta"""
    return_id = False
    # hack in a toggle (work_dir) that lets this be called as a job or a function
    if not work_dir:
        work_dir = job.fileStore.getLocalTempDir()
        return_id = True
    bed_path = os.path.join(work_dir, os.path.basename(fa_path) + '.mask.bed')
    fa_path = os.path.join(work_dir, os.path.basename(fa_path))
    is_gz = fa_path.endswith(".gz")
    if return_id:
        job.fileStore.readGlobalFile(fa_id, fa_path, mutable=is_gz)
    if is_gz:
        cactus_call(parameters=['gzip', '-fd', fa_path])
        fa_path = fa_path[:-3]
    cactus_call(parameters=['cactus_softmask2hardmask', fa_path, '-b', '-m', str(min_length)],
                outfile=bed_path)
    if return_id:
        return job.fileStore.writeGlobalFile(bed_path)
    else:
        return bed_path
def clip_vg(job, options, config, vg_path, vg_id):
    """ run clip-vg """
    work_dir = job.fileStore.getLocalTempDir()
    is_decoy = vg_path == options.decoyGraph
    vg_path = os.path.join(work_dir, os.path.basename(vg_path))
    job.fileStore.readGlobalFile(vg_id, vg_path)
    out_path = vg_path + '.clip'

    cmd = ['clip-vg', vg_path, '-f']
    if options.clipLength is not None and not is_decoy:
        cmd += ['-u', str(options.clipLength)]
    for rs in options.rename:
        cmd += ['-r', rs]
    if options.reference:
        cmd += ['-e', options.reference]

    # sort while we're at it
    cmd = [cmd, ['vg', 'ids', '-s', '-']]

    cactus_call(parameters=cmd, outfile=out_path)

    # worth it
    cactus_call(parameters=['vg', 'validate', out_path])

    return job.fileStore.writeGlobalFile(out_path)
def run(self, fileStore):
    assert len(self.targetIDs) >= 1
    assert self.repeatMaskOptions.fragment > 1
    queryFile = fileStore.readGlobalFile(self.queryID)

    # chop up input fasta file into fragments of specified size. fragments
    # overlap by half their length.
    fragOutput = fileStore.getLocalTempFile()
    cactus_call(infile=queryFile, outfile=fragOutput,
                parameters=["cactus_fasta_fragments.py",
                            "--fragment=%s" % str(self.repeatMaskOptions.fragment),
                            "--step=%s" % (str(self.repeatMaskOptions.fragment / 2)),
                            "--origin=zero"])
    fragmentsID = fileStore.writeGlobalFile(fragOutput)

    alignmentJob = self.addChild(AlignFastaFragments(repeatMaskOptions=self.repeatMaskOptions,
                                                     fragmentsID=fragmentsID,
                                                     targetIDs=self.targetIDs))
    maskCoveredIntervalsJob = self.addChild(MaskCoveredIntervals(repeatMaskOptions=self.repeatMaskOptions,
                                                                 alignmentsID=alignmentJob.rv(),
                                                                 queryID=self.queryID))
    alignmentJob.addFollowOn(maskCoveredIntervalsJob)
    return maskCoveredIntervalsJob.rv()
def merge_gafs_into_paf(job, config, gaf_file_ids):
    """ Merge GAF alignments into a single PAF, applying some filters """
    work_dir = job.fileStore.getLocalTempDir()
    paf_path = os.path.join(work_dir, "mz_alignments.paf")
    gaf_paths = []
    for i, gaf_id in enumerate(gaf_file_ids):
        gaf_paths.append("mz_alignment_{}.gaf".format(i))
        job.fileStore.readGlobalFile(gaf_id, os.path.join(work_dir, gaf_paths[-1]))

    xml_node = findRequiredNode(config.xmlRoot, "refgraph")
    mzgaf2paf_opts = []
    mz_filter = getOptionalAttrib(xml_node, "universalMZFilter", float)
    if mz_filter:
        mzgaf2paf_opts += ['-u', str(mz_filter)]
    min_mz = getOptionalAttrib(xml_node, "minMZBlockLength", int)
    if min_mz:
        mzgaf2paf_opts += ['-m', str(min_mz)]
    mapq = getOptionalAttrib(xml_node, "minMAPQ", int)
    if mapq:
        mzgaf2paf_opts += ['-q', str(mapq)]
    gaf_block = getOptionalAttrib(xml_node, "minGAFBlockLength", int)
    if gaf_block:
        mzgaf2paf_opts += ['-b', str(gaf_block)]

    cactus_call(work_dir=work_dir, outfile=paf_path,
                parameters=["mzgaf2paf"] + gaf_paths + mzgaf2paf_opts)

    # these are big, get rid of them as soon as we can (which is now)
    for gaf_id in gaf_file_ids:
        job.fileStore.deleteGlobalFile(gaf_id)

    return job.fileStore.writeGlobalFile(paf_path)
def map_a_to_b(job, a, b, dipcall_filter):
    """Maps fasta a to fasta b with minimap2.

    Args:
        a (global file): fasta file a. In map_all_to_ref, a is an assembly fasta.
        b (global file): fasta file b. In map_all_to_ref, b is the reference.

    Returns:
        global file: the PAF alignment of a to b.
    """
    # write minimap2's output to a local temp file first, then export it to the
    # file store once it is complete (writing a global file before the command
    # runs would only snapshot an empty file)
    paf_path = job.fileStore.getLocalTempFile()
    a_path = job.fileStore.readGlobalFile(a)
    b_path = job.fileStore.readGlobalFile(b)
    if dipcall_filter:
        # note: dipcall also passes "--paf-no-hit", but there is no apparent
        # reason to include those "mappings" only to filter them out later,
        # so the argument is omitted here.
        cactus_call(parameters=["minimap2", "-c", "-xasm5", "--cs", "-r2k",
                                "-o", paf_path, b_path, a_path])
    else:
        cactus_call(parameters=["minimap2", "-cx", "asm5",
                                "-o", paf_path, b_path, a_path])
    return job.fileStore.writeGlobalFile(paf_path)
def run(self, fileStore): """ mask alpha satellites with dna-brnn """ fastaFile = fileStore.readGlobalFile(self.fastaID) cmd = ['dna-brnn', fastaFile] + self.dnabrnnOpts.split() if '-i' not in self.dnabrnnOpts: # pull up the model # todo: is there are more robust way? cmd += ['-i', os.path.join(cactusRootPath(), 'attcc-alpha.knm')] if self.cores: cmd += ['-t', str(self.cores)] bedFile = fileStore.getLocalTempFile() # run dna-brnn to make a bed file cactus_call(outfile=bedFile, parameters=cmd) maskedFile = fileStore.getLocalTempFile() mask_cmd = [ 'cactus_fasta_softmask_intervals.py', '--origin=zero', '--minLength={}'.format(self.minLength), bedFile ] # do the softmasking cactus_call(infile=fastaFile, outfile=maskedFile, parameters=mask_cmd) return fileStore.writeGlobalFile(maskedFile)
def maskCoveredIntervals(self, fileStore, queryFile, alignment):
    """ Mask the query fasta using the alignments to the target. Anything with
    more alignments than the period gets masked. """
    # This runs Bob's covered intervals program, which combines the lastz
    # alignment info into intervals of the query.
    maskInfo = fileStore.getLocalTempFile()
    cactus_call(infile=alignment, outfile=maskInfo,
                parameters=["cactus_covered_intervals",
                            "--queryoffsets",
                            "--origin=one",
                            # * 2 takes into account the effect of the overlap
                            "M=%s" % (int(self.repeatMaskOptions.period * 2))])

    # the previous lastz command outputs a file of intervals (denoted with indices) to softmask.
    # we finish by applying these intervals to the input file, to produce the final, softmasked output.
    args = ["--origin=one"]
    if self.repeatMaskOptions.unmaskOutput:
        args.append("--unmask")
    args.append(maskInfo)
    maskedQuery = fileStore.getLocalTempFile()
    cactus_call(infile=queryFile, outfile=maskedQuery,
                parameters=["cactus_fasta_softmask_intervals.py"] + args)
    return maskedQuery
def clip_vg(job, options, config, vg_path, vg_id):
    """ run clip-vg """
    work_dir = job.fileStore.getLocalTempDir()
    is_decoy = vg_path == options.decoyGraph
    vg_path = os.path.join(work_dir, os.path.basename(vg_path))
    job.fileStore.readGlobalFile(vg_id, vg_path)
    out_path = vg_path + '.clip'

    cmd = ['clip-vg', vg_path, '-f']
    if options.clipLength is not None and not is_decoy:
        cmd += ['-u', str(options.clipLength)]
    for rs in options.rename:
        cmd += ['-r', rs]
    if options.reference:
        cmd += ['-e', options.reference]

    if getOptionalAttrib(findRequiredNode(config.xmlRoot, "hal2vg"), "includeMinigraph",
                         typeFn=bool, default=False):
        # our vg file has minigraph sequences -- we'll filter them out, along with any nodes
        # that don't appear in a non-minigraph path
        graph_event = getOptionalAttrib(findRequiredNode(config.xmlRoot, "graphmap"),
                                        "assemblyName", default="_MINIGRAPH_")
        cmd += ['-d', graph_event]

    # sort while we're at it
    cmd = [cmd, ['vg', 'ids', '-s', '-']]

    cactus_call(parameters=cmd, outfile=out_path)

    # worth it
    cactus_call(parameters=['vg', 'validate', out_path])

    return job.fileStore.writeGlobalFile(out_path)
def computePAFCoverage(job, config_node, paf_id):
    """ compute the gaps in PAF coverage, store them as a bed file, and add the
    bed file's filestore id into the config's dna-brnn xml element """
    paf_file = job.fileStore.readGlobalFile(paf_id)
    bed_file = job.fileStore.getLocalTempFile()
    dnabrnn_node = None
    for node in config_node.findall("preprocessor"):
        if getOptionalAttrib(node, "preprocessJob") == 'dna-brnn':
            dnabrnn_node = node
            break
    assert dnabrnn_node is not None
    min_length = max(1, getOptionalAttrib(dnabrnn_node, 'minLength', typeFn=int, default=0))
    cactus_call(parameters=['pafcoverage', paf_file, '-g', '-m', str(min_length)],
                outfile=bed_file)
    dnabrnn_node.attrib["inputBedID"] = job.fileStore.writeGlobalFile(bed_file)
    return config_node
def toil_call_hal_append_subtrees(job, options, project, root_name, root_hal_id, event_names, *event_ids):
    work_dir = job.fileStore.getLocalTempDir()
    # download the root hal file
    root_file = os.path.join(work_dir, '{}.hal'.format(root_name))
    job.fileStore.readGlobalFile(root_hal_id, root_file, mutable=True)
    # download the hal files from the file store
    hal_files = []
    for event_name, event_id in zip(event_names, event_ids):
        hal_files.append(os.path.join(work_dir, '{}.hal'.format(event_name)))
        job.fileStore.readGlobalFile(event_id, hal_files[-1])
        # append to the root
        cactus_call(parameters=['halAppendSubtree', root_file, hal_files[-1],
                                event_name, event_name, '--merge'] +
                               options.halOptions.strip().split(' '))
    # bypassing toil.exportFile for now as it only works on promises returned by the
    # start job, which isn't how this is set up. also in practice it's often more convenient
    # to output to s3
    # todo: can we just use job.fileStore?
    if options.outHal.startswith('s3://'):
        # write it directly to s3
        write_s3(root_file, options.outHal, region=get_aws_region(options.jobStore))
    else:
        # write the output to disk
        shutil.copy2(root_file, options.outHal)
    return job.fileStore.writeGlobalFile(root_file)
def stopKtserver(dbElem):
    """Attempt to send the terminate signal to a ktserver."""
    try:
        cactus_call(parameters=['ktremotemgr', 'set'] + getRemoteParams(dbElem) + ['TERMINATE', '1'])
    except:
        # The server is likely already down.
        pass
def testMirrorAndOrientAlignments(self):
    cactus_call(parameters=["cactus_mirrorAndOrientAlignments",
                            self.logLevelString,
                            self.simpleInputCigarPath,
                            self.simpleOutputCigarPath])
    with open(self.simpleOutputCigarPath, 'r') as fh:
        outputCigars = [cigar[:-1] for cigar in fh.readlines()]  # Remove new lines

    # For each input alignment check that we have the two, oriented alignments
    for inputCigar in self.inputCigars:
        name1, start1, end1, strand1 = inputCigar.split()[5:9]
        start1, end1 = int(start1), int(end1)
        coordinates1 = name1, start1, end1, strand1

        name2, start2, end2, strand2 = inputCigar.split()[1:5]
        start2, end2 = int(start2), int(end2)
        coordinates2 = name2, start2, end2, strand2

        score = inputCigar.split()[9]
        ops = inputCigar.split()[10:]

        def invertStrand(coordinates):
            # cigar: simpleSeqB1 0 9 + simpleSeqA1 10 0 - 0 M 8 D 1 M 1
            # cigar: simpleSeqB1 9 0 + simpleSeqA1 0 10 - 0 M 1 D 1 M 8
            name, start, end, strand = coordinates
            assert strand in ("+", "-")
            if strand == "+":
                return name, end, start, "-"
            return name, end, start, "+"

        def reverseOps(ops):
            l = ops[:]
            l.reverse()
            l2 = []
            for i, j in zip(l[1::2], l[::2]):
                l2 += [i, j]
            return l2

        def invertOpStrands(ops):
            l = ["I" if op == "D" else ("D" if op == "I" else op) for op in ops[::2]]
            l2 = []
            for op, length in zip(l, ops[1::2]):
                l2 += [op, length]
            return l2

        if strand1 == "+":
            self.assertTrue(self.makeCigar(coordinates1, coordinates2, score, ops) in outputCigars)
        else:
            # Invert the strands
            self.assertTrue(self.makeCigar(invertStrand(coordinates1), invertStrand(coordinates2),
                                           score, reverseOps(ops)) in outputCigars)

        if strand2 == "+":
            self.assertTrue(self.makeCigar(coordinates2, coordinates1,
                                           score, invertOpStrands(ops)) in outputCigars)
        else:
            self.assertTrue(self.makeCigar(invertStrand(coordinates2), invertStrand(coordinates1),
                                           score, invertOpStrands(reverseOps(ops))) in outputCigars)
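# Worked example of the op transforms above (ops are a flat [op, length, ...]
# list of strings): for ops = ["M", "8", "D", "1", "M", "1"],
#
#   reverseOps(ops)      -> ["M", "1", "D", "1", "M", "8"]   # order flipped, (op, length) pairs kept
#   invertOpStrands(ops) -> ["M", "8", "I", "1", "M", "1"]   # D and I swapped, lengths unchanged
#
# which matches the pair of example cigars quoted in invertStrand's comment.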
def tryRun(self, dbElem, logPath, fileStore, existingSnapshotID=None, snapshotExportID=None):
    snapshotDir = os.path.join(fileStore.getLocalTempDir(), 'snapshot')
    os.mkdir(snapshotDir)
    snapshotPath = os.path.join(snapshotDir, KTSERVER_SNAPSHOT_NAME)
    if existingSnapshotID is not None:
        # Extract the existing snapshot to the snapshot
        # directory so it will be automatically loaded
        fileStore.readGlobalFile(existingSnapshotID, userPath=snapshotPath)
    process = cactus_call(server=True, shell=False,
                          parameters=getKtserverCommand(dbElem, logPath, snapshotDir),
                          port=dbElem.getDbPort())
    blockUntilKtserverIsRunning(logPath)
    if existingSnapshotID is not None:
        # Clear the termination flag from the snapshot
        cactus_call(parameters=["ktremotemgr", "remove"] + getRemoteParams(dbElem) + ["TERMINATE"])

    while True:
        # Check for the termination signal
        try:
            cactus_call(parameters=["ktremotemgr", "get"] + getRemoteParams(dbElem) + ["TERMINATE"])
        except:
            # No terminate signal sent yet
            pass
        else:
            # Terminate signal received
            break
        # Check that the DB is still alive
        if process.poll() is not None or isKtServerFailed(logPath):
            with open(logPath) as f:
                raise RuntimeError("KTServer failed. Log: %s" % f.read())
        sleep(60)
    process.send_signal(signal.SIGINT)
    process.wait()
    blockUntilKtserverIsFinished(logPath)
    if snapshotExportID is not None:
        if not os.path.exists(snapshotPath):
            raise RuntimeError("KTServer did not leave a snapshot on termination,"
                               " but a snapshot was requested.")
        if len(glob(os.path.join(snapshotDir, "*.ktss"))) != 1:
            # More than one snapshot file. It's not clear what
            # conditions trigger this--if any--but we
            # don't support it right now.
            raise RuntimeError("KTServer left more than one snapshot.")
        # Export the snapshot file to the file store
        fileStore.jobStore.updateFile(snapshotExportID, snapshotPath)
def gpuRepeatMask(self, fileStore, targetFile):
    """ This is the gpu version of above. It's much simpler in that there's no
    chunking or fragmenting """
    alignment_dir = fileStore.getLocalTempDir()

    # don't think gpu lastz can handle this
    assert not self.repeatMaskOptions.unmaskInput

    # filter out some default lastz options in the config that aren't supported
    lastz_opts = self.repeatMaskOptions.lastzOpts.split()
    gpu_opts = []
    for i in range(len(lastz_opts)):
        if lastz_opts[i] == "--ungapped" or lastz_opts[i] == "--nogapped":
            pass
        elif lastz_opts[i] is None or lastz_opts[i].startswith("--queryhsplimit="):
            pass
        elif lastz_opts[i] == "--queryhsplimit":
            lastz_opts[i + 1] = None
        else:
            gpu_opts += [lastz_opts[i]]

    cmd = ["segalign_repeat_masker",
           targetFile,
           "--lastz_interval={}".format(self.repeatMaskOptions.gpuLastzInterval),
           "--markend",
           "--neighbor_proportion", str(self.repeatMaskOptions.proportionSampled),
           # note: segalign now includes cactus_covered_intervals, so we pass the threshold here
           # and skip running it below
           "--M", str(self.repeatMaskOptions.period)] + gpu_opts

    cactus_call(parameters=cmd, work_dir=alignment_dir)

    # scrape the segalign output into one big file, making an effort to read in numeric order
    merged_path = fileStore.getLocalTempFile()
    with open(merged_path, "a") as merged_file:
        for work_file in sorted(os.listdir(alignment_dir),
                                key=lambda x: int(re.sub("[^0-9]", "", x))):
            # segalign_repeat_masker makes files that look like "tmp10.block0.intervals"
            # (not that there should be anything else in this directory)
            if work_file.startswith("tmp") and work_file.endswith("intervals"):
                # append it to the merged file and delete it right away to keep disk usage lower
                with open(os.path.join(alignment_dir, work_file), "r") as frag_file:
                    shutil.copyfileobj(frag_file, merged_file)
                os.remove(os.path.join(alignment_dir, work_file))

    return merged_path
def subtractBed(bed1, bed2, destBed):
    """Subtract two non-bed12 beds"""
    # tmp. don't really want to use bedtools
    if os.path.getsize(bed1) == 0 or os.path.getsize(bed2) == 0:
        # bedtools will complain on zero-size beds
        os.rename(bed1, destBed)
    else:
        cactus_call(outfile=destBed,
                    parameters=["subtract", "-a", bed1, "-b", bed2])
def run(self, fileStore):
    chunkList = [readGlobalFileWithoutCache(fileStore, fileID) for fileID in self.chunkIDList]
    # Docker expects paths relative to the work dir
    chunkList = [os.path.basename(chunk) for chunk in chunkList]
    outSequencePath = fileStore.getLocalTempFile()
    cactus_call(outfile=outSequencePath, stdin_string=" ".join(chunkList),
                parameters=["cactus_batch_mergeChunks"])
    return fileStore.writeGlobalFile(outSequencePath)
def calculateCoverage(sequenceFile, cigarFile, outputFile, fromGenome=None,
                      depthById=False, work_dir=None):
    logger.info("Calculating coverage of cigar file %s on %s, writing to %s" %
                (cigarFile, sequenceFile, outputFile))
    args = [sequenceFile, cigarFile]
    if fromGenome is not None:
        args += ["--from", fromGenome]
    if depthById:
        args += ["--depthById"]
    cactus_call(outfile=outputFile, work_dir=work_dir,
                parameters=["cactus_coverage"] + args)
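# Example call (the file names are illustrative, not from the pipeline):
# compute the coverage of a set of cigar alignments on a genome, keeping only
# alignments originating from one other genome:
#
#   calculateCoverage("ingroup.fa", "alignments.cigar", "coverage.bed",
#                     fromGenome="outgroup.fa")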
def compress_gaf(job, gaf_file_id):
    gaf_path = job.fileStore.readGlobalFile(gaf_file_id)
    zip_path = job.fileStore.getLocalTempFile()
    cactus_call(parameters=['gzip', gaf_path, '-c'], outfile=zip_path)
    job.fileStore.deleteGlobalFile(gaf_file_id)
    return job.fileStore.writeGlobalFile(zip_path)
def mappingQualityRescoring(job, inputAlignmentFileID, minimumMapQValue,
                            maxAlignmentsPerSite, alpha, logLevel):
    """ Function to rescore and filter alignments by calculating the mapping
    quality of sub-alignments. Returns primary alignments and secondary
    alignments in two separate files. """
    inputAlignmentFile = job.fileStore.readGlobalFile(inputAlignmentFileID)
    job.fileStore.logToMaster("Input cigar file has %s lines" % countLines(inputAlignmentFile))

    # Get temporary files
    assert maxAlignmentsPerSite >= 1
    tempAlignmentFiles = [job.fileStore.getLocalTempFile() for i in range(maxAlignmentsPerSite)]

    # Mirror and orient alignments, sort, split overlaps and calculate mapping qualities
    cactus_call(parameters=[["cat", inputAlignmentFile],
                            ["cactus_mirrorAndOrientAlignments", logLevel],
                            # This sorts by coordinate
                            ["sort", "-T{}".format(job.fileStore.getLocalTempDir()),
                             "-k6,6", "-k7,7n", "-k8,8n"],
                            # This eliminates any annoying duplicates if lastz
                            # reports the alignment in both orientations
                            ["uniq"],
                            ["cactus_splitAlignmentOverlaps", logLevel],
                            ["cactus_calculateMappingQualities", logLevel,
                             str(maxAlignmentsPerSite), str(minimumMapQValue),
                             str(alpha)] + tempAlignmentFiles])

    # Merge together the output files in order
    secondaryTempAlignmentFile = job.fileStore.getLocalTempFile()
    if len(tempAlignmentFiles) > 1:
        cactus_call(parameters=[["cat"] + tempAlignmentFiles[1:]],
                    outfile=secondaryTempAlignmentFile)

    job.fileStore.logToMaster("Filtered, non-overlapping primary cigar file has %s lines" %
                              countLines(tempAlignmentFiles[0]))
    job.fileStore.logToMaster("Filtered, non-overlapping secondary cigar file has %s lines" %
                              countLines(secondaryTempAlignmentFile))

    # Now write back alignments results file and return
    return (job.fileStore.writeGlobalFile(tempAlignmentFiles[0]),
            job.fileStore.writeGlobalFile(secondaryTempAlignmentFile))
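# Note on the parameters structure above: when cactus_call is given a list of
# lists, each inner list is one command and the commands are chained into a
# shell-style pipeline (the same convention exercised by testCactusCallPipes
# below). The call is therefore roughly equivalent to:
#
#   cat in.cigar \
#     | cactus_mirrorAndOrientAlignments <logLevel> \
#     | sort -k6,6 -k7,7n -k8,8n | uniq \
#     | cactus_splitAlignmentOverlaps <logLevel> \
#     | cactus_calculateMappingQualities <logLevel> <maxAlignmentsPerSite> \
#         <minimumMapQValue> <alpha> tmp0 tmp1 ...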
def minimap_index(job, ref_name, ref_id):
    """ make a minimap2 index of a reference genome """
    work_dir = job.fileStore.getLocalTempDir()
    fa_path = os.path.join(work_dir, os.path.basename(ref_name))
    idx_path = fa_path + ".idx"
    job.fileStore.readGlobalFile(ref_id, fa_path)
    cactus_call(parameters=['minimap2', fa_path, '-d', idx_path, '-x', 'asm5'])
    return job.fileStore.writeGlobalFile(idx_path)
def tryRun(self, dbElem, logPath, fileStore, existingSnapshotID=None, snapshotExportID=None):
    snapshotDir = os.path.join(fileStore.getLocalTempDir(), 'snapshot')
    os.mkdir(snapshotDir)
    snapshotPath = os.path.join(snapshotDir, KTSERVER_SNAPSHOT_NAME)
    if existingSnapshotID is not None:
        # Extract the existing snapshot to the snapshot
        # directory so it will be automatically loaded
        fileStore.readGlobalFile(existingSnapshotID, userPath=snapshotPath)
    process = cactus_call(server=True, shell=False,
                          parameters=getKtserverCommand(dbElem, logPath, snapshotDir),
                          port=dbElem.getDbPort())
    blockUntilKtserverIsRunning(logPath)
    if existingSnapshotID is not None:
        # Clear the termination flag from the snapshot
        cactus_call(parameters=["ktremotemgr", "remove"] + getRemoteParams(dbElem) + ["TERMINATE"])

    while True:
        # Check for the termination signal
        try:
            cactus_call(parameters=["ktremotemgr", "get"] + getRemoteParams(dbElem) + ["TERMINATE"],
                        swallowStdErr=True)
        except:
            # No terminate signal sent yet
            pass
        else:
            # Terminate signal received
            break
        # Check that the DB is still alive
        if process.poll() is not None or isKtServerFailed(logPath):
            with open(logPath) as f:
                raise RuntimeError("KTServer failed. Log: %s" % f.read())
        sleep(60)
    process.send_signal(signal.SIGINT)
    process.wait()
    blockUntilKtserverIsFinished(logPath)
    if snapshotExportID is not None:
        if not os.path.exists(snapshotPath):
            with open(logPath) as f:
                raise RuntimeError("KTServer did not leave a snapshot on termination,"
                                   " but a snapshot was requested. Log: %s" % f.read())
        if len(glob(os.path.join(snapshotDir, "*.ktss"))) != 1:
            # More than one snapshot file. It's not clear what
            # conditions trigger this--if any--but we
            # don't support it right now.
            with open(logPath) as f:
                raise RuntimeError("KTServer left more than one snapshot. Log: %s" % f.read())
        # Export the snapshot file to the file store
        fileStore.jobStore.updateFile(snapshotExportID, snapshotPath)
def testCalculateMappingQualities(self):
    with open(self.simpleInputCigarPath, 'w') as fH:
        fH.write("\n".join(self.sortedNonOverlappingInputCigars) + "\n")

    cactus_call(parameters=["cactus_calculateMappingQualities",
                            self.logLevelString, '1', '0', "1.0",
                            self.simpleOutputCigarPath,
                            self.simpleInputCigarPath])

    with open(self.simpleOutputCigarPath, 'r') as fh:
        outputCigars = [cigar[:-1] for cigar in fh.readlines()]  # Remove new lines

    self.assertEqual(self.filteredSortedNonOverlappingInputCigars, outputCigars)
def testCactusCallPipes(self):
    inputFile = getTempFile(rootDir=self.tempDir)
    with open(inputFile, 'w') as f:
        f.write('foobar\n')
    # using 'cat' here rather than infile is intentional; it tests
    # whether the directory is mounted into containers correctly.
    output = cactus_call(parameters=[['cat', inputFile],
                                     ['sed', 's/foo/baz/g'],
                                     ['awk', '{ print "quux" $0 }']],
                         check_output=True)
    self.assertEquals(output, 'quuxbazbar\n')
def testCactusCall(self):
    inputFile = getTempFile(rootDir=self.tempDir)
    # base64-encode 1 KiB of random bytes into the input file.
    # (str.encode('base64') was Python 2 only; base64.b64encode is the
    # Python 3 equivalent and requires "import base64")
    with open("/dev/urandom", "rb") as randText:
        with open(inputFile, 'w') as fh:
            fh.write(base64.b64encode(randText.read(1024)).decode())
    input = "".join(open(inputFile).read().split("\n"))

    # Send input to container's stdin through a file, get output from stdout
    output = "".join(cactus_call(infile=inputFile, check_output=True,
                                 parameters=["docker_test_script"]).split("\n"))
    self.assertEquals(input, output)

    # Send input as string, get output from stdout
    output = "".join(cactus_call(stdin_string=input, check_output=True,
                                 parameters=["docker_test_script"]).split("\n"))
    self.assertEquals(input, output)
def run(self, fileStore):
    blastResultsFile = fileStore.getLocalTempFile()
    seqFile = fileStore.readGlobalFile(self.seqFileID)
    runSelfLastz(seqFile, blastResultsFile, lastzArguments=self.blastOptions.lastzArguments)
    if self.blastOptions.realign:
        realignResultsFile = fileStore.getLocalTempFile()
        runCactusSelfRealign(seqFile, inputAlignmentsFile=blastResultsFile,
                             outputAlignmentsFile=realignResultsFile,
                             realignArguments=self.blastOptions.realignArguments)
        blastResultsFile = realignResultsFile
    resultsFile = fileStore.getLocalTempFile()
    cactus_call(parameters=["cactus_blast_convertCoordinates",
                            blastResultsFile,
                            resultsFile,
                            str(self.blastOptions.roundsOfCoordinateConversion)])
    if self.blastOptions.compressFiles:
        #TODO: This throws away the compressed file
        seqFile = compressFastaFile(seqFile)
    logger.info("Ran the self blast okay")
    return fileStore.writeGlobalFile(resultsFile)
def run(self, fileStore):
    seqFile1 = fileStore.readGlobalFile(self.seqFileID1)
    seqFile2 = fileStore.readGlobalFile(self.seqFileID2)
    if self.blastOptions.compressFiles:
        seqFile1 = decompressFastaFile(seqFile1, fileStore.getLocalTempFile())
        seqFile2 = decompressFastaFile(seqFile2, fileStore.getLocalTempFile())
    blastResultsFile = fileStore.getLocalTempFile()
    runLastz(seqFile1, seqFile2, blastResultsFile,
             lastzArguments=self.blastOptions.lastzArguments)
    if self.blastOptions.realign:
        realignResultsFile = fileStore.getLocalTempFile()
        runCactusRealign(seqFile1, seqFile2,
                         inputAlignmentsFile=blastResultsFile,
                         outputAlignmentsFile=realignResultsFile,
                         realignArguments=self.blastOptions.realignArguments)
        blastResultsFile = realignResultsFile
    resultsFile = fileStore.getLocalTempFile()
    cactus_call(parameters=["cactus_blast_convertCoordinates",
                            blastResultsFile,
                            resultsFile,
                            str(self.blastOptions.roundsOfCoordinateConversion)])
    logger.info("Ran the blast okay")
    return fileStore.writeGlobalFile(resultsFile)
def findOccupiedPorts():
    """Attempt to find all currently taken TCP ports.

    Returns a set of ints, representing taken ports."""
    netstatOutput = cactus_call(parameters=["netstat", "-tuplen"], check_output=True)
    ports = set()
    for line in netstatOutput.split("\n"):
        fields = line.split()
        if len(fields) != 9:
            # Header or other garbage line
            continue
        port = int(fields[3].split(':')[-1])
        ports.add(port)
    logger.debug('Detected ports in use: %s' % repr(ports))
    return ports
def exportHal(job, project, event=None, cacheBytes=None, cacheMDC=None, cacheRDC=None,
              cacheW0=None, chunk=None, deflate=None, inMemory=False):
    HALPath = "tmp_alignment.hal"

    # traverse tree to make sure we are going breadth-first
    tree = project.mcTree

    # find subtree if event specified
    rootNode = None
    if event is not None:
        assert event in tree.nameToId and not tree.isLeaf(tree.nameToId[event])
        rootNode = tree.nameToId[event]

    for node in tree.breadthFirstTraversal(rootNode):
        genomeName = tree.getName(node)
        if genomeName in project.expMap:
            experimentFilePath = job.fileStore.readGlobalFile(project.expIDMap[genomeName])
            experiment = ExperimentWrapper(ET.parse(experimentFilePath).getroot())

            outgroups = experiment.getOutgroupEvents()
            experiment.setConfigPath(job.fileStore.readGlobalFile(experiment.getConfigID()))
            expTreeString = NXNewick().writeString(experiment.getTree(onlyThisSubtree=True))
            assert len(expTreeString) > 1
            assert experiment.getHalID() is not None
            assert experiment.getHalFastaID() is not None
            subHALPath = job.fileStore.readGlobalFile(experiment.getHalID())
            halFastaPath = job.fileStore.readGlobalFile(experiment.getHalFastaID())

            args = [os.path.basename(subHALPath), os.path.basename(halFastaPath),
                    expTreeString, os.path.basename(HALPath)]

            if len(outgroups) > 0:
                args += ["--outgroups", ",".join(outgroups)]
            if cacheBytes is not None:
                args += ["--cacheBytes", cacheBytes]
            if cacheMDC is not None:
                args += ["--cacheMDC", cacheMDC]
            if cacheRDC is not None:
                args += ["--cacheRDC", cacheRDC]
            if cacheW0 is not None:
                args += ["--cacheW0", cacheW0]
            if chunk is not None:
                args += ["--chunk", chunk]
            if deflate is not None:
                args += ["--deflate", deflate]
            if inMemory is True:
                args += ["--inMemory"]

            cactus_call(parameters=["halAppendCactusSubtree"] + args)

    cactus_call(parameters=["halSetMetadata", HALPath, "CACTUS_COMMIT", cactus_commit])
    with job.fileStore.readGlobalFileStream(project.configID) as configFile:
        cactus_call(parameters=["halSetMetadata", HALPath, "CACTUS_CONFIG",
                                b64encode(configFile.read())])

    return job.fileStore.writeGlobalFile(HALPath)
def logAssemblyStats(job, message, name, sequenceID, preemptable=True):
    sequenceFile = job.fileStore.readGlobalFile(sequenceID)
    analysisString = cactus_call(parameters=["cactus_analyseAssembly", sequenceFile],
                                 check_output=True)
    job.fileStore.logToMaster("%s, got assembly stats for genome %s: %s" %
                              (message, name, analysisString))
def run(self, fileStore):
    # Trim outgroup, convert outgroup coordinates, and add to
    # outgroup fragments dir
    outgroupSequenceFiles = [fileStore.readGlobalFile(fileID) for fileID in self.outgroupSequenceIDs]
    mostRecentResultsFile = fileStore.readGlobalFile(self.mostRecentResultsID)
    trimmedOutgroup = fileStore.getLocalTempFile()
    outgroupCoverage = fileStore.getLocalTempFile()
    calculateCoverage(outgroupSequenceFiles[0], mostRecentResultsFile, outgroupCoverage)
    # The windowSize and threshold are fixed at 1: anything more
    # and we will run into problems with alignments that aren't
    # covered in a matching trimmed sequence.
    trimSequences(outgroupSequenceFiles[0], outgroupCoverage, trimmedOutgroup,
                  flanking=self.blastOptions.trimOutgroupFlanking,
                  windowSize=1, threshold=1)
    outgroupConvertedResultsFile = fileStore.getLocalTempFile()
    with open(outgroupConvertedResultsFile, 'w') as f:
        upconvertCoords(cigarPath=mostRecentResultsFile,
                        fastaPath=trimmedOutgroup,
                        contigNum=1,
                        outputFile=f)
    self.outgroupFragmentIDs.append(fileStore.writeGlobalFile(trimmedOutgroup))

    sequenceFiles = [fileStore.readGlobalFile(path) for path in self.sequenceIDs]
    untrimmedSequenceFiles = [fileStore.readGlobalFile(path) for path in self.untrimmedSequenceIDs]

    # Report coverage of the latest outgroup on the trimmed ingroups.
    for trimmedIngroupSequence, ingroupSequence, ingroupName in zip(sequenceFiles, untrimmedSequenceFiles, self.ingroupNames):
        tmpIngroupCoverage = fileStore.getLocalTempFile()
        calculateCoverage(trimmedIngroupSequence, mostRecentResultsFile, tmpIngroupCoverage)
        fileStore.logToMaster("Coverage on %s from outgroup #%d, %s: %s%% (current ingroup length %d, untrimmed length %d). Outgroup trimmed to %d bp from %d" %
                              (ingroupName, self.outgroupNumber,
                               self.outgroupNames[self.outgroupNumber - 1],
                               percentCoverage(trimmedIngroupSequence, tmpIngroupCoverage),
                               sequenceLength(trimmedIngroupSequence),
                               sequenceLength(ingroupSequence),
                               sequenceLength(trimmedOutgroup),
                               sequenceLength(outgroupSequenceFiles[0])))

    # Convert the alignments' ingroup coordinates.
    ingroupConvertedResultsFile = fileStore.getLocalTempFile()
    if self.sequenceIDs == self.untrimmedSequenceIDs:
        # No need to convert ingroup coordinates on first run.
        shutil.copy(outgroupConvertedResultsFile, ingroupConvertedResultsFile)
    else:
        cactus_call(parameters=["cactus_blast_convertCoordinates",
                                "--onlyContig1",
                                outgroupConvertedResultsFile,
                                ingroupConvertedResultsFile,
                                "1"])

    # Append the latest results to the accumulated outgroup coverage file
    if self.outgroupResultsID:
        outgroupResultsFile = fileStore.readGlobalFile(self.outgroupResultsID, mutable=True)
    else:
        outgroupResultsFile = fileStore.getLocalTempFile()
    with open(ingroupConvertedResultsFile) as results:
        with open(outgroupResultsFile, 'a') as output:
            output.write(results.read())
    self.outgroupResultsID = fileStore.writeGlobalFile(outgroupResultsFile)

    # Report coverage of all the outgroup alignments so far on the ingroups.
    ingroupCoverageFiles = []
    self.ingroupCoverageIDs = []
    for ingroupSequence, ingroupName in zip(untrimmedSequenceFiles, self.ingroupNames):
        ingroupCoverageFile = fileStore.getLocalTempFile()
        calculateCoverage(sequenceFile=ingroupSequence,
                          cigarFile=outgroupResultsFile,
                          outputFile=ingroupCoverageFile,
                          depthById=self.blastOptions.trimOutgroupDepth > 1)
        ingroupCoverageFiles.append(ingroupCoverageFile)
        self.ingroupCoverageIDs.append(fileStore.writeGlobalFile(ingroupCoverageFile))
        fileStore.logToMaster("Cumulative coverage of %d outgroups on ingroup %s: %s" %
                              (self.outgroupNumber, ingroupName,
                               percentCoverage(ingroupSequence, ingroupCoverageFile)))

    if len(self.outgroupSequenceIDs) > 1:
        # Trim ingroup seqs and recurse on the next outgroup.
        trimmedSeqs = []
        # Use the accumulated results so far to trim away the
        # aligned parts of the ingroups.
        for i, sequenceFile in enumerate(untrimmedSequenceFiles):
            outgroupCoverageFile = ingroupCoverageFiles[i]
            selfCoverageFile = fileStore.getLocalTempFile()
            coverageFile = fileStore.getLocalTempFile()
            if self.blastOptions.keepParalogs:
                subtractBed(outgroupCoverageFile, selfCoverageFile, coverageFile)
            else:
                coverageFile = outgroupCoverageFile
            trimmed = fileStore.getLocalTempFile()
            trimSequences(sequenceFile, coverageFile, trimmed, complement=True,
                          flanking=self.blastOptions.trimFlanking,
                          minSize=self.blastOptions.trimMinSize,
                          threshold=self.blastOptions.trimThreshold,
                          windowSize=self.blastOptions.trimWindowSize,
                          depth=self.blastOptions.trimOutgroupDepth)
            trimmedSeqs.append(trimmed)
        trimmedSeqIDs = [fileStore.writeGlobalFile(path, cleanup=True) for path in trimmedSeqs]
        return self.addChild(BlastFirstOutgroup(
            ingroupNames=self.ingroupNames,
            untrimmedSequenceIDs=self.untrimmedSequenceIDs,
            sequenceIDs=trimmedSeqIDs,
            outgroupNames=self.outgroupNames,
            outgroupSequenceIDs=self.outgroupSequenceIDs[1:],
            outgroupFragmentIDs=self.outgroupFragmentIDs,
            outgroupResultsID=self.outgroupResultsID,
            blastOptions=self.blastOptions,
            outgroupNumber=self.outgroupNumber + 1,
            ingroupCoverageIDs=self.ingroupCoverageIDs)).rv()
    else:
        # Finally, put the ingroups and outgroups results together
        return (self.outgroupResultsID, self.outgroupFragmentIDs,
                self.ingroupCoverageIDs)
def stopKtserver(dbElem):
    """Attempt to send the terminate signal to a ktserver."""
    cactus_call(parameters=['ktremotemgr', 'set'] + getRemoteParams(dbElem) + ['TERMINATE', '1'])
def testCuTest(self):
    cactus_call(parameters=["referenceTests", getLogLevelString()])
def testPosetAlignerAPI(self):
    """Run all the cactus base aligner CuTests, fail if any of them fail.
    """
    cactus_call(parameters=["cactus_barTests", getLogLevelString()])
def testSplitAlignmentsOverlaps(self):
    self.inputCigars = [
        'cigar: simpleSeqB1 9 18 + simpleSeqA1 2 6 + 1.000000 M 3 I 5 M 1',
        'cigar: simpleSeqB1 9 18 + simpleSeqA1 2 6 + 4.000000 M 1 I 5 M 3',
        'cigar: simpleSeqZ1 0 1 + simpleSeqA1 6 7 + 3.000000 M 1',
        'cigar: simpleSeqB1 18 28 + simpleSeqA2 0 10 + 8.000000 M 1 I 2 M 2 D 2 M 5',
        'cigar: simpleSeqB1 28 30 + simpleSeqA2 6 8 + 3.000000 M 2',
        'cigar: simpleSeqB1 32 30 - simpleSeqA2 7 9 + 72.000000 M 2',
        'cigar: simpleSeqBC 9 0 - simpleSeqAC 0 10 + 5.000000 M 1 D 1 M 8',
        'cigar: simpleSeqA1 2 6 + simpleSeqB1 9 18 + 1.000000 M 3 D 5 M 1',
        'cigar: simpleSeqA1 2 6 + simpleSeqB1 9 18 + 4.000000 M 1 D 5 M 3',
        'cigar: simpleSeqA2 0 10 + simpleSeqB1 18 28 + 8.000000 M 1 D 2 M 2 I 2 M 5',
        'cigar: simpleSeqA2 6 8 + simpleSeqB1 28 30 + 3.000000 M 2',
        'cigar: simpleSeqA2 9 7 - simpleSeqB1 30 32 + 72.000000 M 2',
        'cigar: simpleSeqAC 10 0 - simpleSeqBC 0 9 + 5.000000 M 8 I 1 M 1',
        'cigar: simpleSeqD 0 5 + simpleSeqC1 0 5 + 2.000000 M 5',
        'cigar: simpleSeqNonExistent 0 10 + simpleSeqC1 0 10 + 0.500000 M 10',
        'cigar: simpleSeqD 5 10 + simpleSeqC1 5 10 + 8.000000 M 5',
        'cigar: simpleSeqC1 15 20 + simpleSeqC1 10 15 + 19.000000 M 5',
        'cigar: simpleSeqC1 10 15 + simpleSeqC1 15 20 + 19.000000 M 5',
        'cigar: simpleSeqC1 0 5 + simpleSeqD 0 5 + 2.000000 M 5',
        'cigar: simpleSeqC1 5 10 + simpleSeqD 5 10 + 8.000000 M 5',
        'cigar: simpleSeqC1 0 10 + simpleSeqNonExistent 0 10 + 0.500000 M 10',
        'cigar: simpleSeqA1 6 7 + simpleSeqZ1 0 1 + 3.000000 M 1'
    ]
    with open(self.simpleInputCigarPath, 'w') as fH:
        fH.write("\n".join(self.inputCigars) + "\n")

    cactus_call(parameters=["cactus_splitAlignmentOverlaps",
                            self.logLevelString,
                            self.simpleInputCigarPath,
                            self.simpleOutputCigarPath])
    with open(self.simpleOutputCigarPath, 'r') as fh:
        outputCigars = [cigar[:-1] for cigar in fh.readlines()]  # Remove new lines

    # Get start and end coordinates of cigars
    ends = set()
    for inputCigar in self.inputCigars:
        name1, start1, end1, strand1 = inputCigar.split()[5:9]
        ends.add((name1, int(start1)))
        ends.add((name1, int(end1)))
        assert strand1 == "+"

    # Count of expected number of chopped up cigars
    totalExpectedCigars = 0

    # Function to split a list of ops into a prefix and suffix list
    def splitPrefixOps(ops, cutPoint):
        pOps, sOps = [], []
        j = 0
        for i in range(0, len(ops), 2):
            op, length = ops[i], int(ops[i+1])
            assert op in ("I", "D", "M")
            if op == "I":
                pOps.append(op)
                pOps.append(length)
                continue
            if j + length <= cutPoint:
                pOps.append(op)
                pOps.append(length)
                j += length
                if j == cutPoint:
                    break
            else:
                assert j + length > cutPoint
                pOps.append(op)
                pOps.append(cutPoint - j)
                sOps.append(op)
                sOps.append(length - (cutPoint - j))
                break
        sOps += ops[i+2:]
        return pOps, sOps

    # For each cigar:
    for inputCigar in self.inputCigars:
        name1, start1, end1, strand1 = inputCigar.split()[5:9]
        start1, end1 = int(start1), int(end1)
        assert strand1 == "+"
        name2, start2, end2, strand2 = inputCigar.split()[1:5]
        start2, end2 = int(start2), int(end2)
        score = float(inputCigar.split()[9])
        ops = inputCigar.split()[10:]

        # For each intermediate chop point
        i = start1
        for j in range(start1 + 1, end1 + 1):  # xrange is Python 2 only
            if (name1, j) in ends:
                # Chop up cigar
                coordinates1 = name1, i, j, "+"
                # Get sublist of ops
                pOps, subOps = splitPrefixOps(ops, i - start1)
                subOps, sOps = splitPrefixOps(subOps, j - i)
                x = lambda ops: sum([int(ops[k+1]) for k in range(0, len(ops), 2) if ops[k] != 'D'])
                k = x(pOps)
                l = k + x(subOps)
                # Get second coordinates
                if strand2 == "+":
                    coordinates2 = name2, start2 + k, start2 + l, strand2
                else:
                    assert strand2 == "-"
                    coordinates2 = name2, start2 - k, start2 - l, strand2
                choppedCigar = self.makeCigar(coordinates1, coordinates2, score, subOps)
                # Check each chopped up cigar is in output
                self.assertTrue(choppedCigar in outputCigars)
                # Inc. number of expected cigars
                totalExpectedCigars += 1
                # Check previous coordinate
                i = j

    # Check we have the expected number of cigars
    self.assertEquals(totalExpectedCigars, len(outputCigars))
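# Worked example of splitPrefixOps (the cutPoint counts bases on sequence 1, so
# "I" ops contribute no length toward it): for
# ops = ["M", "3", "I", "5", "M", "1"] and cutPoint = 3,
#
#   splitPrefixOps(ops, 3) -> (["M", 3], ["I", "5", "M", "1"])
#
# the first match block exactly fills the prefix, and the insertion plus the
# remaining 1-base match form the suffix. (Note the prefix lengths come back
# as ints while untouched suffix entries keep their original string form.)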
def testHalGeneratorFunctions(self):
    """Run all the CuTests, fail if any of them fail.
    """
    cactus_call(parameters=["cactus_halGeneratorTests", getLogLevelString()])