def chainSamFileJobFunction(job, config, aln_struct):
    # Cull the files from the job store that we want
    if config["chain"] is None and config["realign"] is None:
        job.fileStore.logToMaster("[chainSamFileJobFunction]Nothing to do.")
        return

    if config["chain"] is not None:
        sam_file  = job.fileStore.readGlobalFile(aln_struct.FileStoreID())
        reference = job.fileStore.readGlobalFile(config["reference_FileStoreID"])
        reads     = job.fileStore.readGlobalFile(config["sample_FileStoreID"])
        workdir   = job.fileStore.getLocalTempDir()
        output_sam = LocalFile(workdir=workdir,
                               filename="{}_chained.bam".format(config["sample_label"]))

        if config["debug"]:
            job.fileStore.logToMaster("[chainSamFileJobFunction] chaining {bwa_out} (locally: {sam})"
                                      "".format(bwa_out=aln_struct.FileStoreID(), sam=sam_file))

        chainSamFile(parent_job=job,
                     samFile=sam_file,
                     outputSamFile=output_sam.fullpathGetter(),
                     readFastqFile=reads,
                     referenceFastaFile=reference)

        chainedSamFileId = job.fileStore.writeGlobalFile(output_sam.fullpathGetter())
        deliverOutput(job, output_sam, config["output_dir"])
        job.addFollowOnJobFn(realignmentRootJobFunction, config, chainedSamFileId)
    else:
        job.fileStore.logToMaster("[chainSamFileJobFunction]Not chaining SAM, passing alignment "
                                  "on to realignment")
        job.addFollowOnJobFn(realignmentRootJobFunction, config, aln_struct.FileStoreID())
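# The functions in this section lean on a `LocalFile` helper defined elsewhere in
# the repo. A minimal sketch consistent with the calls used here (workdir/filename
# kwargs, fullpathGetter(), filenameGetter()); the real implementation may differ:
import os
import uuid

class LocalFileSketch(object):
    """Hypothetical stand-in for the repo's LocalFile helper."""
    def __init__(self, workdir, filename=None):
        self.workdir  = workdir
        self.filename = filename if filename is not None else uuid.uuid4().hex

    def filenameGetter(self):
        return self.filename

    def fullpathGetter(self):
        return os.path.join(self.workdir, self.filename)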
def makeNanoporeRead(f5_path):
    # closure over `job`, `workdir`, and `readstore_dir` from the enclosing
    # makeNanoporeReadsJobFunction; here we load the NanoporeRead and write it to a file
    np_read = NanoporeRead(fast_five_file=f5_path, twoD=False)  # TODO make twoD a config arg
    ok = np_read.Initialize(job)
    if not ok:
        return None
    _l = np_read.read_label
    tF = job.fileStore.getLocalTempFile()
    fH = open(tF, "w")
    ok = np_read.Write(job, fH, initialize=False)
    if not ok:
        fH.close()
        return None
    fH.close()
    # then we gzip it, deliver it to the readstore, and return the ledger line
    fn = LocalFile(workdir=workdir, filename="%s.np.gz" % _l)
    fH = open(tF, "rb")
    gz = gzip.open(fn.fullpathGetter(), "wb")
    shutil.copyfileobj(fH, gz)
    fH.close()
    gz.close()
    try:
        deliverOutput(job, fn, readstore_dir)
    except RuntimeError:
        job.fileStore.logToMaster("[makeNanoporeReadsJobFunction]Read %s failed to upload" % _l)
        return None
    return (_l, "%s%s\n" % (readstore_dir, fn.filenameGetter()))
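# makeNanoporeRead returns (read_label, ledger_line) on success and None on
# failure. A hedged sketch of how a caller could fold those results into the
# label -> readstore-URL ledger dict (hypothetical helper; the real aggregation
# lives in the enclosing job function):
def collectLedgerLines(results):
    # skip failed reads (None) and build the ledger mapping
    return dict(r for r in results if r is not None)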
def deliverLedgerJobFunction(job, config, ledger_fids):
    # read the partial ledgers; pickle files are binary, so open in "rb" mode
    fHs = [open(job.fileStore.readGlobalFile(f), "rb") for f in ledger_fids]
    ls  = [pickle.load(f) for f in fHs]
    for f in fHs:
        f.close()
    # merge the partial ledgers into the first one
    ledger = ls[0]
    for d in ls[1:]:
        ledger.update(d)
    fn = LocalFile(workdir=job.fileStore.getLocalTempDir(),
                   filename="%s_ledger.pkl" % config["ledger_name"])
    _h = open(fn.fullpathGetter(), "wb")
    pickle.dump(ledger, _h)
    _h.close()
    deliverOutput(job, fn, config["readstore_ledger_dir"])
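# The merged ledger is later read back with pickle (cf. signalAlignRootJobFunction
# below, which loads the downloaded ledger into config["ledger"]). A sketch of
# that round trip, assuming the pickle was written in binary mode as above:
def loadLedger(ledger_path):
    import pickle
    with open(ledger_path, "rb") as _h:
        return pickle.load(_h)  # dict: read_label -> readstore URL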
def consolidateMethylationCallsJobFunction(job, config, methylation_prob_fids):
    # concatenate the per-batch methylation probability files into one TSV
    outfile = LocalFile(workdir=job.fileStore.getLocalTempDir(),
                        filename="%s_%s.tsv" % (config["sample_label"], config["degenerate"]))
    _handle = open(outfile.fullpathGetter(), "w")
    files = fileinput.input([job.fileStore.readGlobalFile(fid) for fid in methylation_prob_fids])
    for line in files:
        _handle.write(line)
    files.close()
    _handle.close()
    deliverOutput(job, outfile, config["output_dir"])
    return
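# fileinput.input() concatenates line-by-line, which is fine for modest TSVs. For
# large batches a byte-wise copy avoids per-line overhead; a sketch of the same
# consolidation using shutil (behaviorally equivalent for plain text files):
def concatFiles(paths, out_path):
    import shutil
    with open(out_path, "wb") as out:
        for p in paths:
            with open(p, "rb") as src:
                shutil.copyfileobj(src, out)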
def consolidateVariantCallsJobFunction(job, config, posterior_prob_fids):
    # parse each batch's posterior-probability file and combine into one sorted table
    variants  = getVariantCallFunctions(config["degenerate"])
    parser    = variants.parseVariantCalls
    file_iter = (job.fileStore.readGlobalFile(fid) for fid in posterior_prob_fids)
    table     = pd.concat([parser(f) for f in file_iter]).sort_values(["contig", "ref_pos"])
    outfile   = LocalFile(workdir=job.fileStore.getLocalTempDir(),
                          filename="%s_%s.tsv" % (config["sample_label"], config["degenerate"]))
    _handle = open(outfile.fullpathGetter(), "w")
    variants.writeVariantCalls(table, _handle)
    _handle.close()
    deliverOutput(job, outfile, config["output_dir"])
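# getVariantCallFunctions/parseVariantCalls are defined elsewhere; the pd.concat
# and sort_values(["contig", "ref_pos"]) above only require that the parser
# return a DataFrame containing at least those two columns. A hypothetical
# minimal parser honoring that contract (the real column set is richer):
def parseVariantCallsSketch(path):
    import pandas as pd
    return pd.read_table(path, names=["contig", "ref_pos", "base", "posterior"])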
def processReferenceSequence(ref_seq, workdir, motif_key=None, sub_char="X", parent_job=None):
    # make the forward and backward sequences, substituting the necessary motifs
    if motif_key is not None:
        motif, ok = getMotif(motif_key, ref_seq)
        require(ok, "[processReferenceSequence]Illegal motif_key given %s" % motif_key)
        if parent_job is not None:
            parent_job.fileStore.logToMaster(
                "[processReferenceSequence]Made %s substitutions" % motif.substitutionPositionCount())
        try:
            fw_refseq = motif.forwardSubstitutedSequence(sub_char)
            bw_refseq = motif.complementSubstitutedSequence(sub_char)
        except AssertionError:
            return None, None, False
    else:
        fw_refseq = ref_seq.upper()
        bw_refseq = _reverseComplement(fw_refseq, reverse=False, complement=True)

    fw_refseqfile  = LocalFile(workdir=workdir)
    bw_refseqfile  = LocalFile(workdir=workdir)
    sequences      = [fw_refseq, bw_refseq]
    sequence_files = [fw_refseqfile, bw_refseqfile]
    for f, s in zip(sequence_files, sequences):
        _h = open(f.fullpathGetter(), "w")
        _h.write(s + "\n")
        _h.close()
    for f in sequence_files:
        require(os.path.exists(f.fullpathGetter()),
                "[processReferenceSequence]Missing %s" % f.filenameGetter())
    return fw_refseqfile, bw_refseqfile, True
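# _reverseComplement is imported from elsewhere in the repo; the call above uses
# reverse=False, complement=True to get the complement strand in reference
# orientation. A minimal sketch with the same signature (assumes plain A/C/G/T;
# the real helper may also handle ambiguity codes):
def _reverseComplementSketch(seq, reverse=True, complement=True):
    comp = {"A": "T", "C": "G", "G": "C", "T": "A"}
    if complement:
        seq = "".join(comp.get(b, b) for b in seq.upper())
    if reverse:
        seq = seq[::-1]
    return seq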
def signalAlignRootJobFunction(job, config, sample):
    # download the reference
    config["reference_FileStoreID"] = job.addChildJobFn(urlDownlodJobFunction,
                                                        config["ref"],
                                                        disk=config["ref_size"]).rv()
    # download the BAM, and shard by region
    alignment_fid = job.addChildJobFn(urlDownlodJobFunction, sample.URL, disk=sample.size).rv()
    # download the models
    config["HMM_fid"] = job.addChildJobFn(urlDownlodJobFunction, config["HMM_file"], disk="10M").rv()
    config["HDP_fid"] = job.addChildJobFn(urlDownlodJobFunction, config["HDP_file"], disk="250M").rv()
    # set up labels
    config["sample_label"] = sample.sample_label
    # download and load the ledger
    # TODO use new function here
    ledger = LocalFile(workdir=job.fileStore.getLocalTempDir(),
                       filename="%s.tmp" % uuid.uuid4().hex)
    urlDownload(job, config["ledger_url"], ledger)
    # pickles are binary, so read in "rb" mode
    config["ledger"] = cPickle.load(open(ledger.fullpathGetter(), "rb"))
    job.addFollowOnJobFn(shardAlignmentJobNode, config, alignment_fid)
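# The .rv() calls above store Toil promises in `config`; they resolve to real
# FileStoreIDs only after the child jobs finish, which is why the consuming work
# is chained with addFollowOnJobFn. A toy illustration of the same promise
# pattern (hypothetical job functions, not part of this pipeline):
def _rootSketch(job):
    fid_promise = job.addChildJobFn(_downloadSketch).rv()  # promise, not a value
    job.addFollowOnJobFn(_consumeSketch, fid_promise)      # resolved by the time this runs

def _downloadSketch(job):
    return job.fileStore.writeGlobalFile(job.fileStore.getLocalTempFile())

def _consumeSketch(job, fid):
    job.fileStore.readGlobalFile(fid)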
def prepareFast5Tarfile(job, split_tars_bigger_than_this, batchsize, download_slots, part_size, rs_sample):
    job.fileStore.logToMaster("[prepareFast5Tarfile]Working on sample %s" % rs_sample.sample_label)
    workdir = job.fileStore.getLocalTempDir()
    archive = LocalFile(workdir=workdir, filename="%s.tar" % uuid.uuid4().hex)
    urlDownload(job, rs_sample.URL, archive,
                download_slots=str(download_slots),
                part_size=str(part_size))
    _handle = tarfile.open(archive.fullpathGetter(), "r")
    members = _handle.getmembers()[1:]  # the first member is often just the directory with the fast5s
    paths   = [os.path.join(workdir, m.name) for m in members]
    _handle.extractall(path=workdir)
    if rs_sample.size >= split_tars_bigger_than_this:
        # re-batch the fast5s into smaller tars so downstream jobs stay within disk limits
        _iter    = [paths[i:i + batchsize] for i in range(0, len(paths), batchsize)]
        tar_fids = [archiveBatchAndUploadToFileStore(job, b, workdir) for b in _iter]
        _handle.close()
        job.fileStore.logToMaster("[prepareFast5Tarfile]Split %s into %s smaller tars"
                                  % (rs_sample.sample_label, len(tar_fids)))
        return tar_fids
    else:
        tar_fid = archiveBatchAndUploadToFileStore(job, paths, workdir)
        _handle.close()
        return [tar_fid]
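# archiveBatchAndUploadToFileStore is defined elsewhere; from its use above it
# tars a batch of extracted fast5 paths and returns a FileStoreID. A hedged
# sketch of what such a helper could look like (name and tar layout are
# assumptions, not the repo's actual implementation):
def archiveBatchAndUploadToFileStoreSketch(job, batch_paths, workdir):
    import os
    import tarfile
    import uuid
    tar_path = os.path.join(workdir, "%s.tar" % uuid.uuid4().hex)
    with tarfile.open(tar_path, "w") as tar:
        for p in batch_paths:
            tar.add(p, arcname=os.path.basename(p))
    return job.fileStore.writeGlobalFile(tar_path)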
def getFastqFromBam(job, bam_sample, samtools_image="quay.io/ucsc_cgl/samtools"):
    # n.b. this is NOT a jobFunctionWrappingJob, it just takes the parent job as
    # an argument to have access to the job store
    # download the BAM to the local directory, use a uid to avoid conflicts
    uid         = uuid.uuid4().hex
    work_dir    = job.fileStore.getLocalTempDir()
    local_bam   = LocalFile(workdir=work_dir, filename="bam_{}.bam".format(uid))
    fastq_reads = LocalFile(workdir=work_dir, filename="fastq_reads{}.fq".format(uid))

    urlDownload(parent_job=job, source_url=bam_sample.URL, destination_file=local_bam)
    require(not os.path.exists(fastq_reads.fullpathGetter()),
            "[getFastqFromBam]fastq file already exists")

    # run samtools to get the reads from the BAM
    # TODO use DOCKER_DIR and clean this up. idea: make globals.py or something
    samtools_parameters = ["fastq", "/data/{}".format(local_bam.filenameGetter())]
    with open(fastq_reads.fullpathGetter(), 'w') as fH:
        docker_call(job=job, tool=samtools_image, parameters=samtools_parameters,
                    work_dir=work_dir, outfile=fH)
    require(os.path.exists(fastq_reads.fullpathGetter()),
            "[getFastqFromBam]didn't generate reads")

    # upload the fastq to the fileStore
    return job.fileStore.writeGlobalFile(fastq_reads.fullpathGetter())
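# For reference, the containerized call above is equivalent to running
# `samtools fastq <bam> > <fastq>` directly. A sketch of a non-docker fallback,
# assuming a samtools binary on PATH (illustration only, not part of the pipeline):
def bamToFastqLocal(bam_path, fastq_path):
    import subprocess
    with open(fastq_path, "w") as fH:
        subprocess.check_call(["samtools", "fastq", bam_path], stdout=fH)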
def calculateMethylationProbabilityJobFunction(job, config, cPecan_config, ignore_hmm, batch_number,
                                               signalMachine_image="quay.io/artrand/signalmachine"):
    def _get_url(read_label):
        try:
            return ledger[read_label]
        except KeyError:
            return None

    def _SignalMachine(read_label, cigar, nanopore_read):
        # write the guide alignment (exonerate cigar) for this read
        guide_aln = LocalFile(workdir=workdir)
        _handle = open(guide_aln.fullpathGetter(), "w")
        _handle.write(cigar)
        _handle.close()
        require(os.path.exists(guide_aln.fullpathGetter()), "NO guide aln file")
        signalMachine_args = [
            "--sm3Hdp",
            "-s", "1",
            "-o", "%s" % degenerate_enum,
            "-L", "%s" % read_label,
            "-T", "%s%s" % (DOCKER_DIR, models.localFileName(hmmfid)),
            "-q", "%s%s" % (DOCKER_DIR, nanopore_read.filenameGetter()),
            "-f", "%s%s" % (DOCKER_DIR, fw_seqfile.filenameGetter()),
            "-b", "%s%s" % (DOCKER_DIR, bw_seqfile.filenameGetter()),
            "-p", "%s%s" % (DOCKER_DIR, guide_aln.filenameGetter()),
            "-u", "%s%s" % (DOCKER_DIR, posteriors.filenameGetter()),
            "-v", "%s%s" % (DOCKER_DIR, models.localFileName(hdpfid)),
        ]
        try:
            docker_call(job=job,
                        tool=signalMachine_image,
                        parameters=signalMachine_args,
                        work_dir=(workdir + "/"))
        except subprocess.CalledProcessError:
            # a failed read shouldn't kill the batch; it just contributes no posteriors
            pass

    def _parse_probabilities():
        return pd.read_table(posteriors.fullpathGetter(),
                             usecols=(1, 2, 3, 6),
                             names=["ref_pos", "base", "posterior", "read_label"],
                             dtype={"ref_pos"   : np.int,
                                    "base"      : np.str,
                                    "posterior" : np.float64,
                                    "read_label": np.str})

    def _sumExpectationsOverColumns():
        f = LocalFile(workdir=workdir)
        _h = open(f.fullpathGetter(), "w")
        for pos, pos_df in aligned_pairs.groupby(["ref_pos"]):
            for base, base_df in pos_df.groupby("base"):
                marginal_prob = base_df["posterior"].sum()
                coverage      = len(base_df["read_label"].unique())
                l = "%s\t%s\t%s\t%s\t%s\n" % (cPecan_config["contig_name"],
                                              pos, base, marginal_prob, coverage)
                _h.write(l)
        _h.close()
        return f

    job.fileStore.logToMaster("[calculateMethylationProbabilityJobFunction]Running on batch %s"
                              % batch_number)
    workdir = job.fileStore.getLocalTempDir()
    fw_seqfile, bw_seqfile, ok = processReferenceSequence(cPecan_config["contig_seq"],
                                                          workdir,
                                                          config["motif_key"],
                                                          config["substitute_char"])
    if not ok:
        raise RuntimeError("[calculateMethylationProbabilityJobFunction]ERROR processing "
                           "reference sequences")

    # get the models
    hmmfid = config["HMM_fid"]
    hdpfid = config["HDP_fid"]
    try:
        models = LocalFileManager(job=job, fileIds_to_get=[hmmfid, hdpfid], workdir=workdir)
    except AssertionError:
        raise RuntimeError("[calculateMethylationProbabilityJobFunction]ERROR getting models locally")

    # download the npRead files
    ledger    = config["ledger"]
    url_iter  = (_get_url(l.strip()) for l in cPecan_config["query_labels"])
    read_urls = [u for u in url_iter if u is not None]
    if config["debug"]:
        job.fileStore.logToMaster("[calculateMethylationProbabilityJobFunction]Got %s URLs"
                                  % len(read_urls))
    npReads = [unzipLocalFile(f)
               for f in [urlDownloadToLocalFile(job, workdir, url) for url in read_urls]
               if f is not None]
    failed = len(read_urls) - len(npReads)
    if failed > 0 and config["stop_at_failed_reads"]:
        raise RuntimeError("[calculateMethylationProbabilityJobFunction]Got %s failed npRead "
                           "downloads and stop_at_failed_reads is True" % failed)
    else:
        if config["debug"]:
            job.fileStore.logToMaster("[calculateMethylationProbabilityJobFunction]"
                                      "Failed to download and unzip %s NanoporeReads" % failed)

    # file to collect the posterior probs
    posteriors = LocalFile(workdir=workdir,
                           filename="%s_%s.dat" % (config["sample_label"], uuid.uuid4()))
    degenerate_enum = getVariantCallFunctions(config["degenerate"]).enum()

    # do the signal alignment, and get the posterior probabilities
    for label, cigar, npRead in zip(cPecan_config["query_labels"],
                                    cPecan_config["exonerate_cigars"],
                                    npReads):
        _SignalMachine(label.strip(), cigar, npRead)

    # the reads may not produce any posteriors if, for example, they don't align to a
    # region with any ambiguity characters; in that case the posteriors file will be
    # empty and we just return None, which is the convention
    if (not os.path.exists(posteriors.fullpathGetter())
            or os.stat(posteriors.fullpathGetter()).st_size == 0):
        return None

    # reminder: the convention is that 'expectations' are un-normalized posterior
    # probabilities, so this file is a table of expectations. I also use the convention
    # that a trailing underscore means `file` or `file-path`
    aligned_pairs = _parse_probabilities()
    expectations_ = _sumExpectationsOverColumns()
    if config["probs_output_dir"] is not None:
        deliverOutput(job, posteriors, config["probs_output_dir"])
    return job.fileStore.writeGlobalFile(expectations_.fullpathGetter())
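# The expectations table written above has columns (contig, ref_pos, base,
# marginal_prob, coverage), where marginal_prob is un-normalized. A downstream
# consumer could turn those expectations into per-site probabilities like this
# (hypothetical sketch; the real normalization lives in the variant-call code):
def normalizeExpectations(expectations_path):
    import pandas as pd
    df = pd.read_table(expectations_path,
                       names=["contig", "ref_pos", "base", "marginal_prob", "coverage"])
    df["posterior_prob"] = df["marginal_prob"] / df["coverage"]
    return df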