def create_index_file(bam_filename, bam_dxlink):
    """Create Index file. Sorts BAM if needed."""
    print "Creating Index file."
    index_filename = "{bam}.bai".format(bam=bam_filename)
    cmd_index = ['samtools', 'index', bam_filename]
    sorted_filename = bam_filename
    try:
        run_cmd(cmd_index)
    except NotIndexedException:
        print "Sorting BAM"
        sorted_filename = bam_filename[:-4] + '.sorted.bam'
        cmd_sort = [
            'samtools', 'sort', bam_filename, bam_filename[:-4] + '.sorted']
        run_cmd(cmd_sort)
        print "Indexing BAM"
        index_cmd = ['samtools', 'index', sorted_filename]
        index_filename = "{sorted_bam_name}.bai".format(
            sorted_bam_name=sorted_filename)
        run_cmd(index_cmd)
    finally:
        index_file_link = dxpy.dxlink(dxpy.upload_local_file(index_filename))
        aligned_sorted_bam = dxpy.dxlink(
            dxpy.upload_local_file(sorted_filename))
        return aligned_sorted_bam, index_file_link
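# Illustrative usage sketch, not part of the original applet code: assumes a
# BAM on local disk plus the run_cmd/NotIndexedException helpers defined
# elsewhere in this module; 'file-xxxx' is a placeholder ID. The two returned
# values are $dnanexus_link hashes suitable for a job's output dict:
#
#     bam_link, bai_link = create_index_file('sample.bam',
#                                            dxpy.dxlink('file-xxxx'))
#     output = {'bam': bam_link, 'bai': bai_link}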
def create_final_set_of_peak_calls(job_inputs):
    replicate_idr_prefixes = [r.replace('.tar.gz', '')
                              for r in job_inputs['replicate_idr_files']]
    pseudo_replicate_idr_prefixes = [
        r.replace('.tar.gz', '')
        for r in job_inputs['pseudo_replicate_idr_files']]
    pooled_pseudo_replicate_idr_prefix = job_inputs[
        'pooled_pseudo_replicate_idr_files'].replace('.tar.gz', '')

    (num_peaks_each_rep, num_peaks_each_pseudo_rep,
     numPeaks_Rep0) = get_thresholds(
        replicate_idr_prefixes,
        pseudo_replicate_idr_prefixes,
        pooled_pseudo_replicate_idr_prefix,
        job_inputs['replicate_peaks_threshold'],
        job_inputs['pseudo_replicate_peaks_threshold'],
        job_inputs['pooled_pseudo_replicate_peaks_threshold'])
    max_numPeaks_Rep = max(num_peaks_each_rep)

    pooled_replicates_peaks_fn = download_and_gunzip_file(
        job_inputs['pooled_replicate_peaks_file'])

    # Column of the narrowPeak-format file holding the chosen ranking measure.
    coi = {'signal.value': 7,
           'p.value': 8,
           'q.value': 9}[job_inputs['ranking_measure']]

    cmd = 'sort -k{0}nr,{0}nr "{1}" | head -n {2} | gzip -c > "{3}_conservative.regionPeak.gz"'.format(
        coi, pooled_replicates_peaks_fn, max_numPeaks_Rep,
        job_inputs['output_prefix'])
    print cmd
    subprocess.check_output(cmd, shell=True)

    opt_thresh = max(max_numPeaks_Rep, numPeaks_Rep0)
    cmd = 'sort -k{0}nr,{0}nr "{1}" | head -n {2} | gzip -c > "{3}_optimal.regionPeak.gz"'.format(
        coi, pooled_replicates_peaks_fn, opt_thresh,
        job_inputs['output_prefix'])
    print cmd
    subprocess.check_output(cmd, shell=True)

    conservative_result = dxpy.upload_local_file(
        '{0}_conservative.regionPeak.gz'.format(job_inputs['output_prefix']))
    optimal_result = dxpy.upload_local_file(
        '{0}_optimal.regionPeak.gz'.format(job_inputs['output_prefix']))

    return {'conservative_peak_calls': dxpy.dxlink(conservative_result),
            'optimal_peak_calls': dxpy.dxlink(optimal_result),
            'num_peaks_each_rep': num_peaks_each_rep,
            'num_peaks_each_pseudo_rep': num_peaks_each_pseudo_rep,
            'num_peaks_pooled_pseudo_rep': numPeaks_Rep0}
def test_alignment_count(applet_id, project_id, folder, tmpdir):
    """Run BWA on a FASTQ file and verify that the number of alignments
    produced is correct.
    """
    # Recall that applet_id is set in the associated conftest.py, which either
    # gets it from the command line or builds the applet and retrieves its id.
    # And tmpdir is some pytest magic. Its type is py.path.local.LocalPath.
    # Its strpath property just returns a string.
    applet = dxpy.DXApplet(applet_id)
    input_dict = {"fastq": dxpy.dxlink(SAMPLE_FASTQ),
                  "genomeindex_targz": dxpy.dxlink(HS37D5_BWA_INDEX)}
    job = applet.run(input_dict, instance_type="mem1_ssd1_x16",
                     folder=folder, project=project_id)
    job.wait_on_done()

    output_bam_dxfile = dxpy.DXFile(job.describe()["output"]["bam"])
    local_filename = os.path.join(tmpdir.strpath, "test.bam")
    dxpy.download_dxfile(output_bam_dxfile.get_id(), local_filename)
    count_alignments_cmd = "samtools view {bam} | wc -l".format(
        bam=local_filename)
    num_alignments = int(subprocess.check_output(count_alignments_cmd,
                                                 shell=True))
    assert num_alignments == 1951476
def run_test_analyses(project, folder, workflow, find_test_data):
    # test cases: one or more named input hashes to run the workflow with
    test_inputs = {
        "21+Y": {
            "construct.reference_genome": dxpy.dxlink(
                find_test_data("hs37d5.fa.gz").get_id()),
            "construct.reference_variants": dxpy.dxlink(
                find_test_data("ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.vcf.gz").get_id()),
            "construct.reference_contigs": ["21", "Y"],
            "map.reads": dxpy.dxlink(
                find_test_data("HS1011_unitigs_Y.fastq.gz").get_id())
        }
    }
    # The tests might only need smaller instance types than the applet
    # defaults (reduces cost of running tests).
    stage_instance_types = {
        "construct": "mem3_ssd1_x8",
        "index": "mem3_ssd1_x8",
        "map": "mem3_ssd1_x8"
    }
    git_revision = workflow.describe(
        incl_properties=True)["properties"]["git_revision"]
    analyses = []
    for test_name, test_input in test_inputs.iteritems():
        test_folder = os.path.join(folder, test_name)
        project.new_folder(test_folder, parents=True)
        analyses.append(workflow.run(
            test_input,
            project=project.get_id(),
            folder=test_folder,
            stage_instance_types=stage_instance_types,
            delay_workspace_destruction=True,
            name="dxvg {} {}".format(test_name, git_revision)))
    return analyses
def coverage(CpG_context_dxlink, CHG_context_dxlink, CHH_context_dxlink,
             dme_ix_dxlink, target_root):
    '''subjob runs bismark2bedGraph and coverage2cytosine on mem3_hdd2_x8'''

    print "* coverage(): Retrieve context files and index..."
    CpG_context = 'output/CpG_context_%s.txt' % target_root
    CHG_context = 'output/CHG_context_%s.txt' % target_root
    CHH_context = 'output/CHH_context_%s.txt' % target_root
    run_cmd('mkdir -p output/')
    dxpy.download_dxfile(CpG_context_dxlink, CpG_context)
    dxpy.download_dxfile(CHG_context_dxlink, CHG_context)
    dxpy.download_dxfile(CHH_context_dxlink, CHH_context)
    dme_ix = "dme_index.tar.gz"
    dxpy.download_dxfile(dme_ix_dxlink, dme_ix)

    print "* coverage(): Uncompress index..."
    run_cmd('tar -zxf ' + dme_ix)

    (bedGraph_gz, cx_report) = bismark_coverage(target_root, CpG_context,
                                                CHG_context, CHH_context)

    print "* coverage(): Storing coverage results..."
    cx_report_dxfile = dxpy.upload_local_file(cx_report)
    bedgraph_gz_dxfile = dxpy.upload_local_file(bedGraph_gz)

    print "* coverage(): Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    return {
        "cx_report_dxlink": dxpy.dxlink(cx_report_dxfile),
        "bedgraph_gz_dxlink": dxpy.dxlink(bedgraph_gz_dxfile)
    }
def test_paired_with_contam(self):
    bed_file = dxpy.find_one_data_object(
        name="hg19_GRCh37_Feb2009_RefSeq.bed")['id']
    mappings = dxpy.find_one_data_object(
        name="SRR018256_paired_RNA_Mappings",
        typename="LetterMappings")['id']
    contam_contig = dxpy.find_one_data_object(name="human rRNA",
                                              typename="ContigSet")['id']
    reads = dxpy.find_one_data_object(name="SRR018256_reads",
                                      typename="LetterReads")['id']
    if bed_file is None:
        print "Cannot find hg19_GRCh37_Feb2009_RefSeq.bed. Please upload it"
        return False
    if mappings is None:
        print "Cannot find Mappings. Please upload them"
        return False
    if contam_contig is None:
        print "Cannot find human rRNA. Please upload it"
        return False
    if reads is None:
        print "Cannot find SRR018256_reads. Please upload it"
        return False

    input = {
        'rna_seq_mappings': dxpy.dxlink(mappings),
        'bed_file': dxpy.dxlink(bed_file),
        'contaminants': [dxpy.dxlink(contam_contig)],
        'original_reads': [dxpy.dxlink(reads)]
    }

    print "Running program with", input
    job = self.program.run(input)
    print "launched test_paired_with_contam ", job.get_id()
def _format_data_file(self, df: DataFile) -> dict:
    if isinstance(df.localizer, UrlLocalizer):
        ul = cast(UrlLocalizer, df.localizer)
        if ul.url.startswith("dx://"):
            return dxpy.dxlink(*ul.url[5:].split(":"))

    file_name = df.local_path.name

    existing_files = list(dxpy.find_data_objects(
        classname="file",
        state="closed",
        name=file_name,
        project=self._project_id,
        folder=self._folder,
        recurse=False
    ))

    if not existing_files:
        # TODO: batch uploads and use dxpy.sugar.transfers.Uploader for
        #  parallelization
        return dxpy.dxlink(dxpy.upload_local_file(
            str(df.path),
            name=file_name,
            project=self._project_id,
            folder=self._folder,
            parents=True,
            wait_on_close=True
        ))
    elif len(existing_files) == 1:
        return dxpy.dxlink(existing_files[0]["id"], self._project_id)
    else:
        raise RuntimeError(
            f"Multiple files with name {file_name} found in "
            f"{self._project_id}:{self._folder}"
        )
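# Note on the three branches above (a reading of the code, not original
# documentation): a dx:// URL is passed through as a link without re-upload,
# a single already-uploaded copy in the target folder is reused, and any
# other local file is uploaded fresh; more than one name match is treated as
# ambiguous and raises.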
def main(input_bam, paired=True, params=''):
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    input_bam = dxpy.DXFile(input_bam)
    base_name = remove_extensions(input_bam.describe()['name'],
                                  [".bam", ".BAM", ".sam", ".SAM"])

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    dxpy.download_dxfile(input_bam.get_id(), "input.bam")

    # Fill in your application code here.
    command = "java -Xmx6g -jar /opt/jar/SamToFastq.jar INPUT=input.bam F=%s_1.fastq" % base_name
    if paired:
        command += " F2=%s_2.fastq" % base_name
    subprocess.check_call(command, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system. It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.
    output = {}
    fastq_file = dxpy.upload_local_file("%s_1.fastq" % base_name)
    output["fastq_file"] = dxpy.dxlink(fastq_file)
    if paired:
        paired_fastq_file = dxpy.upload_local_file("%s_2.fastq" % base_name)
        output["paired_fastq_file"] = dxpy.dxlink(paired_fastq_file)

    return output
def _type_convert_primitive(val, klass):
    retval = None
    ref_files = []
    if klass == 'string':
        retval = val
    elif klass == 'int':
        retval = int(val)
    elif klass == "boolean":
        retval = bool(val)
    elif klass == 'float':
        retval = float(val)
    elif klass == 'hash':
        retval = json.loads(val)
    elif klass == 'file':
        if val.startswith("project-"):
            val = val.split(":")
            retval = dxpy.dxlink(object_id=val[1], project_id=val[0])
            ref_files.append(retval)
        elif val.startswith("file-"):
            retval = dxpy.dxlink(val)
            ref_files.append(retval)
        else:
            raise Exception(
                "Malformed file {}, must start with 'file-' or 'project-'".
                format(val))
    else:
        raise Exception("class {} not currently supported".format(klass))
    return retval, ref_files
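# A minimal sketch of the two file-reference forms the 'file' branch above
# produces. The IDs are placeholders: dxpy.dxlink only constructs the
# $dnanexus_link hash and does not verify that the object exists.
def _demo_file_link_forms():
    bare = dxpy.dxlink('file-xxxx')
    # -> {'$dnanexus_link': 'file-xxxx'}
    qualified = dxpy.dxlink(object_id='file-xxxx', project_id='project-yyyy')
    # -> {'$dnanexus_link': {'project': 'project-yyyy', 'id': 'file-xxxx'}}
    return bare, qualified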
def setUpClass(cls):
    if RUN_JOB_ON_DX:
        if not project_name:
            print "'PROJ_NAME' environment variable must be defined!"
            sys.exit(1)
        working_project_id = dxpy.find_one_project(more_ok=False,
                                                   name=project_name)["id"]
        run_args = {}
        run_args["project"] = working_project_id
        run_args["name"] = "vcfscope-measure on chr21"
        run_args["folder"] = "/purge/" + app_name

        input_hash = {}
        input_hash["vcfgz"] = dxpy.dxlink("file-BkkjFkj098Gb2jZ1Yx533JFv",
                                          project_id)
        input_hash["bam"] = dxpy.dxlink("file-Bkkjj5Q098Gkvkb3Xx5Pxj1J",
                                        project_id)
        input_hash["bai"] = dxpy.dxlink("file-Bkkjj5Q098GzYx2bG5YJ3z34",
                                        project_id)
        input_hash["region"] = dxpy.dxlink("file-Bkkj22Q098Gz5yK1Q955G5gX",
                                           project_id)

        app = dxpy.DXApp(name=app_name, alias="9.9.7")
        cls.job = app.run(input_hash, **run_args)
    else:
        job_id = "job-F1JpY9Q0pVj0BgpYBp14f31Q"
        cls.job = dxpy.DXJob(job_id)
    cls.job.wait_on_done()
def setUp(self):
    setUpTempProjects(self)

    self.dxapplet = dxpy.DXApplet()
    self.dxapplet.new(name="identity-record",
                      dxapi="1.04",
                      inputSpec=[{"name": "record", "class": "record"}],
                      outputSpec=[{"name": "record", "class": "record"}],
                      runSpec={"code": '''
@dxpy.entry_point('main')
def main(record):
    return {'record': record}''',
                               "interpreter": "python2.7"})
    dxrecord = dxpy.new_dxrecord(
        name='workflowname',
        details={"stages": [{"job": None,
                             "inputs": {},
                             "app": dxpy.dxlink(self.dxapplet),
                             "id": "stage0-id"},
                            {"job": None,
                             "inputs": {"record": {"connectedTo":
                                                   {"output": "record",
                                                    "stage": "stage0-id"}}},
                             "app": dxpy.dxlink(self.dxapplet),
                             "id": "stage1-id"}],
                 "version": 5},
        types=['pipeline'])
    self.workflow = dxpy.DXWorkflow(dxrecord.get_id())
    self.closedrecord = dxpy.new_dxrecord(name='a record')
    self.closedrecord.close()
def makeInputsBwa():
    try:
        contigset_importer = dxpy.DXApplet(dxpy.find_data_objects(
            classname="applet",
            properties={"name": "fasta_contigset_importer"}).next()['id'])
        reads_importer = dxpy.DXApplet(dxpy.find_data_objects(
            classname="applet",
            properties={"name": "Letter Space FASTQ importer"}).next()['id'])
    except StopIteration:
        raise Exception("fasta_contigset_importer or Letter Space FASTQ importer not found, please upload them")

    genome_archive = dxpy.upload_local_file(
        os.path.join(test_resources_dir, "hg19_chr22.fa.xz"),
        wait_on_close=True)
    contigset_importer_input = {"name": "hg19_chr22",
                                "sequence_file": dxpy.dxlink(genome_archive)}
    print "Running fasta_contigset_importer with", contigset_importer_input
    job = contigset_importer.run(contigset_importer_input)
    job.wait_on_done()
    contig_set = job.describe()["output"]["contig_set"]

    left_reads = dxpy.upload_local_file(
        os.path.join(test_resources_dir, "small_left.fq"), wait_on_close=True)
    right_reads = dxpy.upload_local_file(
        os.path.join(test_resources_dir, "small_right.fq"), wait_on_close=True)
    #left_reads = dxpy.upload_local_file(os.path.join(test_resources_dir, "SRR188205_1_1M.fastq.xz"), wait_on_close=True)
    #right_reads = dxpy.upload_local_file(os.path.join(test_resources_dir, "SRR188205_2_1M.fastq.xz"), wait_on_close=True)
    reads_importer_input = {"left_file": dxpy.dxlink(left_reads),
                            "right_file": dxpy.dxlink(right_reads)}
    print "Running LetterSpaceFileObjectToReadsTable with", reads_importer_input
    job = reads_importer.run(reads_importer_input)
    job.wait_on_done()
    reads = job.describe()["output"]["reads"]

    return {"reads": [reads] * 3, "reference": contig_set}
def make_indexed_reference(ref_ID):
    run_shell("dx-contigset-to-fasta %s reference.fasta" % ref_ID)
    ref_details = dxpy.DXRecord(ref_ID).get_details()
    ref_name = dxpy.DXRecord(ref_ID).describe()['name']

    # call bowtie2-build
    run_shell("bowtie2-build reference.fasta indexed_ref")
    # package it into an archive for uploading
    run_shell("XZ_OPT=-0 tar -cJf reference.tar.xz indexed_ref*")

    indexed_ref_dxfile = dxpy.upload_local_file("reference.tar.xz",
                                                hidden=True,
                                                wait_on_close=True)
    indexed_ref_record = dxpy.new_dxrecord(
        name=ref_name + " (indexed for Bowtie2)",
        types=["BowtieLetterContigSetV2"],
        details={'index_archive': dxpy.dxlink(indexed_ref_dxfile.get_id()),
                 'original_contigset': dxpy.dxlink(ref_ID)})
    indexed_ref_record.close()

    '''
    # TODO: dxpy project workspace convenience functions
    if "projectWorkspace" in job:
        indexed_ref_record.clone(job["projectWorkspace"])
    '''

    return indexed_ref_record.get_id()
def build(incl_map):
    nm = "vg_construct_index_map" if incl_map else "vg_construct_index"
    wf = dxpy.new_dxworkflow(title=nm,
                             name=nm,
                             description=nm,
                             project=project.get_id(),
                             folder=folder,
                             properties={"git_revision": git_revision})

    construct_applet = find_applet("vg_construct")
    construct_input = {}
    construct_stage_id = wf.add_stage(construct_applet,
                                      stage_input=construct_input,
                                      name="construct")

    index_input = {
        "vg_tar": dxpy.dxlink({"stage": construct_stage_id,
                               "outputField": "vg_tar"})
    }
    index_stage_id = wf.add_stage(find_applet("vg_index"),
                                  stage_input=index_input, name="index")

    if incl_map:
        map_input = {
            "vg_indexed_tar": dxpy.dxlink({"stage": index_stage_id,
                                           "outputField": "vg_indexed_tar"})
        }
        map_stage_id = wf.add_stage(find_applet("vg_map"),
                                    stage_input=map_input, name="map")

    return wf
def postprocess(bam_files, report_files, bam_root, nthreads=8,
                use_cat=False, use_sort=False):
    # This is the "gather" phase which aggregates and performs any
    # additional computation after the "map" (and therefore after all
    # the "process") jobs are done.
    if DEBUG:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    logger.debug("** In Postprocess - refactored dme-merge-bams - *")

    versions = "Unknown"
    if os.path.isfile(VERSION_SCRIPT):
        try:
            versions = subprocess.check_output(
                shlex.split(
                    'tool_versions.py --dxjson dnanexus-executable.json'))
        except:
            pass

    merged_bam = merge_bams(bam_files, bam_root, use_cat, use_sort, nthreads)
    (merged_report, report_file_names) = merge_reports(bam_root,
                                                       report_files, bam_root)
    (merged_qc, nreads, metadata) = merge_qc(bam_root, report_file_names)

    props = {
        'SW': versions,
        'reads': nreads,
    }
    output = {
        "bam_techrep": dxpy.dxlink(
            dxpy.upload_local_file(merged_bam, details=metadata,
                                   properties=props)),
        "bam_techrep_qc": dxpy.dxlink(
            dxpy.upload_local_file(merged_qc, details=metadata,
                                   properties={'SW': versions})),
        "map_techrep": dxpy.dxlink(
            dxpy.upload_local_file(merged_report, details=metadata,
                                   properties={'SW': versions})),
        "reads": nreads,
        "metadata": json.dumps(metadata)
    }
    return output
def main(input_bam, paired_end):
    input_bam_file = dxpy.DXFile(input_bam)
    input_bam_filename = input_bam_file.name
    # Strip the .bam extension (note: str.rstrip strips a character set, not
    # a suffix, so a slice is used instead).
    input_bam_basename = input_bam_filename[:-4] \
        if input_bam_filename.endswith('.bam') else input_bam_filename
    dxpy.download_dxfile(input_bam_file.get_id(), input_bam_filename)

    intermediate_TA_filename = input_bam_basename + ".tagAlign"
    if paired_end:
        end_infix = 'PE2SE'
    else:
        end_infix = 'SE'
    final_TA_filename = input_bam_basename + '.' + end_infix + '.tagAlign.gz'

    subprocess.check_output('ls -l', shell=True)

    # ===================
    # Create tagAlign file
    # ===================
    out, err = common.run_pipe([
        "bamToBed -i %s" % (input_bam_filename),
        r"""awk 'BEGIN{OFS="\t"}{$4="N";$5="1000";print $0}'""",
        "tee %s" % (intermediate_TA_filename),
        "gzip -cn"],
        outfile=final_TA_filename)

    subprocess.check_output('ls -l', shell=True)

    # ================
    # Create BEDPE file
    # ================
    if paired_end:
        final_nmsrt_bam_prefix = input_bam_basename + ".nmsrt"
        final_nmsrt_bam_filename = final_nmsrt_bam_prefix + ".bam"
        command = \
            "samtools sort -@ %d -n %s %s" \
            % (cpu_count(), input_bam_filename, final_nmsrt_bam_prefix)
        logger.info(command)
        subprocess.check_call(shlex.split(command))

        final_BEDPE_filename = input_bam_basename + ".bedpe.gz"
        out, err = common.run_pipe([
            "bamToBed -bedpe -mate1 -i %s" % (final_nmsrt_bam_filename),
            "gzip -cn"],
            outfile=final_BEDPE_filename)

    subprocess.check_output('ls -l', shell=True)

    tagAlign_file = dxpy.upload_local_file(final_TA_filename)
    if paired_end:
        BEDPE_file = dxpy.upload_local_file(final_BEDPE_filename)

    output = {}
    output["tagAlign_file"] = dxpy.dxlink(tagAlign_file)
    if paired_end:
        output["BEDPE_file"] = dxpy.dxlink(BEDPE_file)

    return output
def main(input_SAM, deviations=None, histogram_width=None, min_percent=None,
         metric_acc_level=None, ref=None, is_sorted=None, stop_after=None):

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    dxpy.download_dxfile(input_SAM, "input")
    if ref is not None:
        dxpy.download_dxfile(ref, "ref.fa")

    command = "java -Xmx2g -jar /CollectInsertSizeMetrics.jar"
    command += " INPUT=input"
    command += " OUTPUT=insert_distribution.txt"
    command += " HISTOGRAM_FILE=histogram.pdf"
    if deviations is not None:
        command += " DEVIATIONS=" + str(deviations)
    if histogram_width is not None:
        command += " HISTOGRAM_WIDTH=" + str(histogram_width)
    if min_percent is not None:
        command += " MINIMUM_PCT=" + str(min_percent)
    if metric_acc_level is not None:
        for level in metric_acc_level:
            command += " METRIC_ACCUMULATION_LEVEL=" + str(level)
    if ref is not None:
        command += " REFERENCE_SEQUENCE=ref.fa"
    if is_sorted is not None:
        if is_sorted:
            command += " ASSUME_SORTED=true"
        else:
            command += " ASSUME_SORTED=false"
    if stop_after is not None:
        command += " STOP_AFTER=" + str(stop_after)

    print "Executing:"
    print command

    # CALL the command here:
    subprocess.check_call(command, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system. It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.
    histogram = dxpy.upload_local_file("histogram.pdf")
    histogram.rename(
        dxpy.DXFile(input_SAM).describe()['name'] + "_histogram.pdf")
    output_dist = dxpy.upload_local_file("insert_distribution.txt")
    output_dist.rename(
        dxpy.DXFile(input_SAM).describe()['name'] + "_insert_dist.txt")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {}
    output["histogram"] = dxpy.dxlink(histogram)
    output["output"] = dxpy.dxlink(output_dist)

    return output
def main(**kwargs):
    dxpy.download_folder(DCC_CREDENTIALS_PROJECT, '.',
                         folder=DCC_CREDENTIALS_FOLDER)
    if 'key' in kwargs:
        key = '-'.join([dxpy.api.system_whoami()['id'], kwargs.pop('key')])
    else:
        key = dxpy.api.system_whoami()['id']
    key_tuple = common.processkey(key, KEYFILE)
    if not key_tuple:
        logger.error("Key %s is not found in the keyfile %s" % (key, KEYFILE))
        raise PortalCredentialsError("Supply a valid keypair ID")
    authid, authpw, server = key_tuple
    if 'url' in kwargs:
        server = kwargs.pop('url')
    keypair = (authid, authpw)

    tokens = ['python3 checkfiles.py']
    for k, v in kwargs.iteritems():
        if isinstance(v, bool):
            if v:
                tokens.append("--" + k.replace('_', '-'))
            continue
        if isinstance(v, str) or isinstance(v, unicode) or isinstance(v, int):
            tokens.append(' '.join(["--" + k.replace('_', '-'), str(v)]))

    if 'dx_file' in kwargs:
        dxfile = dxpy.DXFile(kwargs.get('dx_file'))
        local_file = dxpy.download_dxfile(dxfile, dxfile.name)
        tokens.append("--local-file %s" % (dxfile.name))

    # this is just to get a command string to print that has no secrets
    tokens_safe = deepcopy(tokens)
    tokens_safe.append("--username %s --password %s" % ("." * len(authid),
                                                        "." * len(authpw)))
    tokens_safe.append(server)
    logger.info(' '.join(tokens_safe))

    tokens.append("--username %s --password %s" % (authid, authpw))
    # this needs to be the last token
    tokens.append(server)

    checkfiles_command = ' '.join(tokens)
    subprocess.check_call(shlex.split(checkfiles_command))

    output = {}
    outfilename = kwargs.get('out')
    errfilename = kwargs.get('err')
    if outfilename:
        out = dxpy.upload_local_file(outfilename)
        output.update({'out': dxpy.dxlink(out)})
    if errfilename:
        err = dxpy.upload_local_file(errfilename)
        output.update({'err': dxpy.dxlink(err)})

    return output
def get_dxfile(filePath, project=None):
    '''Returns dxfile object.'''
    dxfile = None
    #if filePath.find("$dnanexus_link") != -1:
    #    filePath = filePath.split(' ')[1]
    #    filePath = filePath.replace("'","").replace('"','').replace("}","").replace("{","")
    try:
        dxlink = json.loads(filePath.strip("'"))
    except:
        dxlink = None

    if project is not None:
        try:
            if dxlink is not None:
                dxfile = dxpy.get_handler(dxlink, project=project)
            else:
                dxfile = dxpy.get_handler(filePath, project=project)
        except:
            try:
                dxlink = dxpy.dxlink(filePath, project=project)
                dxfile = dxpy.get_handler(dxlink)
            except:
                try:
                    proj_id = env_get_current_project_id()
                    dxfile = dxpy.DXFile(filePath, project=proj_id)
                except:
                    sys.stderr.write('ERROR: unable to find file "' +
                                     filePath + '": \n')
                    sys.exit(0)  # Do not error on tool run in dx script
    else:
        try:
            if dxlink is not None:
                dxfile = dxpy.get_handler(dxlink)
            else:
                dxfile = dxpy.get_handler(filePath)
        except:
            try:
                dxlink = dxpy.dxlink(filePath)
                dxfile = dxpy.get_handler(dxlink)
            except:
                try:
                    proj_id = env_get_current_project_id()
                    dxfile = dxpy.DXFile(filePath, project=proj_id)
                except:
                    sys.stderr.write('ERROR: unable to find file "' +
                                     filePath + '": \n')
                    sys.exit(0)  # Do not error on tool run in dx script

    if dxfile is None:
        sys.stderr.write('ERROR: unable to find file "' + filePath + '": \n')
        sys.exit(0)  # Do not error on tool run in dx script

    return dxfile
def pooled(files):
    pool_applet = dxpy.find_one_data_object(
        classname='applet',
        name='pool',
        project=dxpy.PROJECT_CONTEXT_ID,
        zero_ok=False,
        more_ok=False,
        return_handler=True)
    logger.debug('input files: %s' % (files))
    logger.debug('input file ids: %s' % ([dxf.get_id() for dxf in files]))
    logger.debug('input files dxlinks: %s'
                 % ([dxpy.dxlink(dxf) for dxf in files]))
    pool_subjob = pool_applet.run(
        {"inputs": [dxpy.dxlink(dxf) for dxf in files]})
    pooled_file = pool_subjob.get_output_ref("pooled")
    return pooled_file
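# Sketch of the lazy-output pattern used above (applet handle and file IDs
# are placeholders): get_output_ref returns a job-based object reference, so
# the result of pooled() can be wired into a downstream run before the
# subjob finishes; the platform resolves it when the subjob completes.
#
#     pooled_ref = pooled([dxpy.DXFile('file-aaaa'), dxpy.DXFile('file-bbbb')])
#     downstream_applet.run({'input_file': pooled_ref})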
def main(cons1, cons2, outroot, xchr=True, recalnums=1, skip=20,
         timemax=7500000.0):
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    cons1 = dxpy.DXFile(cons1)
    cons2 = dxpy.DXFile(cons2)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    dxpy.download_dxfile(cons1.get_id(), "cons1")
    dxpy.download_dxfile(cons2.get_id(), "cons2")

    outname1 = outroot + '.psmcfa'
    outname2 = outroot + '.psmc'

    # Fill in your application code here.
    # create the psmcfa file
    createPSMCfa('cons1', 'cons2', outname1, skip)
    print 'Generated the PSMC fasta file.'
    sys.stdout.flush()

    # run psmc the first time
    subprocess.check_call(['psmc', '-t', '15', '-r', '5', '-p', "4+25*2+4+6",
                           '-o', 'test.psmc', outname1])
    print 'Done with first run of PSMC.'
    sys.stdout.flush()

    # run the recal script and run psmc again.
    while (recalnums > 1):
        (tmaxNew, parfile) = writeRecalFile('test.psmc', timemax, skip, xchr)
        subprocess.check_call(['psmc', '-t', str(round(tmaxNew, 4)), '-i',
                               parfile, '-o', 'test.psmc', outname1])
        recalnums -= 1
        print 'Recals left', recalnums
        sys.stdout.flush()

    (tmaxNew, parfile) = writeRecalFile('test.psmc', timemax, skip, xchr)
    subprocess.check_call(['psmc', '-t', str(round(tmaxNew, 4)), '-i',
                           parfile, '-o', outname2, outname1])
    print 'Finished final recalibration run.'

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system. It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.
    outfile1 = dxpy.upload_local_file(outname1)
    outfile2 = dxpy.upload_local_file(outname2)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {}
    output["outfile1"] = dxpy.dxlink(outfile1)
    output["outfile2"] = dxpy.dxlink(outfile2)

    return output
def process(scattered_input, dme_ix, ncpus, reads_root):
    # Fill in code here to process the input and create output.
    if DEBUG:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    dme_ix = dxpy.DXFile(dme_ix)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    dxpy.download_dxfile(dme_ix.get_id(), "index.tgz")
    fq = dxpy.DXFile(scattered_input)
    name = fq.describe()['name']
    dxpy.download_dxfile(fq.get_id(), name)
    bam_root = name + '_techrep'

    logger.info("* === Calling DNAnexus and ENCODE independent script... ===")
    logger.debug("** DIR: %s" % os.listdir('./'))
    logger.debug(subprocess.check_output(shlex.split('head %s' % name)))
    if os.path.isfile(ALIGN_SCRIPT):
        logger.debug("** Executable %s exists" % ALIGN_SCRIPT)
    else:
        logger.debug("** Executable %s DOES NOT exist" % ALIGN_SCRIPT)
        exit(1)
    align_cmd = '%s index.tgz %s %s %s no_stats' % (ALIGN_SCRIPT, name,
                                                    str(ncpus), bam_root)
    logger.debug('** command line: %s' % align_cmd)
    map_out = subprocess.check_output(shlex.split(align_cmd))
    logger.info("* === Returned from dname_align_se ===")

    # As always, you can choose not to return output if the
    # "postprocess" stage does not require any input, e.g. rows have
    # been added to a GTable that has been created in advance. Just
    # make sure that the "postprocess" job does not run until all
    # "process" jobs have finished by making it wait for "map" to
    # finish using the depends_on argument (this is already done for
    # you in the invocation of the "postprocess" job in "main").
    logger.debug("** DIR: %s" % os.listdir('./'))
    logger.debug("** OUTPUT DIR: %s" % os.listdir('output/'))

    os.rename(bam_root + '_bismark.bam', bam_root + '.bam')
    return {
        "bam_file": dxpy.dxlink(dxpy.upload_local_file(bam_root + '.bam')),
        "report_file": dxpy.dxlink(
            dxpy.upload_local_file(bam_root + '_bismark_map_report.txt'))
    }
def run_wg_build(project, folder, workflow, find_test_data, depends_on):
    wg_input = {
        "construct.reference_genome": dxpy.dxlink(
            find_test_data("hs37d5.fa.gz").get_id()),
        "construct.reference_variants": dxpy.dxlink(
            find_test_data("ALL.wgs.phase3_shapeit2_mvncall_integrated_v5a.20130502.sites.vcf.gz").get_id()),
        "map.reads": dxpy.dxlink(find_test_data("HS1011.mag.gz").get_id())
    }
    git_revision = workflow.describe(
        incl_properties=True)["properties"]["git_revision"]
    test_folder = os.path.join(folder, "whole-genome")
    project.new_folder(test_folder, parents=True)
    return workflow.run(wg_input,
                        project=project.get_id(),
                        folder=test_folder,
                        depends_on=depends_on,
                        priority="normal",
                        name="dxvg whole-genome {}".format(git_revision))
def main(reads1, bwa_aln_params, bwa_version, samtools_version, reads2,
         reference_tar, key, debug):

    if debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    # for each input fastq decide if it's specified as an ENCODE file
    # accession number (ENCFF*)
    reads1_files = [resolve_file(read, key) for read in reads1]
    if len(reads1_files) > 1:
        pool_applet = dxpy.find_one_data_object(
            classname='applet',
            name='pool',
            project=dxpy.PROJECT_CONTEXT_ID,
            zero_ok=False,
            more_ok=False,
            return_handler=True)
        logger.debug('reads1_files: %s' % (reads1_files))
        logger.debug('reads1_files ids: %s'
                     % ([dxf.get_id() for dxf in reads1_files]))
        logger.debug('reads1_files dxlinks: %s'
                     % ([dxpy.dxlink(dxf) for dxf in reads1_files]))
        pool_subjob = pool_applet.run(
            {"inputs": [dxpy.dxlink(dxf) for dxf in reads1_files]})
        reads1_file = pool_subjob.get_output_ref("pooled")
    else:
        reads1_file = reads1_files[0]
    reads2_file = resolve_file(reads2, key)
    reference_tar_file = resolve_file(reference_tar, key)

    logger.info('Resolved reads1 to %s', reads1_file)
    if reads2:
        logger.info('Resolved reads2 to %s', reads2_file)
    logger.info('Resolved reference_tar to %s', reference_tar_file)

    output = {}
    output.update({'reads1': reads1_file})
    if reads2:
        output.update({"reads2": reads2_file})

    output_json = {
        "reads1": reads1_file,
        "reference_tar": reference_tar_file,
        "bwa_aln_params": bwa_aln_params,
        "bwa_version": bwa_version,
        "samtools_version": samtools_version
    }
    if reads2:
        output_json.update({'reads2': reads2_file})
    output.update({'output_JSON': output_json})

    #logger.info('Exiting with output_JSON: %s' % (json.dumps(output)))
    #return {'output_JSON': json.dumps(output)}
    logger.info('Exiting with output: %s' % (output))
    return output
def copy_files(fids, project_id, folder):
    new_fids = []
    for file_dict in fids:
        f = dxpy.DXFile(dxid=file_dict['id'], project=file_dict['project'])
        fn = f.describe()['name']

        # Check to see if file already exists.
        found_file = dxpy.find_one_data_object(classname='file',
                                               project=project_id,
                                               folder=folder,
                                               zero_ok=True,
                                               name=fn)
        if found_file is None:
            new_fids += [dxpy.dxlink(f.clone(project_id, folder))]
        else:
            new_fids += [dxpy.dxlink(found_file)]

    return new_fids
def merge_extract(bam_set, map_report_set, dme_ix_dxlink, uncompress_bam,
                  props):
    '''subjob runs bismark_methylation_extractor on mem1_hdd2_x32'''

    (target_root, biorep_bam) = merge_bams(bam_set, 32)
    (biorep_map, all_reports) = merge_map_reports(map_report_set, target_root)
    (qc_metrics, reads, biorep_bam_qc) = biorep_bam_qc_metrics(target_root,
                                                               all_reports)

    print "* merge_extract(): Retrieve and uncompress index..."
    dme_ix = "dme_index.tar.gz"
    dxpy.download_dxfile(dme_ix_dxlink, dme_ix)
    run_cmd('tar -zxf ' + dme_ix)

    # NOTE: Better to use sam and let extractor use more threads, but this
    # takes up precious storage
    (alignments, ncores) = bam_or_sam(biorep_bam, uncompress_bam, target_root)

    bismark_simple_extract(target_root, alignments, ncores)
    qc_metrics = bismark_qc_metrics(target_root, qc_metrics)

    print "* Retrieve split report..."
    append_line("\n===== bismark_methylation_extractor: splitting_report =====",
                biorep_bam_qc)
    run_cmd('cat %s_splitting_report.txt' % target_root,
            out=biorep_bam_qc, append=True, silent=True)

    # TODO: Is this even needed? Currently we do to get the size!
    #if len(bam_set) > 1:  # Wouldn't need to do this unless there is a merge
    #    print "* merge_extract(): Storing biorep bam..."
    #    props_ex = props.copy()
    #    props_ex.update({ 'reads': str(reads) })
    #    biorep_bam_dxlink = dxpy.dxlink(dxpy.upload_local_file(biorep_bam,
    #        properties=props_ex, details=qc_metrics, wait_on_close=True))
    #else:
    #    biorep_bam_dxlink = bam_set[0]

    print "* merge_extract(): Storing extraction results..."
    biorep_bam_qc_dxfile = dxpy.upload_local_file(biorep_bam_qc,
                                                  properties=props,
                                                  details=qc_metrics)
    biorep_map_dxfile = dxpy.upload_local_file(biorep_map,
                                               properties=props,
                                               details=qc_metrics)
    split_report_dxfile = dxpy.upload_local_file(
        target_root + '_splitting_report.txt')
    chrom_sizes_dxfile = dxpy.upload_local_file('input/chrom.sizes')
    mbias_report_dxfile = dxpy.upload_local_file(
        target_root + '_mbias_report.txt',
        properties=props, details=qc_metrics)
    CpG_context_dxfile = dxpy.upload_local_file(
        'output/CpG_context_%s.txt' % (target_root))
    CHG_context_dxfile = dxpy.upload_local_file(
        'output/CHG_context_%s.txt' % (target_root))
    CHH_context_dxfile = dxpy.upload_local_file(
        'output/CHH_context_%s.txt' % (target_root))

    print "* merge_extract(): Check storage..."
    run_cmd('ls -l')
    run_cmd('df -k .')

    return {
        #"biorep_bam_dxlink": biorep_bam_dxfile,
        "biorep_bam_qc_dxlink": dxpy.dxlink(biorep_bam_qc_dxfile),
        "biorep_map_dxlink": dxpy.dxlink(biorep_map_dxfile),
        "CpG_context_dxlink": dxpy.dxlink(CpG_context_dxfile),
        "CHG_context_dxlink": dxpy.dxlink(CHG_context_dxfile),
        "CHH_context_dxlink": dxpy.dxlink(CHH_context_dxfile),
        "split_report_dxlink": dxpy.dxlink(split_report_dxfile),
        "chrom_sizes_dxlink": dxpy.dxlink(chrom_sizes_dxfile),
        "mbias_report_dxlink": dxpy.dxlink(mbias_report_dxfile),
        "target_root": target_root,
        "qc_metrics": qc_metrics
    }
def find_file(filePath, project=None, verbose=False, multiple=False,
              recurse=True):
    '''Using a DX style file path, find the file.'''
    proj = project
    path = filePath
    fileName = filePath
    if filePath.find(':') != -1:
        proj, path = filePath.split(':', 1)
    if path.rfind('/') != -1:
        path, fileName = path.rsplit('/', 1)
    else:
        fileName = path
        path = '/'
    if proj is None:
        if verbose:
            print "ERROR: Don't know what project to use for '" + path + "'."
        return None
    if proj.find('project-') == 0:
        projId = proj
    else:
        projId = get_project(proj, level='VIEW').get_id()
    mode = 'exact'
    if filePath.find('*') != -1 or filePath.find('?') != -1:
        mode = 'glob'
    fileDicts = list(dxpy.find_data_objects(classname='file', folder=path,
                                            name=fileName, recurse=recurse,
                                            name_mode=mode, project=projId,
                                            return_handler=False))

    if fileDicts is None or len(fileDicts) == 0:
        #print "- Found 0 files from '" + proj + ":" + filePath + "'."
        if verbose:
            print "ERROR: Failed to find '" + proj + ":" + filePath + "'."
        return None
    elif len(fileDicts) > 1 or multiple:
        #print "- Found "+str(len(fileDicts))+" files from '" + proj + ":" + filePath + "'."
        if not multiple:
            if verbose:
                print "ERROR: Found " + str(len(fileDicts)) + \
                      " files when expecting 1 '" + proj + ":" + filePath + "'."
            return None
        else:
            if verbose:
                print "  Found " + str(len(fileDicts)) + " files for '" + \
                      proj + ":" + filePath + "'."
            fids = []
            for fileDict in fileDicts:
                FILES[fileDict['id']] = dxpy.dxlink(fileDict)
                fids.append(fileDict['id'])
            return fids
    else:
        #print "- FOUND '" + proj + ":" + filePath + "'."
        FILES[fileDicts[0]['id']] = dxpy.dxlink(fileDicts[0])
        return fileDicts[0]['id']
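# Illustrative calls (project and paths are placeholders): a unique match
# returns a single file id, while multiple=True accepts glob matches,
# caches each id -> dxlink in FILES, and returns the list of ids.
#
#     fid  = find_file('MyProject:/reads/sample1.fq.gz')
#     fids = find_file('MyProject:/reads/*.fq.gz', multiple=True)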
def run_bwa_mem(sample, fastq_dict, mapper_app_dxid, ref_genome_index,
                project_id):
    '''
    Description: Maps sample fastq files to a reference genome
    Args:
        sample (dict) - sample[<barcode>] = [<fastq files>]
        mapper (dxid)
        ref_genome (dxid)
    '''

    ## Stock DNAnexus BWA-MEM app
    #mapper_app_name = 'bwa_mem_fastq_read_mapper'
    #mapper_app_version = '1.5.0'
    #mapper_app = MapperApp(name=mapper_app_name, version=mapper_app_version)  # DXApp object

    dxpy.set_workspace_id(project_id)

    # Create dict to store mapper app inputs
    mapper_app = dxpy.DXApp(mapper_app_dxid)
    mapper_input = {'genomeindex_targz': dxpy.dxlink(ref_genome_index)}
    # hg19 : file-B6qq53v2J35Qyg04XxG0000V

    # Add fastq files to mapper app input dict
    if len(fastq_dict) == 0:
        print 'Error: No fastq files listed for sample %s' % sample
        sys.exit()
    elif len(fastq_dict) == 1:
        mapper_input['reads_fastqgz'] = dxpy.dxlink(fastq_dict['1'])
    elif len(fastq_dict) == 2:
        mapper_input['reads_fastqgz'] = dxpy.dxlink(fastq_dict['1'])
        mapper_input['reads2_fastqgz'] = dxpy.dxlink(fastq_dict['2'])
    else:
        print 'Error: More than 2 fastq files passed for mapping sample %s' % sample
        sys.exit()
    print mapper_input

    mapper_job = mapper_app.run(mapper_input)
    mapper_output = {
        "BAM": {"job": mapper_job.get_id(), "field": "sorted_bam"},
        "BAI": {"job": mapper_job.get_id(), "field": "sorted_bai"}
    }
    return mapper_output
def sort_bam(job_inputs):
    input_bam = dxpy.DXFile(job_inputs['input_bam'])
    fn = input_bam.describe()['name']
    dxpy.download_dxfile(input_bam.get_id(), fn)

    # Sort and optionally remove unmapped and multimapped reads
    sorted_ofn = os.path.splitext(fn)[0] + '_sorted.bam'
    cmd = '/sambamba sort -t {0} -o /dev/stdout {1} '.format(
        multiprocessing.cpu_count() - 1, fn)
    if job_inputs['quality_filter']:
        cmd += '| /sambamba view -f bam -F "(mapping_quality > 1) and not unmapped" -o /dev/stdout /dev/stdin '
    cmd += '> ' + sorted_ofn
    print cmd
    subprocess.check_call(cmd, shell=True)

    # Count mapped, unique reads.
    cmd = '/sambamba view -f bam -F "(mapping_quality > 1) and not unmapped" -c ' + sorted_ofn
    print cmd
    num_uniquely_mapped_reads = int(
        subprocess.check_output(cmd, shell=True).strip())

    pcr_bottleneck_coefficient = calc_pcr_bottleneck_coefficient(sorted_ofn)

    final_ofn = sorted_ofn
    if job_inputs['remove_duplicates']:
        deduped_ofn = os.path.splitext(sorted_ofn)[0] + '_deduped.bam'
        md_metrics_ofn = os.path.splitext(
            sorted_ofn)[0] + '_deduped_metrics.txt'
        cmd = get_java_cmd()
        cmd += ' -jar /MarkDuplicates.jar I={0} O={1} METRICS_FILE={2} ASSUME_SORTED=true VALIDATION_STRINGENCY=LENIENT REMOVE_DUPLICATES=true '.format(
            sorted_ofn, deduped_ofn, md_metrics_ofn)
        print cmd
        subprocess.check_call(cmd, shell=True)

        bam_file = dxpy.dxlink(dxpy.upload_local_file(deduped_ofn).get_id())
        metrics_file = dxpy.dxlink(
            dxpy.upload_local_file(md_metrics_ofn).get_id())
        final_ofn = deduped_ofn
    else:
        bam_file = dxpy.dxlink(dxpy.upload_local_file(sorted_ofn).get_id())
        metrics_file = None

    return {'output_bam': bam_file,
            'dedup_metrics_file': metrics_file,
            'qc_uniquely_mapped_reads': num_uniquely_mapped_reads,
            'qc_pcr_bottleneck_coefficient': pcr_bottleneck_coefficient}
def map_reads_minimap2(reads, genome_fastagz, genome_mmi, datatype):
    # Download inputs
    reads = [dx_utils.download_and_gunzip_file(f, skip_decompress=True)
             for f in reads]
    ref_genome = dx_utils.download_and_gunzip_file(genome_fastagz)
    ref_genome_mmi = dx_utils.download_and_gunzip_file(genome_mmi)

    # configure preset params
    if datatype == 'PacBio':
        preset_param = 'map-pb'
    else:
        preset_param = 'map-ont'

    # Iterate over reads files
    output_ofns = []
    for read in reads:
        output_prefix = re.sub(r"\.(fastq|fasta|fa|fq)(\.gz)?$", "", read)
        ofn = '{0}.mapped.bam'.format(output_prefix)
        # Get help info
        dx_utils.run_cmd(['minimap2', '-h'])
        # Call minimap2
        minimap2_cmd = ['minimap2', '-ax', preset_param, ref_genome, read]
        view_cmd = ['sambamba', 'view', '--sam-input', '--format=bam',
                    '--compression-level=0', '/dev/stdin']
        sort_cmd = ['sambamba', 'sort', '-m',
                    '{0}G'.format(int(dx_utils.get_memory(suffix='G'))),
                    '-o', ofn, '-t', str(multiprocessing.cpu_count()),
                    '/dev/stdin']
        dx_utils.run_pipe(minimap2_cmd, view_cmd, sort_cmd)
        # index
        dx_utils.run_cmd(['sambamba', 'index', ofn])
        # append to outputs
        output_ofns.append(ofn)

    return {
        'mapped_reads': [dxpy.dxlink(dxpy.upload_local_file(ofn))
                         for ofn in output_ofns],
        'mapped_reads_index': [
            dxpy.dxlink(dxpy.upload_local_file(ofn + '.bai'))
            for ofn in output_ofns]
    }
def main(BAMs, params='USE_THREADING=true SORT_ORDER=coordinate VALIDATION_STRINGENCY=LENIENT'):
    inputFiles = []
    for i in range(len(BAMs)):
        fh = dxpy.DXFile(BAMs[i])
        dxpy.download_dxfile(fh.get_id(), "input%d.bam" % (i))

    name = dxpy.DXFile(BAMs[0]).describe()['name']
    # Strip the .bam extension (str.rstrip would strip a character set, not
    # a suffix).
    if name.endswith(".bam"):
        name = name[:-4]

    # Fill in your application code here.
    command = "java -Xmx4g -jar /opt/jar/MergeSamFiles.jar OUTPUT=%s.bam %s" % (name, params)
    for i in range(len(BAMs)):
        command += " INPUT=input%d.bam" % (i)
    subprocess.check_call(command, shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system. It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.
    BAM = dxpy.upload_local_file("%s.bam" % name)

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {}
    output["BAM"] = dxpy.dxlink(BAM)

    return output
def file_handler_from_fid(fid):
    '''Returns dx file handler from fid.'''
    try:
        dxlink = FILES[fid]
    except KeyError:
        dxlink = dxpy.dxlink(fid)
    return dxpy.get_handler(dxlink)
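# Sketch of the cache contract assumed above: FILES maps a file id to the
# dxlink recorded when the file was first found or copied (see find_file and
# copy_files nearby); an id that was never cached falls back to a bare
# dxpy.dxlink(fid). 'file-xxxx' is a placeholder:
#
#     handler = file_handler_from_fid('file-xxxx')
#     print handler.describe()['name']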
def copy_files(fids, projectId, folder, overwrite=False):
    '''Copies array of dx file dicts to project:/folder,
       returning new array of dx file dicts.'''
    newFids = []
    for fid in fids:
        fileDict = dxpy.describe(FILES[fid])  # FILES contain dxLinks
        if fileDict['project'] == projectId:
            # cannot copy into the same project!!!
            # so just leave in place and pretend that we did!
            #proj = dxpy.DXProject(projectId)
            #proj.move(folder,[fid])
            newFids.append(fid)
            continue

        # Check to see if file already exists.
        alreadyThere = find_file(folder + '/' + fileDict['name'], projectId)
        if alreadyThere is None or overwrite:
            # remove what is alreadyThere?
            #if alreadyThere is not None:
            #    proj = dxpy.DXProject(projectId)
            #    proj.remove_objects([alreadyThere])
            dxFile = dxpy.get_handler(FILES[fid])
            newLink = dxpy.dxlink(dxFile.clone(projectId, folder))
        else:
            newLink = FILES[alreadyThere]

        if newLink is None:
            print "ERROR: Failed in copy of '" + fileDict['project'] + ":" + \
                  fileDict['name'] + "' to '" + projectId + ":" + folder + "'."
            sys.exit(1)

        newDict = dxpy.describe(newLink)
        FILES[newDict['id']] = newLink
        newFids.append(newDict['id'])

    return newFids
def copy_across_regions(local_path, record, dest_region, dest_proj,
                        dest_folder):
    print("copy_across_regions {} {} {} {}:{}".format(
        local_path, record.get_id(), dest_region, dest_proj.get_id(),
        dest_folder))

    # check if we haven't already created this record, and uploaded the file
    dest_asset = find_asset(dest_proj, dest_folder)
    if dest_asset is not None:
        print("Already copied to region {}".format(dest_region))
        return AssetDesc(dest_region, dest_asset.get_id(), dest_proj)

    # upload
    dest_proj.new_folder(dest_folder, parents=True)
    dxfile = upload_local_file(local_path, dest_proj, dest_folder, hidden=True)
    fid = dxfile.get_id()
    dest_asset = dxpy.new_dxrecord(name=record.name,
                                   types=['AssetBundle'],
                                   details={'archiveFileId': dxpy.dxlink(fid)},
                                   properties=record.get_properties(),
                                   project=dest_proj.get_id(),
                                   folder=dest_folder,
                                   close=True)
    return AssetDesc(dest_region, dest_asset.get_id(), dest_proj)
def main(gvcf, N, sample_name_prefix, output_name):
    K = len(gvcf)

    # download all the source gVCFs
    sh("dx-download-all-inputs --parallel")

    # create output directory
    os.mkdir("gvcf")

    # parallel generate gVCF files
    pool = Pool(multiprocessing.cpu_count())
    inputs = [{"source_index": i % K,
               "sample_name_prefix": sample_name_prefix,
               "dest_index": i} for i in xrange(N)]
    pool.map(generate_gvcf_kwargs, inputs)

    # tar and upload
    dxid = subprocess.check_output([
        "/bin/bash", "-e", "-o", "pipefail", "-c",
        'tar cv gvcf | dx upload --brief --destination "{}.tar" -'.format(
            output_name)
    ]).strip()
    return {"tar": dxpy.dxlink(dxid)}
def geneBody_coverage(BAM_file, BED_file):
    dxpy.download_dxfile(BED_file, "genes.bed")
    dxpy.download_dxfile(BAM_file, "mappings.bam")

    # split mappings into chunks that can be done on a single worker
    # all mappings are loaded into RAM so can only do 5 million at a time
    run_shell(" ".join(["samtools", "view", "mappings.bam", "|",
                        "split", "-l 10000000", "-", "split_map"]))
    run_shell(" ".join(["samtools", "view", "-H", "mappings.bam",
                        ">", "header_only.sam"]))
    files = os.listdir(".")
    jobs = []
    for f in files:
        if f.startswith("split_map"):
            # add header
            run_shell(" ".join(["cat", "header_only.sam", f, ">", "temp.sam"]))
            # convert to BAM
            run_shell(" ".join(["samtools", "view", "-S", "-b",
                                "temp.sam", ">", "temp.bam"]))
            # upload file
            split_bam = dxpy.upload_local_file("temp.bam")
            # run analysis
            jobs.append(dxpy.new_dxjob(
                {"BAM_file": dxpy.dxlink(split_bam.get_id()),
                 "BED_file": BED_file}, "run_gbc"))

    run_shell("ls -l")

    gbc_agg_input = {"sub_reports": []}
    for j in jobs:
        gbc_agg_input["sub_reports"].append({"job": j.get_id(),
                                             "field": "file"})
    agg_job = dxpy.new_dxjob(gbc_agg_input, "gbc_agg").get_id()

    return {"results": {"job": agg_job, "field": "cover"}}
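# Note on the output wiring above (based on this function's own field names):
# {"job": <job id>, "field": <field>} is a job-based object reference, so
# geneBody_coverage can return "cover" from the gbc_agg subjob before any
# subjob has finished; the platform fills in the value when gbc_agg is done.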
def test_mapping():
    dxpy.set_workspace_id('project-BpBjyqQ0Jk0Xv2B11Q8P6X59')
    applet = dxpy.find_one_data_object(
        name='bwa_mem_fastq_read_mapper',
        classname='applet',
        return_handler=True,
        zero_ok=False,
        project='project-B406G0x2fz2B3GVk65200003')
    applet.run({
        'genomeindex_targz': dxpy.dxlink('file-B6qq53v2J35Qyg04XxG0000V'),
        'reads_fastqgz': dxpy.dxlink('file-BpBjzFQ0Jk0Xk73YqQgJKg9Z'),
        'reads2_fastqgz': dxpy.dxlink('file-BpBk0400Jk0Xk73YqQgJKg9f')
    })
def upload_lane_html(self, raw_properties, tags):
    '''Upload lane.html file to DNAnexus project.

    Args:
        raw_properties (dict): Properties with values of different types.
        tags (list): Tags to apply to the uploaded file.

    Returns:
        str: DXLink to lane.html file on DNAnexus object store.
    '''
    # Convert all property values to strings
    properties = {key: str(value) for key, value in raw_properties.items()}
    properties['file_type'] = 'lane_html'

    project_folder = '{}/miscellany'.format(self.project_path)
    local_file_path = (
        '{}/Reports/html/'.format(LOCAL_OUTPUT) +
        '{}/all/all/all/lane.html'.format(properties['flowcell_id']))
    remote_file_name = '{}_L{}.lane.html'.format(properties['run_name'],
                                                 properties['lane_index'])
    lane_html_dxid = dxpy.upload_local_file(filename=local_file_path,
                                            name=remote_file_name,
                                            properties=properties,
                                            tags=tags,
                                            project=self.project_dxid,
                                            folder=project_folder,
                                            parents=True)
    return dxpy.dxlink(lane_html_dxid)
def upload_tools_used(self, tools_used_dict, raw_properties):
    '''Write console commands to Tools Used file & upload.

    Args:
        tools_used_dict (dict): Description of executables & configurations.
        raw_properties (dict): Properties with values of different types.

    Returns:
        str: DXLink to "tools used" file on DNAnexus object store.
    '''
    # Convert all property values to strings
    properties = {key: str(value) for key, value in raw_properties.items()}
    properties['file_type'] = 'tools_used'

    # Write file
    local_file_path = 'bcl2fastq_tools_used.json'
    with open(local_file_path, 'w') as TOOLS:
        TOOLS.write(json.dumps(tools_used_dict))

    # Upload file
    project_folder = '{}/miscellany'.format(self.project_path)
    tools_used_dxid = dxpy.upload_local_file(filename=local_file_path,
                                             properties=properties,
                                             project=self.project_dxid,
                                             folder=project_folder,
                                             parents=True)
    return dxpy.dxlink(tools_used_dxid)
def test_unpaired(self):
    # Look up the inputs with zero_ok=True so a missing object comes back as
    # None (subscripting with ['id'] first would raise before the None check).
    bed_file = dxpy.find_one_data_object(name="hg19_GRCh37_Feb2009_RefSeq.bed",
                                         zero_ok=True)
    mappings = dxpy.find_one_data_object(name="unpaired_RNA-Seq_mappings",
                                         typename="LetterMappings",
                                         zero_ok=True)
    if bed_file is None:
        print "Cannot find hg19_GRCh37_Feb2009_RefSeq.bed. Please upload it"
        return False
    if mappings is None:
        print "Cannot find unpaired_RNA-Seq_mappings. Please upload it"
        return False
    job_input = {
        'rna_seq_mappings': dxpy.dxlink(mappings['id']),
        'bed_file': dxpy.dxlink(bed_file['id'])
    }
    print "Running program with", job_input
    job = self.program.run(job_input)
    print "launched test_unpaired ", job.get_id()
def main(fastq, genomeindex_targz):
    # Download the FASTQ and the BWA genome index.
    fastq_dxfile = dxpy.DXFile(fastq)
    dxpy.download_dxfile(fastq_dxfile.get_id(), "input.fastq")
    genome_dxfile = dxpy.DXFile(genomeindex_targz)
    dxpy.download_dxfile(genome_dxfile.get_id(), "genome.tar.gz")

    # Unpack the index and recover the genome basename from the .bwt file.
    os.makedirs("genome")
    tar_cmd = "tar xzvf genome.tar.gz -C genome"
    subprocess.check_call(tar_cmd, shell=True)
    genome_file = glob.glob("genome/*.bwt")[0]
    genome_file = re.sub(r"\.bwt$", "", genome_file)

    # Map, convert to BAM, and sort in one pipeline.
    bwa_cmd = (
        "bwa mem -t {nproc} {genome} {fastq} | "
        "samtools view -u -S - | "
        "samtools sort -m 256M -@ {nproc} - output".format(
            nproc=multiprocessing.cpu_count(),
            genome=genome_file,
            fastq="input.fastq"
        )
    )
    subprocess.check_call(bwa_cmd, shell=True)

    bam = dxpy.upload_local_file("output.bam")

    # The following lines fill in the output and assume that you have created
    # variables with the same names as your output fields.
    output = {}
    output["bam"] = dxpy.dxlink(bam)
    return output
def produce_qc_report(individual_json_outputs, sample_name, output_project,
                      output_folder, properties=None):
    """Combine the various statistics collected into a single dict for output."""
    if properties is None:
        properties = {}
    output = {'Sample name': sample_name}
    misc_subfolder = output_folder + '/miscellany'

    # Merge the per-tool dicts; nested dicts under the same key are combined.
    for j in individual_json_outputs:
        for k in j:
            if k in output:
                output[k].update(j[k])
            else:
                output[k] = j[k]

    ofn = sample_name + '_stats.json'
    with open(ofn, 'w') as output_fh:
        output_fh.write(json.dumps(output))

    properties['file_type'] = 'qc_stats'
    output_json_file = dxpy.upload_local_file(filename=ofn,
                                              project=output_project,
                                              properties=properties,
                                              folder=misc_subfolder,
                                              parents=True)
    return {'combined_json_file': dxpy.dxlink(output_json_file)}
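# A hedged illustration of the merge semantics above: nested dicts under the
# same top-level key are combined with dict.update(), so later subjob outputs
# win on conflicting subkeys. The stats and project values are hypothetical.
stats = [{'alignment': {'total_reads': 1000}},
         {'alignment': {'duplicate_reads': 50}}]
produce_qc_report(stats, 'sample1', 'project-xxxx', '/qc')
# Uploads sample1_stats.json containing:
# {"Sample name": "sample1",
#  "alignment": {"total_reads": 1000, "duplicate_reads": 50}}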
def main(input_file):
    # Initialize the data object input on the platform into a
    # dxpy.DXDataObject instance that can be used immediately.
    input_file = dxpy.DXFile(input_file)

    # Download the file input to the local file system.
    dxpy.download_dxfile(input_file.get_id(), "input_file")

    # Trim low-quality bases (Phred < 20, offset 33) from the reads.
    subprocess.check_call(
        "fastq_quality_trimmer -t 20 -Q 33 -i input_file -o output_file",
        shell=True)

    # Upload the file output after creating it on the local file system. This
    # assumes the output field name was used as the filename.
    output_file = dxpy.upload_local_file("output_file")

    # Fill in the output, assuming the variable name matches the output field.
    output = {}
    output["output_file"] = dxpy.dxlink(output_file)
    return output
def main(inputs):
    # Download each file input, keeping the platform filenames.
    input_filenames = []
    for input_file in inputs:
        dxf = dxpy.DXFile(input_file)
        input_filenames.append(dxf.name)
        dxpy.download_dxfile(dxf.get_id(), dxf.name)

    # Use the last file's inner extension (e.g. ".fastq" from ".fastq.gz");
    # presumably they are all the same.
    extension = splitext(splitext(input_filenames[-1])[0])[1]
    pooled_filename = '-'.join(
        [splitext(splitext(fn)[0])[0] for fn in input_filenames]
    ) + "_pooled%s.gz" % (extension)

    # Decompress all inputs, concatenate them, and recompress into one file.
    out, err = run_pipe(
        ['gzip -dc %s' % (' '.join(input_filenames)), 'gzip -c'],
        outfile=pooled_filename)
    pooled = dxpy.upload_local_file(pooled_filename)

    # Fill in the output, assuming the variable name matches the output field.
    output = {}
    output["pooled"] = dxpy.dxlink(pooled)
    return output
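# run_pipe is assumed to be a module-level helper. A minimal sketch of one
# plausible implementation, chaining shell stages with subprocess.Popen and
# redirecting the final stage to outfile when given; the real helper may
# differ.
def run_pipe(steps, outfile=None):
    processes = []
    prev = None
    for i, step in enumerate(steps):
        last = (i == len(steps) - 1)
        stdout = open(outfile, 'wb') if (last and outfile) else subprocess.PIPE
        p = subprocess.Popen(step, shell=True,
                             stdin=prev.stdout if prev else None,
                             stdout=stdout)
        if prev is not None:
            prev.stdout.close()  # let SIGPIPE propagate to earlier stages
        if stdout is not subprocess.PIPE:
            stdout.close()       # the child holds its own handle to the file
        prev = p
        processes.append(p)
    out, err = processes[-1].communicate()
    return out, err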
def main(repo_url, ref='master', credentials=None, build_options=None):
    clone_dir = app_builder.clone_repository(repo_url, ref=ref,
                                             credentials=credentials)
    applet_id = app_builder.create_applet(clone_dir, build_options=build_options)
    return {'output_applet': dxpy.dxlink(applet_id)}
def compile_output_generic(oname, ovalue):
    if isinstance(ovalue, list):
        return [compile_output_generic(oname, x) for x in ovalue]
    elif isinstance(ovalue, dict):
        if is_output_file(ovalue):
            def remove_prefix(text, prefix):
                if text.startswith(prefix):
                    return text[len(prefix):]
                return text

            def upload_file(ovalue):
                sh("unset DX_WORKSPACE_ID && dx cd $DX_PROJECT_CONTEXT_ID: && "
                   "dx mkdir -p {}".format(folder))
                return dxpy.dxlink(dxpy.upload_local_file(
                    remove_prefix(ovalue['location'], "file://"),
                    wait_on_close=True,
                    project=dxpy.PROJECT_CONTEXT_ID,
                    folder=folder))

            if skip_downloads:
                files = dxpy.dxlink(
                    open(remove_prefix(ovalue['location'], "file://")).read().rstrip())
            else:
                files = upload_file(ovalue)
            if 'secondaryFiles' in ovalue:
                files = {'primaryFile': files,
                         'secondaryFiles': compile_output_generic(
                             oname, ovalue['secondaryFiles'])}
            return files
        # TODO: This feature needs to be completed to reset env here, smartly
        # check whether files exist already, and work for inputs.
        elif is_output_directory(ovalue):
            sh("unset DX_WORKSPACE_ID && dx cd $DX_PROJECT_CONTEXT_ID: && "
               "dx upload -r {}".format(ovalue['path']))
            return ovalue
        else:
            return {k: compile_output_generic(k, v) for k, v in ovalue.items()}
    else:
        return ovalue
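# is_output_file and is_output_directory are assumed to be defined elsewhere
# in this module. A minimal sketch based on the CWL convention that File and
# Directory values are dicts carrying a "class" key.
def is_output_file(ovalue):
    return ovalue.get('class') == 'File'

def is_output_directory(ovalue):
    return ovalue.get('class') == 'Directory'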
def postprocess(**inputs):
    kwargs = inputs["kwargs"]
    subjob_outputs = inputs["subjob_outputs"]
    print "\nMerging outputs from {n} subjobs".format(n=len(subjob_outputs))

    output_prefix = kwargs["output_prefix"]
    variant_suffixes = kwargs["variant_suffixes"]

    # Append each subjob's file of a given variant type onto one local file.
    app_output_fn = {}
    for subjob_output in subjob_outputs:
        for variant_type, link in subjob_output.iteritems():
            file_id = link["$dnanexus_link"]
            filename = output_prefix + "_" + variant_suffixes[variant_type]
            print "Downloading " + str(file_id) + " into " + filename
            dxpy.download_dxfile(dxid=file_id, filename=filename, append=True)
            app_output_fn[variant_type] = filename

    # Renumber the merged calls where needed, then upload.
    postprocess_outputs = {}
    need_to_renumber = ["deletions", "short_inserts", "tandem_duplications",
                        "inversions", "large_inserts"]
    for variant_type, fn in app_output_fn.iteritems():
        out_fn = fn
        if variant_type in need_to_renumber:
            out_fn = RenumberMergedOutput(fn, fn + "_renumbered")
        print "\nUploading {file} as {fn}".format(file=out_fn, fn=fn)
        postprocess_outputs[variant_type] = dxpy.dxlink(
            dxpy.upload_local_file(out_fn, name=fn))

    if kwargs["export_vcf"]:
        DownloadRefFasta(kwargs["reference_fasta"])
        postprocess_outputs["vcf"] = ExportVCF(kwargs=kwargs,
                                               output_path=output_prefix,
                                               ref_fn="reference_fasta")

    return postprocess_outputs
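# A hedged sketch of how a scatter step might hand its subjobs' outputs to the
# postprocess entry point via job-based object references, so postprocess only
# starts once every subjob has finished. The subjobs list and the "variants"
# output field name are assumptions.
postprocess_job = dxpy.new_dxjob(
    fn_input={"kwargs": kwargs,
              "subjob_outputs": [j.get_output_ref("variants") for j in subjobs]},
    fn_name="postprocess")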
def combine_files(countDXlinks, resultfn):
    """The 'gather' subjob of the applet.

    Arguments:
        countDXlinks (list[dict]): List of DXLinks to process job output files.
        resultfn (str): Filename to use for job output file.

    Returns:
        DXLink for the main function to return as the job output.

    Note: Only the DXLinks are passed as parameters. Subjobs work on a fresh
    instance, so files must be downloaded to the machine.
    """
    if resultfn.endswith(".bam"):
        resultfn = resultfn[:-4] + '.txt'

    sum_reads = 0
    with open(resultfn, 'w') as f:
        for i, dxlink in enumerate(countDXlinks):
            dxfile = dxpy.DXFile(dxlink)
            filename = "countfile{0}".format(i)
            dxpy.download_dxfile(dxfile, filename)
            with open(filename, 'r') as fsub:
                for line in fsub:
                    sum_reads += parse_line_for_readcount(line)
                    f.write(line)
        f.write('Total Reads: {0}'.format(sum_reads))

    countDXFile = dxpy.upload_local_file(resultfn)
    countDXlink = dxpy.dxlink(countDXFile.get_id())
    return {"countDXLink": countDXlink}
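# parse_line_for_readcount is assumed to be defined elsewhere in the applet.
# A minimal sketch under the assumption that each count-file line ends with an
# integer read count; the real parser may differ.
def parse_line_for_readcount(line):
    fields = line.strip().split()
    try:
        return int(fields[-1])
    except (IndexError, ValueError):
        return 0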