def test_workflow_string_not_null(self):
    """An optional string routed through assert_not_null should still translate to CWL."""
    wf = WorkflowBuilder("wf")
    wf.input("inp", Optional[str])
    wf.output("out", source=wf.inp.assert_not_null())
    translated = wf.translate("cwl", allow_empty_container=True, to_console=False)[0]
    print(translated)
def process_subpipeline(**connections):
    """Build the BQSR sub-workflow: BaseRecalibrator followed by ApplyBQSR."""
    wf = WorkflowBuilder("somatic_subpipeline")

    # Inputs
    wf.input("bam", BamBai)
    wf.input("intervals", Bed)
    wf.input("reference", FastaWithDict)
    wf.input("known_sites", Array(VcfTabix))

    # Generate the recalibration table from the known variant sites
    wf.step(
        "base_recalibrator",
        gatk4.Gatk4BaseRecalibratorLatest(
            bam=wf.bam,
            intervals=wf.intervals,
            reference=wf.reference,
            knownSites=wf.known_sites,
        ),
    )

    # Apply the recalibration table to the input BAM
    wf.step(
        "apply_bqsr",
        gatk4.Gatk4ApplyBqsrLatest(
            bam=wf.bam,
            recalFile=wf.base_recalibrator.out,
            intervals=wf.intervals,
            reference=wf.reference,
        ),
    )

    wf.output("out", source=wf.apply_bqsr.out)
    return wf(**connections)
def test_string_formatter(self):
    """A StringFormatter used as an input default should become a conditional CWL valueFrom."""
    wf = WorkflowBuilder("wf")
    wf.input("sampleName", str)
    wf.input("platform", str)
    wf.input(
        "readGroupHeaderLine",
        String(optional=True),
        default=StringFormatter(
            "@RG\\tID:{name}\\tSM:{name}\\tLB:{name}\\tPL:{pl}",
            name=InputSelector("sampleName"),
            pl=InputSelector("platform"),
        ),
    )
    wf.step("print", EchoTestTool(inp=wf.readGroupHeaderLine))
    wf.output("out", source=wf.print)

    translated, _ = cwl.CwlTranslator.translate_workflow(
        wf, with_container=False, allow_empty_container=True
    )
    step_ins = translated.save()["steps"][0]["in"]
    self.assertEqual(4, len(step_ins))

    # The default only applies when the input is null, hence the ternary expression.
    expression = step_ins[-1]["valueFrom"]
    expected = (
        "$((inputs._print_inp_readGroupHeaderLine != null) "
        "? inputs._print_inp_readGroupHeaderLine "
        ': "@RG\\\\tID:{name}\\\\tSM:{name}\\\\tLB:{name}\\\\tPL:{pl}".replace(/\\{name\\}/g, inputs._print_inp_sampleName).replace(/\\{pl\\}/g, inputs._print_inp_platform))'
    )
    self.assertEqual(expected, expression)
def test_add_output(self):
    """Adding an output sourced from a step should register exactly one output node.

    FIX: the builder identifier was previously "test_add_input" — a copy-paste
    slip from the sibling add-input test; it now matches this test's name.
    (No assertion depends on the identifier, so behavior is otherwise unchanged.)
    """
    w = WorkflowBuilder("test_add_output")
    w.step("stp", SingleTestTool(), ignore_missing=True)
    w.output("outputStep", str, source=w.stp)

    self.assertEqual(len(w.output_nodes), 1)
    self.assertEqual(w.outputStep, next(iter(w.output_nodes.values())))
    self.assertIsNotNone(w.nodes["stp"])
def test_alias_selector(self):
    """An as_type alias on a step input should still resolve to the plain input source."""
    wf = WorkflowBuilder("wf")
    wf.input("inp", str)
    wf.step("echo", EchoTestTool(inp=wf.inp.as_type(str)))
    wf.output("out", source=wf.echo.out)

    steps: List[cwlgen.WorkflowStep] = cwl.translate_step_node(
        wf.step_nodes["echo"], inputs_dict={"inp": ToolInput("inp", str)}
    )
    self.assertEqual("inp", steps[0].in_[0].source)
def test_output_name_and_folder(self):
    """Batch modification should accept outputs whose name/folder are driven by an input."""
    wf = WorkflowBuilder("wf")
    wf.input("inp", str)
    wf.step("print", Echo(inp=wf.inp))
    wf.output("out", source=wf.print, output_name=wf.inp, output_folder=[wf.inp])

    batch_inputs = {"inp": ["test1", "test2"]}
    modifier = BatchPipelineModifier(BatchRunRequirements(["inp"], "inp"))
    modified_workflow = modifier.tool_modifier(wf, batch_inputs, {})
    print(modified_workflow)
def process_subpipeline(**connections):
    """Sub-workflow that splits a BAM over (optional) intervals with GATK SplitReads."""
    wf = WorkflowBuilder("split_bam_subpipeline")

    wf.input("bam", BamBai)
    wf.input("intervals", Bed(optional=True))

    wf.step("split_bam", gatk4.Gatk4SplitReads_4_1_3(bam=wf.bam, intervals=wf.intervals))
    wf.output("out", source=wf.split_bam.out)

    return wf(**connections)
def test_read_contents(self):
    """Sourcing an output from .contents() should add an expression step with loadContents."""
    wf = WorkflowBuilder("wf")
    wf.input("inp", str)
    wf.step("stp", EchoTestTool(inp=wf.inp))
    wf.output("out", source=wf.stp.out.contents())

    translated = cwl.CwlTranslator().translate_workflow(wf, with_container=False)[0]

    # One real step plus the generated read-contents expression step
    self.assertEqual(2, len(translated.steps))
    self.assertEqual(
        "${return {out: inputs._stpout.contents }}", translated.steps[1].run.expression
    )
    self.assertTrue(translated.steps[1].run.inputs[0].loadContents)
def test_expression_default(self):
    """An operator expression built from an optional input should translate to CWL."""
    wf = WorkflowBuilder("test_expression_defaults")
    wf.input("inp", Optional[str])

    greeting = "Hello, " + If(IsDefined(wf.inp), wf.inp, ", Michael!")
    wf.step("echo", EchoTestTool(inp=greeting))
    wf.output("out", source=wf.echo)

    wf.translate("cwl")
def test_subworkflow(self):
    """A nested WorkflowBuilder should be usable as a step of an outer workflow."""
    outer = WorkflowBuilder("test_subworkflow")

    inner = WorkflowBuilder("subworkflow")
    inner.input("sub_inp", str)
    inner.step("sub_stp", SingleTestTool(inputs=inner.sub_inp))
    inner.output("sub_out", source=inner.sub_stp.out)

    outer.input("inp", str)
    outer.step("stp_workflow", inner(sub_inp=outer.inp))
    outer.output("out", source=outer.stp_workflow.sub_out)

    # would be good to come up with some tests
    # outer.translate("wdl")
    self.assertTrue(True)
def setUpClass(cls):
    """Build the fixture workflows shared by the tests in this class."""
    w = WorkflowBuilder("test_operators")
    w.input("inp", Array(File()))
    inval = w.inp[0].basename()
    w.step("echo", SingleTestTool(input1=inval))
    w.output("out", source=w.echo)
    cls.wf = w

    w2 = WorkflowBuilder("test_scattered_operator_with_alias")
    w2.input("inp", Array(Array(String)))
    w2.step("echo", SingleTestTool(input1=w2.inp[0]), scatter="input1")
    # BUG FIX: this output was previously sourced from `w.echo`, i.e. the step
    # node of the OTHER workflow above — it must reference this workflow's own
    # step node.
    w2.output("out", source=w2.echo)
    cls.wf2 = w2
def recursively_build_workflow_with_layers(layers):
    """Recursively build a workflow of `layers` nested, scattered hashing pipelines.

    Each layer generates random ints, hashes them (scattered), and — when
    layers > 0 — embeds the next layer down as a scattered sub-workflow,
    combining its joined hashes with this layer's before a final MD5 step.
    """
    wf = WorkflowBuilder(f"scattered_with_{layers}")

    wf.input("scatters", int, default=3)
    wf.input("seed_hash", Optional[str])
    wf.input("bias", Optional[int])

    wf.step("generate_random_ints", GenerateIntegers(numbers_to_generate=wf.scatters))
    wf.step(
        "generate_hashes",
        CalculateMd5HashOfInt(value=wf.generate_random_ints.out),
        scatter="value",
    )
    joined_generate_hashes = JoinOperator(wf.generate_hashes.out, ",")

    if layers > 0:
        # Recurse one level down and scatter the inner workflow over the ints.
        inner_wf = recursively_build_workflow_with_layers(layers - 1)
        wf.step(
            "inner",
            inner_wf(scatters=wf.generate_random_ints),
            scatter="scatters",
        )
        joined_inner = JoinOperator(wf.inner.out_hash, ",")
        seeded_inner = j.logical.If(
            j.logical.IsDefined(wf.seed_hash),
            wf.seed_hash + joined_inner,
            joined_inner,
        )
        post_hash_inp = seeded_inner + joined_generate_hashes
    else:
        post_hash_inp = j.logical.If(
            j.logical.IsDefined(wf.seed_hash),
            wf.seed_hash + joined_generate_hashes,
            joined_generate_hashes,
        )

    wf.step("post_hash", CalculateMd5Hash(value=post_hash_inp))
    wf.output("out_hash", source=wf.post_hash.out)
    return wf
def process_subpipeline(**connections):
    """Alignment sub-workflow: FastQC, adapter parsing, BWA align/sort, merge + mark dups."""
    wf = WorkflowBuilder("somatic_subpipeline")

    # Inputs
    wf.input("reference", FastaWithDict)
    wf.input("reads", Array(FastqGzPair))
    wf.input("cutadapt_adapters", File(optional=True))
    wf.input("sample_name", String)

    # Quality-control each read pair
    wf.step("fastqc", FastQC_0_11_5(reads=wf.reads), scatter="reads")

    # Derive cutadapt adapter sequences from the FastQC data files
    wf.step(
        "getfastqc_adapters",
        ParseFastqcAdaptors(
            fastqc_datafiles=wf.fastqc.datafile,
            cutadapt_adaptors_lookup=wf.cutadapt_adapters,
        ),
        scatter="fastqc_datafiles",
    )

    # Align each FASTQ pair with its matched adapter information
    wf.step(
        "align_and_sort",
        BwaAligner(
            fastq=wf.reads,
            reference=wf.reference,
            sample_name=wf.sample_name,
            sortsam_tmpDir=".",
            cutadapt_adapter=wf.getfastqc_adapters,
            cutadapt_removeMiddle3Adapter=wf.getfastqc_adapters,
        ),
        scatter=["fastq", "cutadapt_adapter", "cutadapt_removeMiddle3Adapter"],
    )

    # Merge the per-pair BAMs and mark duplicates
    wf.step(
        "merge_and_mark",
        MergeAndMarkBams_4_1_3(bams=wf.align_and_sort.out, sampleName=wf.sample_name),
    )

    # Outputs
    wf.output("out", source=wf.merge_and_mark.out)
    wf.output("reports", source=wf.fastqc.out, output_folder=[wf.sample_name, "reports"])

    return wf(**connections)
def test_with_str_default(self):
    # A string input with a default should translate to an optional WDL input,
    # with the output wrapped in select_first([inp, default]).
    w = WorkflowBuilder("wf")
    w.input("inp", str, default="hello")
    w.output("out", source=w.inp)
    out, _, _ = w.translate("wdl", to_console=False)
    # NOTE(review): the expected WDL literal below lost its original line breaks
    # in this copy of the file; it is preserved verbatim rather than re-guessed.
    expected = """\ version development workflow wf { input { String? inp } output { String out = select_first([inp, "hello"]) } }"""
    self.assertEqual(expected, out)
def test_with_int_default(self):
    # Same as the string-default test, but for an integer input: the WDL input
    # becomes optional and the output uses select_first([inp, 0]).
    w = WorkflowBuilder("wf")
    w.input("inp", int, default=0)
    w.output("out", source=w.inp)
    out, _, _ = w.translate("wdl", to_console=False)
    # NOTE(review): the expected WDL literal below lost its original line breaks
    # in this copy of the file; it is preserved verbatim rather than re-guessed.
    expected = """\ version development workflow wf { input { Int? inp } output { Int out = select_first([inp, 0]) } }"""
    self.assertEqual(expected, out)
def process_subpipeline(**connections):
    """Sub-workflow running samtools mpileup over the positions of a VCF."""
    wf = WorkflowBuilder("samtools_mpileup_subpipeline")

    wf.input("vcf", Vcf)
    wf.input("bam", BamBai)
    wf.input("reference", FastaWithDict)

    wf.step(
        "samtools_mpileup",
        SamToolsMpileupLatest(
            bam=wf.bam,
            positions=wf.vcf,
            reference=wf.reference,
            countOrphans=True,
            noBAQ=True,
            minBQ=0,
            maxDepth=10000,
        ),
    )

    wf.output("out", source=wf.samtools_mpileup.out)
    return wf(**connections)
def test_filter_null(self):
    """Scattering over FilterNull should insert a pre-scatter expression step."""
    echo_tool = CommandToolBuilder(
        tool="testsingleinput",
        base_command="echo",
        inputs=[ToolInput("inp", str, position=0)],
        outputs=[ToolOutput("out", Stdout)],
        version="v1",
        container=None,
    )

    wf = WorkflowBuilder("wf")
    wf.input("inp", Array(Optional[str], optional=True))
    wf.step("stp", echo_tool(inp=FilterNullOperator(wf.inp)), scatter="inp")
    wf.output("out", source=wf.stp.out)

    translated = cwl.CwlTranslator().translate_workflow(wf, with_container=False)[0]

    # The filter expression is evaluated in a generated step before the scatter.
    self.assertEqual(2, len(translated.steps))
    self.assertEqual(
        "_evaluate_prescatter-stp-inp/out", translated.steps[1].in_[0].source
    )
def test_simple(self):
    # Smoke test: a passthrough workflow (input wired straight to output)
    # should produce a minimal WDL document.
    w = WorkflowBuilder("wf")
    w.input("inp", str)
    w.output("out", source=w.inp)
    out, _, _ = w.translate("wdl", to_console=False)
    # NOTE(review): the expected WDL literal below lost its original line breaks
    # in this copy of the file; it is preserved verbatim rather than re-guessed.
    expected = """\ version development workflow wf { input { String inp } output { String out = inp } }"""
    self.assertEqual(expected, out)
def test_array_step_input(self):
    """An array step input may mix conditional expressions and operator results."""
    wf = WorkflowBuilder("cwl_test_array_step_input")
    wf.input("inp1", Optional[str])
    wf.input("inp2", Optional[str])
    # FIX: a stray trailing comma previously turned this step statement into a
    # one-element tuple expression; harmless at runtime, but misleading to read.
    wf.step(
        "print",
        ArrayTestTool(
            inps=[
                If(IsDefined(wf.inp1), wf.inp1, "default1"),
                If(IsDefined(wf.inp2), wf.inp2 + "_suffix", ""),
            ]
        ),
    )
    wf.output("out", source=wf.print)
    ret, _, _ = wf.translate("cwl", allow_empty_container=True, to_console=False)
    self.maxDiff = None
    self.assertEqual(cwl_arraystepinput, ret)
def test_basic(self):
    """A step guarded by `when` should gain a when-expression and an extra step input."""
    wf = WorkflowBuilder("my_conditional_workflow")
    wf.input("inp", String(optional=True))
    wf.step(
        "print_if_has_value",
        TestTool(testtool=wf.inp),
        # only print if the input "inp" is defined.
        when=IsDefined(wf.inp),
    )
    wf.output("out", source=wf.print_if_has_value)

    inputs_dict = {"inp": ToolInput("inp", str)}
    translated_step = cwl.translate_step_node(
        wf.print_if_has_value, inputs_dict=inputs_dict
    )[0]

    self.assertEqual("$((inputs.__when_inp != null))", translated_step.when)
    when_input: cwlgen.WorkflowStepInput = translated_step.in_[-1]
    self.assertEqual("__when_inp", when_input.id)
def test_string_formatter_stepinput(self):
    """A StringFormatter passed directly as a step input becomes a CWL valueFrom expression."""
    wf = WorkflowBuilder("wf")
    wf.input("sampleName", str)
    wf.input("platform", str)

    header_line = StringFormatter(
        "@RG\\tID:{name}\\tSM:{name}\\tLB:{name}\\tPL:{pl}",
        name=wf.sampleName,
        pl=wf.platform,
    )
    wf.step("print", EchoTestTool(inp=header_line))
    wf.output("out", source=wf.print)

    translated, _ = cwl.CwlTranslator.translate_workflow(
        wf, with_container=False, allow_empty_container=True
    )
    step_ins = translated.save()["steps"][0]["in"]
    self.assertEqual(3, len(step_ins))

    expression = step_ins[-1]["valueFrom"]
    expected = '$("@RG\\\\tID:{name}\\\\tSM:{name}\\\\tLB:{name}\\\\tPL:{pl}".replace(/\\{name\\}/g, inputs._print_inp_sampleName).replace(/\\{pl\\}/g, inputs._print_inp_platform))'
    self.assertEqual(expected, expression)
def process_subpipeline(**connections):
    """Germline alignment sub-workflow: QC, align/sort, mark duplicates, performance summary."""
    wf = WorkflowBuilder("somatic_subpipeline")

    # INPUTS
    wf.input("reads", Array(FastqGzPair))
    wf.input("sample_name", String)
    wf.input("reference", FastaWithDict)
    wf.input("cutadapt_adapters", File(optional=True))
    wf.input("gatk_intervals", Array(Bed))
    wf.input("snps_dbsnp", VcfTabix)
    wf.input("snps_1000gp", VcfTabix)
    wf.input("known_indels", VcfTabix)
    wf.input("mills_indels", VcfTabix)

    # STEPS
    wf.step("fastqc", FastQC_0_11_8(reads=wf.reads), scatter="reads")
    wf.step(
        "getfastqc_adapters",
        ParseFastqcAdaptors(
            fastqc_datafiles=wf.fastqc.datafile,
            cutadapt_adaptors_lookup=wf.cutadapt_adapters,
        ),
        scatter="fastqc_datafiles",
    )
    wf.step(
        "align_and_sort",
        BwaAligner(
            fastq=wf.reads,
            reference=wf.reference,
            sample_name=wf.sample_name,
            sortsam_tmpDir=None,
            cutadapt_adapter=wf.getfastqc_adapters,
            cutadapt_removeMiddle3Adapter=wf.getfastqc_adapters,
        ),
        scatter=["fastq", "cutadapt_adapter", "cutadapt_removeMiddle3Adapter"],
    )
    wf.step(
        "merge_and_mark",
        MergeAndMarkBams_4_1_3(bams=wf.align_and_sort.out, sampleName=wf.sample_name),
    )

    # Temporarily remove GATK4 DepthOfCoverage for performance reasons, see:
    # https://gatk.broadinstitute.org/hc/en-us/community/posts/360071895391-Speeding-up-GATK4-DepthOfCoverage
    # wf.step(
    #     "coverage",
    #     Gatk4DepthOfCoverage_4_1_6(
    #         bam=wf.merge_and_mark.out,
    #         reference=wf.reference,
    #         intervals=wf.gatk_intervals,
    #         omitDepthOutputAtEachBase=True,
    #         # countType="COUNT_FRAGMENTS_REQUIRE_SAME_BASE",
    #         summaryCoverageThreshold=[1, 50, 100, 300, 500],
    #         outputPrefix=wf.sample_name,
    #     ),
    # )

    wf.step(
        "calculate_performancesummary_genomefile",
        GenerateGenomeFileForBedtoolsCoverage(reference=wf.reference),
    )
    wf.step(
        "performance_summary",
        PerformanceSummaryGenome_0_1_0(
            bam=wf.merge_and_mark.out,
            sample_name=wf.sample_name,
            genome_file=wf.calculate_performancesummary_genomefile.out,
        ),
    )

    # OUTPUTS
    wf.output("out_bam", source=wf.merge_and_mark.out)
    wf.output("out_fastqc_reports", source=wf.fastqc.out)
    # wf.output("depth_of_coverage", source=wf.coverage.out_sampleSummary)
    wf.output(
        "out_performance_summary",
        source=wf.performance_summary.performanceSummaryOut,
    )

    return wf(**connections)
def tool_modifier(self, tool: Tool, inputs: Dict, hints: Dict[str, str]) -> Tool:
    """Wrap `tool` in a batch workflow: one step per group_by value.

    Non-batched inputs become single shared workflow inputs; each field in
    self.batch.fields gets a per-group input named "{field}_{groupvalue}".
    Every tool output is re-exposed once per group as "{groupvalue}_{outid}".
    """
    # Build custom pipeline
    w = WorkflowBuilder(tool.id(), friendly_name=tool.friendly_name(), version=tool.version())
    ins = tool.tool_inputs()
    insdict = {i.id(): i for i in ins}
    fields = set(self.batch.fields)

    # Validate that every batch field exists on the tool
    inkeys = set(i.id() for i in ins)
    invalid_keys = fields - inkeys
    if len(invalid_keys) > 0:
        raise Exception(
            f"Couldn't create batchtool from fields {', '.join(invalid_keys)} "
            f"as they do not exist on '{tool.id()}'")
    if self.batch.groupby not in inputs:
        raise Exception(
            f"the group_by field '{self.batch.groupby}' was not found in the inputs"
        )

    # Shared (non-batched) inputs, passed identically to every step
    innode_base = {}
    for i in ins:
        if i.id() in fields:
            continue
        default = i.default
        # Selector defaults can't be lifted onto a workflow input; drop them so
        # the tool's own default machinery applies instead.
        if isinstance(default, Selector):
            default = None
        innode_base[i.id()] = w.input(i.id(), i.intype, default=default, doc=i.doc)

    # Group keys must be unique both before and after identifier sanitisation
    raw_groupby_values = inputs[self.batch.groupby]
    duplicate_keys = find_duplicates(raw_groupby_values)
    if len(duplicate_keys) > 0:
        raise Exception(
            f"There are duplicate group_by ({self.batch.groupby}) keys in the input: "
            + ", ".join(duplicate_keys))
    groupby_values = [
        Validators.transform_identifier_to_be_valid(ident)
        for ident in raw_groupby_values
    ]
    duplicate_keys = find_duplicates(groupby_values)
    if len(duplicate_keys) > 0:
        raise Exception(
            f"Janis transformed values in the group_by field ({self.batch.groupby}) to be a valid identifiers, "
            f"after this transformation, there were duplicates keys: "
            + ", ".join(duplicate_keys))

    w.input(self.GROUPBY_FIELDNAME, Array(str), value=groupby_values)

    # One step per group value, with per-group inputs for each batched field
    steps_created = []
    stepid_from_gb = lambda gb: f"{gb}_{tool.id()}"
    for gbvalue in groupby_values:
        extra_ins = {}
        for f in fields:
            newkey = f"{f}_{gbvalue}"
            extra_ins[f] = w.input(newkey, insdict[f].intype)
        steps_created.append(
            w.step(stepid_from_gb(gbvalue), tool(**innode_base, **extra_ins)))

    # Re-expose every tool output once per group
    for out in tool.tool_outputs():
        output_folders = []
        output_name = out.id()
        # Workflows may carry their own output name/folder metadata; honour it.
        if isinstance(tool, WorkflowBase):
            outnode = tool.output_nodes[out.id()]
            output_folders = outnode.output_folder or []
            if outnode.output_name is not None:
                output_name = outnode.output_name
        for idx, gbvalue, raw_gbvalue in zip(range(len(groupby_values)), groupby_values, raw_groupby_values):
            # Evaluate name/folder expressions against this group's input slice
            transformed_inputs = {
                **inputs,
                **{f: inputs[f][idx] for f in fields}
            }
            output_folders_transformed = Operator.evaluate_arg(
                output_folders, transformed_inputs)
            output_name_transformed = Operator.evaluate_arg(
                output_name, transformed_inputs)
            w.output(
                f"{gbvalue}_{out.id()}",
                source=w[stepid_from_gb(gbvalue)][out.id()],
                output_name=output_name_transformed,
                output_folder=[
                    raw_gbvalue,
                    *(output_folders_transformed or [])
                ],
            )
    return w
"samtoolsview", SamToolsView_1_9(sam=w.bwamem.out), ) # Use `gatk4 MarkDuplicates` on the output of samtoolsview # - The output of BWA is query-grouped, providing "queryname" is good enough w.step( "markduplicates", Gatk4MarkDuplicates_4_1_4(bam=w.samtoolsview.out, assumeSortOrder="queryname"), ) # Use `gatk4 SortSam` on the output of markduplicates # - Use the "coordinate" sortOrder w.step("sortsam", Gatk4SortSam_4_1_4( bam=w.markduplicates.out, sortOrder="coordinate", )) # Use `gatk4 SetNmMdAndUqTags` to calculate standard tags for BAM w.step( "fix_tags", Gatk4SetNmMdAndUqTags_4_1_4( bam=w.sortsam.out, reference=w.reference, ), ) # Output our final bam w.output("out_bam", source=w.fix_tags.out)
# NOTE(review): the three methods below belong to an enclosing tool class whose
# header is outside this view (presumably ToolThatAcceptsAndReturnsSecondary,
# given the __main__ block) — confirm against the full file.
    @staticmethod
    def base_command():
        # Placeholder command; this tool exists only to exercise
        # secondary-file handling in translation.
        return "echo"  # non functional tool

    def inputs(self) -> List[ToolInput]:
        # Single input carrying a data type that declares secondary files.
        return [ToolInput("inp", DataTypeWithSecondary())]

    def outputs(self) -> List[ToolOutput]:
        # Echo the same secondary-carrying type back out, globbed from the input.
        return [
            ToolOutput("out", DataTypeWithSecondary(), glob=InputSelector("inp"))
        ]


if __name__ == "__main__":
    # Ad-hoc driver: translate a plain and a scattered use of the tool to WDL.
    w = WorkflowBuilder("test_workflow")
    w.input("inp", DataTypeWithSecondary)
    w.step("stp", ToolThatAcceptsAndReturnsSecondary(inp=w.inp))
    w.output("out", source=w.stp)
    w.translate("wdl")

    w2 = WorkflowBuilder("scattered_test_workflow")
    w2.input("inp", Array(DataTypeWithSecondary), default=["path/to/file.ext"])
    w2.step("stp", ToolThatAcceptsAndReturnsSecondary(inp=w2.inp), scatter="inp")
    w2.output("out", source=w2.stp)
    w2.translate("wdl")
def tool_modifier(self, tool: Tool, inputs: Dict, hints: Dict[str, str]) -> Tool:
    """Wrap `tool` in a workflow that also runs hap.py validation on its VCF outputs.

    Every original output is re-exposed unchanged; for each field in
    self.validation.fields a HapPyValidator step is added, with all of the
    validator's outputs exported under the "validated" folder.
    """
    from janis_bioinformatics.data_types import FastaWithDict, Vcf, Bed
    from janis_bioinformatics.tools.illumina import HapPyValidator_0_3_9

    # Check the requested validation fields exist on the tool and are VCF-like
    failed_outputs, untyped_outputs = ensure_outputs_are_in_workflow_and_are_compatible(
        tool, self.validation.fields, Vcf())
    if len(failed_outputs) > 0:
        raise Exception(
            f"Some outputs for validation were not found in the tool '{tool.id()}': "
            f"{', '.join(failed_outputs)}")
    if len(untyped_outputs) > 0:
        # Not fatal: validation is attempted anyway on loosely-typed outputs
        Logger.critical(
            f"Some outputs for validation from the tool '{tool.id()}' were not "
            f"compatible with VCF: {', '.join(untyped_outputs)}")

    w = WorkflowBuilder(tool.id() + "_validated")

    # Reference data used by the validator
    w.input("validatorReference", FastaWithDict, value=self.validation.reference)
    w.input("validatorTruthVCF", Vcf, value=self.validation.truthVCF)
    w.input("validatorIntervals", Bed(optional=True), value=self.validation.intervals)

    # Mirror every tool input onto the wrapper workflow and run the tool
    inpdict = {
        i.id(): w.input(i.id(), i.intype)
        for i in tool.tool_inputs()
    }
    toolstp = w.step(tool.id(), tool(**inpdict))

    # Re-expose the tool's outputs, keeping workflow output metadata when present
    if isinstance(tool, Workflow):
        wf: Workflow = tool
        for o in wf.output_nodes.values():
            w.output(
                identifier=o.id(),
                source=toolstp[o.id()],
                output_folder=o.output_folder,
                output_name=o.output_name,
            )
    else:
        for o in tool.tool_outputs():
            w.output(identifier=o.id(), source=toolstp[o.id()])

    # One validator step per requested output field
    for o in self.validation.fields:
        sid = "validator_" + o
        valstp = w.step(
            sid,
            HapPyValidator_0_3_9(
                compareVCF=toolstp[o],
                reportPrefix=
                o,  # this will generate an input node with format validator_{o}_reportPrefix
                reference=w.validatorReference,
                truthVCF=w.validatorTruthVCF,
                intervals=w.validatorIntervals,
            ),
        )

        # Connect all the outputs of the validator to an output
        for vo in valstp.tool.outputs():
            w.output(
                f"validated_{o}_{vo.id()}",
                source=valstp[vo.id()],
                output_folder="validated",
            )
    return w
w.step( "applybqsr", Gatk4ApplyBQSR_4_1_4( bam=w.fix_tags.out, reference=w.reference, recalFile=w.baserecalibration.out_recalibration_report, ), ) # Use HaplotypeCaller as our variant caller w.step( "haplotypecaller", Gatk4HaplotypeCaller_4_1_4(bam=w.applybqsr.out_bam, reference=w.reference), ) w.output("out_recalibration_table", source=w.baserecalibration.out_recalibration_report) w.output("out_bam", source=w.applybqsr.out_bam) w.output("out_assembledbam", source=w.haplotypecaller.out_bam) w.output("out_variants", source=w.haplotypecaller.out_vcf) if __name__ == "__main__": import json from janis_core.translations.cwl import CwlTranslator out = CwlTranslator.translate_workflow_to_all_in_one(w).save() with open("/Users/franklinmichael/Desktop/tmp/janis/bcc/vc.json", "w+") as f: json.dump(out, f) # w.translate("cwl", export_path="~/Desktop/tmp/janis/bcc/", to_disk=True)
FastaBwa, glob=InputSelector("reference")), ] def base_command(self): return "echo" if __name__ == "__main__": w = WorkflowBuilder("test_workflow") # EXAMPLE 1 w.input("inp", DataTypeWithSecondary) w.step("stp", ToolThatAcceptsAndReturnsSecondary(inp=w.inp)) w.output("out", source=w.stp) w.translate("wdl") # EXAMPLE 2 w2 = WorkflowBuilder("scattered_test_workflow") w2.input("inp", Array(DataTypeWithSecondary), default=["path/to/file.ext"]) w2.step("stp", ToolThatAcceptsAndReturnsSecondary(inp=w2.inp), scatter="inp") w2.output("out", source=w2.stp) w2.translate("wdl") # EXAMPLE 3 w3 = WorkflowBuilder("scattered_bioinf_complex")
# Module-level script building a simple BWA -> samtools -> SortSam alignment
# workflow. `w` is intentionally module-level; later parts of the file (not
# shown here) may reference it.
from janis_bioinformatics.tools.bwa import BwaMemLatest
from janis_bioinformatics.tools.samtools import SamToolsView_1_9
from janis_bioinformatics.tools.gatk4 import Gatk4SortSam_4_1_2

w = WorkflowBuilder("alignmentWorkflow")

# Inputs
w.input("sample_name", String)
w.input("read_group", String)
w.input("fastq", FastqGzPair)
w.input("reference", FastaWithDict)

# Steps
# Align the FASTQ pair, stamping the provided read-group header line
w.step(
    "bwamem",
    BwaMemLatest(reads=w.fastq, readGroupHeaderLine=w.read_group, reference=w.reference),
)
# Convert SAM output to BAM
w.step("samtoolsview", SamToolsView_1_9(sam=w.bwamem.out))
# Coordinate-sort and index the BAM
w.step(
    "sortsam",
    Gatk4SortSam_4_1_2(bam=w.samtoolsview.out, sortOrder="coordinate", createIndex=True),
)

# Outputs
w.output("out", source=w.sortsam.out)
) w = WorkflowBuilder("alignmentWorkflow") # Inputs w.input("sample_name", String) w.input("read_group", String) w.input("fastq", FastqGzPair) w.input("reference", FastaWithDict) # Steps w.step( "bwamem", BwaMemLatest(reads=w.fastq, readGroupHeaderLine=w.read_group, reference=w.reference), ) w.step("samtoolsview", SamToolsView_1_9(sam=w.bwamem.out)) w.step( "sortsam", Gatk4SortSam_4_1_2(bam=w.samtoolsview.out, sortOrder="coordinate", createIndex=True), ) w.step("markduplicates", Gatk4MarkDuplicates_4_1_4(bam=w.sortsam.out)) # Outputs w.output("out", source=w.markduplicates.out)