class Filter(AbstractApplication):

    """Filter the structural-variant VCFs produced by GRIDSS and DELLY."""

    NAME = "FILTER"
    VERSION = "0.1"
    ASSEMBLY = "hg19_2"
    SPECIES = "HUMAN"

    cli_help = "Filtering output of gridss and delly."
    cli_options = [options.TARGETS]
    application_description = cli_help

    # Both filtered VCFs are declared here because get_analysis_results
    # returns "svs" AND "svs2"; previously only "svs" was registered.
    application_results = {
        "svs": {
            "frontend_type": "tsv-file",
            "description": "SVS VCF.",
            "verbose_name": "Somatic SVS VCF",
            "external_link": None,
        },
        "svs2": {
            "frontend_type": "tsv-file",
            "description": "Second filtered SVS VCF.",
            "verbose_name": "Somatic SVS VCF (filter 2)",
            "external_link": None,
        },
    }

    dir_path = dirname(realpath(__file__))

    application_settings = {
        "docker_pysam": get_docker_command("danielrbroad/pysamdocker"),
        "script_filtering": "/mnt/efs/myisabl/filtering.py",
        "cores": "1",
    }

    @cached_property
    def _apps(self):
        """Upstream applications whose 'svs' results are consumed."""
        return {
            "delly": Delly(),
            "gridss": Gridss(),
        }

    @cached_property
    def analyses_dependencies(self):
        """Map each upstream app's 'svs' result to a named input."""
        return [
            {"app": self._apps["gridss"], "result": "svs", "name": "gridss_vcf"},
            {"app": self._apps["delly"], "result": "svs", "name": "delly_vcf"},
        ]

    def get_dependencies(self, targets, references, settings):
        """Resolve upstream analyses and their result files.

        Returns a tuple of (list of analysis keys, dict of inputs keyed
        by the dependency name declared in `analyses_dependencies`).
        """
        inputs = {}
        analyses = []

        for dependency in self.analyses_dependencies:
            input_name = dependency["name"]
            inputs[input_name], key = self.get_result(
                targets[0],
                application_key=dependency["app"].primary_key,
                application_name=dependency["app"].NAME,
                result_key=dependency["result"],
                targets=targets,
                references=references,
            )
            analyses.append(key)

        return analyses, inputs

    def get_experiments_from_cli_options(self, **cli_options):
        """Run one analysis per target, with no references."""
        return [([i], []) for i in cli_options["targets"]]

    def validate_experiments(self, targets, references):
        self.validate_one_target_no_references(targets, references)

    def get_command(self, analysis, inputs, settings):
        """Build the dockerized filtering command producing two VCFs."""
        outdir = analysis.storage_url
        out1 = join(outdir, "filt1.vcf")
        out2 = join(outdir, "filt2.vcf")
        return " ".join(
            map(
                str,
                [
                    settings.docker_pysam,
                    "python",
                    settings.script_filtering,
                    "-vcf1",
                    inputs["gridss_vcf"],
                    "-vcf2",
                    inputs["delly_vcf"],
                    "-out1",
                    out1,
                    "-out2",
                    out2,
                    # reclaim ownership of files written by the container
                    "&& sudo chown -R ec2-user {}".format(outdir),
                ],
            )
        )

    def get_analysis_results(self, analysis):
        """Return both filtered VCFs, asserting each file exists."""
        results = {
            "svs": join(analysis.storage_url, "filt1.vcf"),
            "svs2": join(analysis.storage_url, "filt2.vcf"),
        }
        for i in results.values():
            assert isfile(i), f"Missing result file {i}"
        return results
class Circos(AbstractApplication):

    """Generate a circos plot from merged structural-variant calls."""

    NAME = "CIRCOS"
    VERSION = "0.1"
    ASSEMBLY = "hg19_2"
    SPECIES = "HUMAN"

    cli_help = "Circos plot."
    cli_options = [options.TARGETS]
    application_description = cli_help

    application_results = {
        "circos_png": {
            "frontend_type": "image",
            "description": "Circos image.",
            "verbose_name": "Circos image",
            "external_link": None,
        }
    }

    application_settings = {
        "docker_pysam": "docker run -it --entrypoint '' -v /mnt/efs/myisabl:/mnt/efs/myisabl danielrbroad/pysamdocker /bin/bash ",
        "make_circos": "/mnt/efs/myisabl/circosplot/make_circos.r",
        "circos_prep": "python /mnt/efs/myisabl/circosplot/circos_prep.py",
        "cores": "1",
        "docker_circos": get_docker_command("danielavarelat/circosr"),
        "docker_py": get_docker_command("danielrbroad/pysamdocker"),
        "bed": "/mnt/efs/myisabl/circosplot/circos_genes.bed",
        "cns": "/mnt/efs/myisabl/circosplot/empty.cns",
    }

    @cached_property
    def _apps(self):
        """Upstream application providing the merged SV VCF."""
        return {
            "merge": Merge(),
        }

    @cached_property
    def analyses_dependencies(self):
        """Consume the 'svs' result of the Merge application."""
        return [
            {"app": self._apps["merge"], "result": "svs", "name": "merge"},
        ]

    def get_dependencies(self, targets, references, settings):
        """Resolve the upstream Merge analysis and its result file."""
        inputs = {}
        analyses = []

        for dependency in self.analyses_dependencies:
            input_name = dependency["name"]
            inputs[input_name], key = self.get_result(
                targets[0],
                application_key=dependency["app"].primary_key,
                application_name=dependency["app"].NAME,
                result_key=dependency["result"],
                targets=targets,
                references=references,
            )
            analyses.append(key)

        return analyses, inputs

    def get_experiments_from_cli_options(self, **cli_options):
        """Run one analysis per target, with no references."""
        return [([i], []) for i in cli_options["targets"]]

    def validate_experiments(self, targets, references):
        self.validate_one_target_no_references(targets, references)

    def command_prep(self, analysis, inputs, settings):
        """Build the prep step that converts the merged VCF into circos inputs.

        The returned string ends with '&&' so the plotting command can be
        appended directly in get_command.
        """
        inp = inputs["merge"]
        outdir = analysis.storage_url
        return " ".join(
            map(
                str,
                [
                    settings.docker_py,
                    settings.circos_prep,
                    "-cns",
                    settings.cns,
                    "-vcf",
                    inp,
                    "-o",
                    outdir,
                    " && ",
                ],
            )
        )

    def get_command(self, analysis, inputs, settings):
        """Chain the prep step with the R plotting script."""
        cmd = self.command_prep(analysis, inputs, settings)
        outdir = analysis.storage_url
        outcircos = join(outdir, "circos.png")
        circostsv = join(outdir, "circos_svs.tsv")
        # NOTE: was "segs.csv " (trailing space); harmless in the shell but
        # removed for cleanliness.
        circossegs = join(outdir, "segs.csv")
        return " ".join(
            map(
                str,
                [
                    cmd,
                    settings.make_circos,
                    circostsv,
                    settings.bed,
                    circossegs,
                    outcircos,
                    # reclaim ownership of files written by the container
                    "&& sudo chown -R ec2-user {}".format(outdir),
                ],
            )
        )

    def get_analysis_results(self, analysis):
        """Return the circos image produced by get_command.

        Previously this returned {"svs": merged.vcf} — a copy-paste from the
        Merge app; merged.vcf is not written to this analysis directory and
        "svs" is not a declared result of this application.
        """
        results = {
            "circos_png": join(analysis.storage_url, "circos.png"),
        }
        for i in results.values():
            assert isfile(i), f"Missing result file {i}"
        return results
class QualityControl(AbstractApplication):

    """See https://quay.io/repository/biocontainers/rna-seqc."""

    NAME = "QC_DATA"
    VERSION = "0.1.0"
    ASSEMBLY = "GRCh37"
    SPECIES = "HUMAN"

    cli_help = "Get Quality Control metrics for NGS data."
    cli_options = [options.TARGETS]
    application_project_level_results = BASE_APPLICATION_RESULTS
    application_individual_level_results = BASE_APPLICATION_RESULTS
    application_results = APPLICATION_RESULTS
    application_description = "Quality Control metrics for NGS data."

    application_settings = {
        "reference": "reference_data_id:genome_fasta",
        "java_args": "-Xmx4g -XX:-UsePerfData",
        # RNA settings
        "gc_gencode": None,
        "fasta_rrna": None,
        "gtf": None,
        # executables
        "multiqc": get_docker_command("ewels/multiqc:v1.5", "multiqc"),
        "picard": get_docker_command("leukgen/docker-picard"),
        "fastqc": get_docker_command("biocontainers/fastqc:v0.11.5", "fastqc"),
        "rna_seqc": get_docker_command(
            "quay.io/biocontainers/rna-seqc:1.1.8--2", "rna-seqc"
        ),
    }

    def get_experiments_from_cli_options(self, **cli_options):
        """Run one analysis per target, with no references."""
        return [([i], []) for i in cli_options["targets"]]

    def validate_experiments(self, targets, references):
        """Check bams, pairing, bedfiles (DNA only) and fastq locations."""
        self.validate_bams(targets + references)
        self.validate_one_target_no_references(targets, references)

        if targets[0].technique.category == "DNA":
            self.validate_bedfiles(targets + references)

        # msk specific validation
        for i in targets[0]["raw_data"]:
            if i["file_type"].startswith("FASTQ") and i["file_url"]:
                assert not i["file_url"].startswith(
                    "/warm"
                ), "fastq in /warm, cant process"

    def validate_settings(self, settings):
        self.validate_reference_genome(settings.reference)

    def get_command(self, analysis, inputs, settings):
        """Build the QC pipeline: fastqc, picard (DNA), RNA-SeQC (RNA), multiqc.

        Returns a single '&&'-joined shell command string.
        """
        target = analysis["targets"][0]
        outdir = analysis["storage_url"]
        system_id = target["system_id"]
        fastqc_dir = join(outdir, "fastqc")
        picard_dir = join(outdir, "picard")
        multiqc_dir = join(outdir, "multiqc")
        rna_seqc_dir = join(outdir, "rna_seqc")
        commands = [f"mkdir -p {multiqc_dir}"]
        bampath = self.get_bam(target)

        # build fastqc command, it is possible some samples don't have fastq
        fastqc_cmds = []
        fastqc_input = join(fastqc_dir, f"{system_id}.fastq.gz")

        for i in target["raw_data"]:
            if i["file_type"].startswith("FASTQ"):
                fastq = i["file_url"]
                if fastq.endswith(".gz"):
                    # append (>>), not truncate (>): with multiple .gz fastqs
                    # the previous '>' kept only the last file; concatenated
                    # gzip members are themselves valid gzip.
                    fastqc_cmds.append(f"cat {fastq} >> {fastqc_input}")
                else:
                    fastqc_cmds.append(f"gzip -c {fastq} >> {fastqc_input}")

        if fastqc_cmds:
            commands += [f"mkdir -p {fastqc_dir}"] + fastqc_cmds
            commands += [f"{settings.fastqc} -o {fastqc_dir} {fastqc_input}"]
            commands += [f"rm {fastqc_input}"]

        # run picard for DNA data, bedfiles can't have a chr prefix
        if target["technique"]["category"] == "DNA":
            commands.append(f"mkdir -p {picard_dir}")
            picard_cmd = f'{settings.picard} -j "{settings.java_args}" '
            picard_kwargs = dict(
                bampath=bampath,
                reference=settings.reference,
                outbase=join(picard_dir, system_id),
                bedfile=self.get_bedfile(target),
            )

            for i in PICARD_BASE_COMMANDS:
                commands.append(picard_cmd + i.format(**picard_kwargs))

            # targeted metrics only make sense for exome / targeted panels
            if target["technique"]["method"] in ["WE", "TD"]:
                for i in PICARD_TARGETED_COMMANDS:
                    commands.append(picard_cmd + i.format(**picard_kwargs))

        # run RNA-SeQC for RNA data
        if target["technique"]["category"] == "RNA":
            if not settings.gtf or not settings.fasta_rrna:
                # message previously said 'fastq_rrna'; the setting is
                # 'fasta_rrna'
                raise exceptions.ConfigurationError(
                    "Settings 'gtf' and 'fasta_rrna' must be set"
                )

            commands.append("mkdir -p {0}".format(rna_seqc_dir))
            strat_args = ""

            if settings.gc_gencode:
                strat_args = f"-strat gc -gc {settings.gc_gencode}"

            commands.append(
                f"{settings.rna_seqc} {settings.java_args} "
                f"-o {join(rna_seqc_dir, system_id)} "
                f"-r {settings.reference} "
                f"-t {settings.gtf} "
                f"-BWArRNA {settings.fasta_rrna} "
                f"-s \"'{system_id}|{bampath}|NA'\" "
                f"{strat_args}"
            )

        # run multiqc on current sample
        commands.append(f"{settings.multiqc} -f -p -o {multiqc_dir} {outdir}")
        return " && ".join(commands)

    def get_analysis_results(self, analysis):
        """Collect multiqc outputs and extract the sample's read length.

        Also patches the experiment's read_length via the API when the
        target exposes that field.
        """
        target = analysis["targets"][0]
        outdir = analysis["storage_url"]
        multiqc = join(outdir, "multiqc")
        multiqc_data = join(multiqc, "multiqc_data")
        results = {
            "multiqc_html": join(multiqc, "multiqc_report.html"),
            "multiqc_data": join(multiqc_data, "multiqc_data.json"),
            "multiqc_stats": join(multiqc_data, "multiqc_general_stats.txt"),
            "read_length": None,
        }

        for key, i in results.items():
            # multiqc_data json is deliberately not asserted (original
            # behavior) — presumably it may be absent; TODO confirm
            if key == "multiqc_data":
                continue
            assert i is None or isfile(i), f"Missing result {i}"

        if target["technique"]["category"] == "DNA":
            read_length_column = "MEAN_READ_LENGTH"
            read_length_path = join(
                multiqc_data, "multiqc_picard_AlignmentSummaryMetrics.txt"
            )
        else:
            read_length_column = "Read Length"
            read_length_path = join(multiqc_data, "multiqc_rna_seqc.txt")

        with open(read_length_path) as f:
            row = next(csv.DictReader(f, delimiter="\t"))
            results["read_length"] = float(row[read_length_column])

        if "read_length" in target:
            api.patch_instance(
                endpoint="experiments",
                instance_id=target["pk"],
                read_length=results["read_length"],
            )

        return results

    def validate_project_analyses(self, project, analyses):
        assert (
            len(analyses) <= 500
        ), "Project level QC only valid for projects with less than 500 samples"

    def get_project_analysis_results(self, analysis):
        return self.get_merged_results(analysis)

    def get_individual_analysis_results(self, analysis):
        return self.get_merged_results(analysis)

    def merge_project_analyses(self, analysis, analyses):
        """Produce a project-level multiqc report across samples."""
        return self.merge_analyses(
            analysis,
            analyses,
            f"QC report for project {analysis['project_level_analysis']['pk']} "
            f"created using {len(analyses)} samples.",
        )

    def merge_individual_analyses(self, analysis, analyses):
        """Produce an individual-level multiqc report across samples."""
        return self.merge_analyses(
            analysis,
            analyses,
            f"QC report for individual {analysis['individual_level_analysis']['pk']} "
            f"created using {len(analyses)} samples.",
        )

    def get_merged_results(self, analysis):
        """Collect merged multiqc outputs; the stats table is optional.

        The previous implementation initialised multiqc_stats to None and
        then tested `elif j and isfile(j)` against that None, so the stats
        path was never populated — dead branch fixed here.
        """
        outdir = analysis["storage_url"]
        multiqc_data = join(outdir, "multiqc_data")
        stats = join(multiqc_data, "multiqc_general_stats.txt")
        results = {
            "multiqc_html": join(outdir, "multiqc_report.html"),
            "multiqc_data": join(multiqc_data, "multiqc_data.json"),
            "multiqc_stats": stats if isfile(stats) else None,
        }

        for key, path in results.items():
            if key != "multiqc_stats":
                assert isfile(path), f"Missing result {path}"

        return results

    def merge_analyses(self, analysis, analyses, comment):
        """Run multiqc over all sample analyses, writing into `analysis`."""
        subprocess.check_call(
            [i for i in self.settings.multiqc.split(" ") if i]
            + [
                "--comment",
                comment,
                "--outdir",
                analysis["storage_url"],
                "--data-dir",
                "--force",
            ]
            + [i["storage_url"] for i in analyses],
            stdout=sys.stdout,
            stderr=sys.stderr,
        )
class BwaMem(AbstractApplication):

    """Align DNA data with bwa-mem via the PCAP-core bwa_mem.pl wrapper."""

    NAME = "BWA_MEM"
    VERSION = "1"
    ASSEMBLY = "hg19_2"
    SPECIES = "HUMAN"
    URL = "https://github.com/cancerit/PCAP-core/wiki/Scripts-Reference-implementations"

    cli_help = "Align DNA data with bwa-mem."
    cli_options = [options.TARGETS]
    application_description = constants.APPLICATION_DESCRIPTION
    application_results = constants.APPLICATION_RESULTS

    application_settings = {
        # cores per sequencing method
        "cores": {"WG": 32, "WE": 32, "TD": 32},
        "reference": "reference_data_id:genome_fasta",
        "bwa_mem_pl": get_docker_command("leukgen/docker-pcapcore:v0.1.1"),
    }

    def get_experiments_from_cli_options(self, **cli_options):
        """Run one alignment per target, with no references."""
        return [([i], []) for i in cli_options["targets"]]

    def validate_experiments(self, targets, references):
        self.validate_dna_only(targets + references)
        self.validate_single_data_type(targets + references)
        self.validate_one_target_no_references(targets, references)
        self.validate_methods(targets, ["WG", "WE", "TD"])

    def validate_settings(self, settings):
        self.validate_reference_genome(settings.reference)

    def get_command(self, analysis, inputs, settings):
        return self.get_bwa_mem_command(analysis, inputs, settings)

    def get_bwa_mem_command(self, analysis, inputs, settings):
        """Build the bwa_mem.pl command line for the analysis target."""
        target = analysis["targets"][0]
        method = target["technique"]["method"]
        sample_name = target["system_id"]
        outdir = analysis["storage_url"]
        groupinfo, sequencing_data = utils.write_groupinfo(
            target, outdir, sample_name)

        command = [
            settings.bwa_mem_pl,
            "-fragment", 10,
            "-reference", settings.reference,
            "-threads", settings.cores[method],
            "-map_threads", settings.cores[method],
            "-sample", sample_name,
            "-outdir", outdir,
        ]

        if target.technique["custom_fields"].get("nomarkdup"):
            command += ["-nomarkdup"]

        if groupinfo:
            command += ["-groupinfo", groupinfo]

        return (
            " ".join(map(str, command + sequencing_data))
            + f" && sudo chown -R ec2-user {outdir}"
            # refresh the index mtime so the .bai is NEWER than the bam
            # (the original comment had this backwards)
            + f" && touch {outdir}/{sample_name}.bam.bai"
        )

    def get_analysis_results(self, analysis):
        """Collect bwa_mem.pl outputs and register the bam with the target."""
        results = utils.get_bwa_mem_pl_results(analysis)
        self.update_experiment_bam_file(
            experiment=analysis["targets"][0],
            bam_url=results["bam"],
            analysis_pk=analysis["pk"],
        )
        return results
class Gridss(AbstractApplication):

    """Find somatic structural variants with GRIDSS on tumor/normal pairs."""

    NAME = "GRIDSS"
    VERSION = "2.2.2"
    ASSEMBLY = "hg19_2"
    SPECIES = "HUMAN"

    cli_help = "Find structural variants with GRIDSS."
    cli_options = [options.PAIRS, options.PAIRS_FROM_FILE]
    application_description = cli_help

    application_results = {
        "svs": {
            "frontend_type": "tsv-file",
            "description": "GRIDSS somatic SVS VCF.",
            "verbose_name": "Somatic SVS VCF",
            "external_link": None,
        },
        "assembly_bam": {
            "frontend_type": "igv_bam:assembly_bam_bai",
            "description": "Gridss Assembled Bam",
            "verbose_name": "Assembly Bam",
            "external_link": None,
        },
        "assembly_bam_bai": {
            "frontend_type": None,
            "description": "Gridss Assembled Bam Index",
            "verbose_name": "Assembly Bam Index",
            "external_link": None,
        },
    }

    application_settings = {
        "config": "/mnt/efs/myisabl/config.txt",
        "blacklist": "/mnt/efs/myisabl/wgEncodeDacMapabilityConsensusExcludable.bed",
        "gridss": get_docker_command("papaemmelab/docker-gridss"),
        "reference": "reference_data_id:genome_fasta",
        "cores": "1",
    }

    def get_experiments_from_cli_options(self, **cli_options):
        """Collect tumor/normal pairs from CLI or from a pairs file."""
        return cli_options["pairs"] + cli_options["pairs_from_file"]

    def validate_experiments(self, targets, references):
        self.validate_bams(targets + references)
        self.validate_dna_pairs(targets, references)
        self.validate_same_technique(targets, references)

    def validate_settings(self, settings):
        self.validate_reference_genome(settings.reference)

    def get_command(self, analysis, inputs, settings):
        """Build the GRIDSS call; normal is labeled first, then tumor.

        The per-input .gridss.working scratch directories are removed on
        success to save space.
        """
        outdir = analysis.storage_url
        tumor = analysis.targets[0]
        normal = analysis.references[0]
        return (
            f"cd {outdir} && "
            f"{settings.gridss} "
            f"CONFIGURATION_FILE={settings.config} "
            f"WORKING_DIR={outdir} "
            f"REFERENCE_SEQUENCE={settings.reference} "
            f"INPUT={self.get_bam(normal)} "
            f"INPUT_LABEL={normal.system_id} "
            f"INPUT={self.get_bam(tumor)} "
            f"INPUT_LABEL={tumor.system_id} "
            f"OUTPUT={join(outdir, 'somatic.sv.vcf')} "
            f"ASSEMBLY={join(outdir, 'somatic.gridss.assembly.bam')} "
            f"BLACKLIST={settings.blacklist} "
            f"WORKER_THREADS={settings.cores} "
            # space before && was missing; the shell tolerated it but it
            # made the command hard to read
            f"&& sudo chown -R ec2-user {outdir} "
            f"&& rm -rf {join(outdir, f'{normal.system_id}.bam.gridss.working')} "
            f"&& rm -rf {join(outdir, f'{tumor.system_id}.bam.gridss.working')}"
        )

    def get_analysis_results(self, analysis):
        """Return the somatic VCF plus the assembly bam and its index."""
        results = {
            "svs": join(analysis.storage_url, "somatic.sv.vcf"),
            "assembly_bam": join(analysis.storage_url, "somatic.gridss.assembly.bam"),
            "assembly_bam_bai": join(analysis.storage_url, "somatic.gridss.assembly.bai"),
        }
        for i in results.values():
            assert isfile(i), f"Missing result file {i}"
        return results
class Delly(AbstractApplication):

    """Find somatic structural variants with DELLY on tumor/normal pairs."""

    NAME = "DELLY"
    VERSION = "2"
    ASSEMBLY = "hg19_2"
    SPECIES = "HUMAN"

    # was "Find structural variants with GRIDSS." — copy-paste from Gridss
    cli_help = "Find structural variants with DELLY."
    cli_options = [options.PAIRS, options.PAIRS_FROM_FILE]
    application_description = cli_help

    application_results = {
        "svs": {
            "frontend_type": "tsv-file",
            "description": "DELLY somatic SVS VCF.",
            "verbose_name": "Somatic SVS VCF",
            "external_link": None,
        }
    }

    application_settings = {
        "delly": get_docker_command("dellytools/delly"),
        "bcftools": get_docker_command("dceoy/bcftools"),
        "reference": "reference_data_id:genome_fasta",
        "cores": "2",
    }

    def get_experiments_from_cli_options(self, **cli_options):
        """Collect tumor/normal pairs from CLI or from a pairs file."""
        return cli_options["pairs"] + cli_options["pairs_from_file"]

    def validate_experiments(self, targets, references):
        self.validate_bams(targets + references)
        self.validate_same_technique(targets, references)

    def validate_settings(self, settings):
        self.validate_reference_genome(settings.reference)

    def get_command(self, analysis, inputs, settings):
        """Run delly call, then convert the BCF output to VCF with bcftools.

        The VCF is touched and chowned before the bcftools redirect so the
        shell can write to it — presumably the container writes as root;
        TODO confirm.
        """
        out_bcf = join(analysis.storage_url, "delly.bcf")
        out_vcf = join(analysis.storage_url, "delly.vcf")
        target = analysis.targets[0]
        reference = analysis.references[0]

        call_cmd = " ".join(
            [
                settings.delly,
                "delly",
                "call",
                "-o",
                out_bcf,
                "-g",
                settings.reference,
                self.get_bam(reference),
                self.get_bam(target),
            ]
        )
        convert_cmd = " ".join(
            [
                settings.bcftools,
                "view",
                out_bcf,
                ">",
                out_vcf,
            ]
        )
        return (
            call_cmd
            + f" && sudo touch {out_vcf}"
            + f" && sudo chown -R ec2-user {out_vcf}"
            + f" && {convert_cmd}"
        )

    def get_analysis_results(self, analysis):
        """Return the converted VCF, asserting it exists."""
        results = {
            "svs": join(analysis.storage_url, "delly.vcf"),
        }
        for i in results.values():
            assert isfile(i), f"Missing result file {i}"
        return results
class Svaba(AbstractApplication):

    """Find somatic structural variants with SvABA on tumor/normal pairs."""

    NAME = "SVABA"
    VERSION = "0.2.1"

    cli_help = "Find structural variants with Svaba."
    cli_options = [options.PAIRS, options.PAIRS_FROM_FILE]
    application_description = cli_help

    application_results = {
        "svs": {
            "frontend_type": "tsv-file",
            "description": "Svaba somatic SVS VCF.",
            "verbose_name": "Somatic SVS VCF",
            "external_link": None,
        }
    }

    application_settings = {
        # NOTE(review): key is spelled "svab" — kept as-is since renaming
        # would break existing settings overrides
        "svab": get_docker_command("papaemmelab/docker-svaba:v1.0.0"),
        "reference": "reference_data_id:genome_fasta",
        "cores": "16",
    }

    def get_experiments_from_cli_options(self, **cli_options):
        """Collect tumor/normal pairs from CLI or from a pairs file."""
        return cli_options["pairs"] + cli_options["pairs_from_file"]

    def validate_experiments(self, targets, references):
        self.validate_same_technique(targets, references)

    def validate_settings(self, settings):
        self.validate_reference_genome(settings.reference)

    def get_command(self, analysis, inputs, settings):
        """Build the svaba run command; output is prefixed with the tumor id."""
        target = analysis.targets[0]
        reference = analysis.references[0]
        command = [
            settings.svab,
            "run",
            "-z",
            "-a",
            target.system_id,
            "-G",
            settings.reference,
            "-t",
            self.get_bam(target),
            "-n",
            self.get_bam(reference),
            "-p",
            settings.cores,
        ]
        return " ".join(command)

    def get_analysis_results(self, analysis):
        """Return the somatic SV VCF, asserting it exists.

        The assertion message matches the other SV applications in this
        file (the original assert had no message).
        """
        results = {
            "svs": join(
                analysis["storage_url"],
                analysis["targets"][0]["system_id"] + ".svaba.somatic.sv.vcf.gz",
            )
        }
        for i in results.values():
            assert isfile(i), f"Missing result file {i}"
        return results