Example #1
class Filter(AbstractApplication):

    NAME = "FILTER"
    VERSION = "0.1"
    ASSEMBLY = "hg19_2"

    SPECIES = "HUMAN"
    cli_help = "Filtering output of gridss and delly."
    #cli_options = [options.PAIRS, options.PAIRS_FROM_FILE]
    cli_options = [options.TARGETS]

    application_description = cli_help

    application_results = {
        "svs": {
            "frontend_type": "tsv-file",
            "description": "SVS VCF.",
            "verbose_name": "Somatic SVS VCF",
            "external_link": None,
        }
    }
    dir_path = dirname(realpath(__file__))
    application_settings = {
        "docker_pysam": get_docker_command("danielrbroad/pysamdocker"),
        "script_filtering": "/mnt/efs/myisabl/filtering.py",
        "cores": "1",
    }

    @cached_property
    def _apps(self):
        return {
            "delly": Delly(),
            "gridss": Gridss(),
        }

    @cached_property
    def analyses_dependencies(self):
        return [
            {
                "app": self._apps["gridss"],
                "result": "svs",
                "name": "gridss_vcf"
            },
            {
                "app": self._apps["delly"],
                "result": "svs",
                "name": "delly_vcf"
            },
        ]

    def get_dependencies(self, targets, references, settings):
        inputs = {}
        analyses = []
        for dependency in self.analyses_dependencies:
            input_name = dependency["name"]
            inputs[input_name], key = self.get_result(
                targets[0],
                application_key=dependency["app"].primary_key,
                application_name=dependency["app"].NAME,
                result_key=dependency["result"],
                targets=targets,
                references=references,
            )
            analyses.append(key)
        return analyses, inputs

    def get_experiments_from_cli_options(self, **cli_options):
        return [([i], []) for i in cli_options["targets"]]

    def validate_experiments(self, targets, references):
        self.validate_one_target_no_references(targets, references)

    def get_command(self, analysis, inputs, settings):
        outdir = analysis.storage_url
        out_vcf1 = join(outdir, "filt1.vcf")
        out_vcf2 = join(outdir, "filt2.vcf")
        return " ".join(
            map(
                str,
                [
                    settings.docker_pysam,
                    "python",
                    settings.script_filtering,
                    "-vcf1",
                    inputs["gridss_vcf"],
                    "-vcf2",
                    inputs["delly_vcf"],
                    "-out1",
                    out_vcf1,
                    "-out2",
                    out_vcf2,
                    "&& sudo chown -R ec2-user {}".format(outdir),
                ],
            ))

    def get_analysis_results(self, analysis):
        results = {
            "svs": join(analysis.storage_url, "filt1.vcf"),
            "svs2": join(analysis.storage_url, "filt2.vcf"),
        }

        for i in results.values():
            assert isfile(i), f"Missing result file {i}"
        return results
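
These examples omit their module-level imports. The API they use (AbstractApplication, options.TARGETS and options.PAIRS, get_result, api.patch_instance) matches the isabl_cli SDK; assuming that, plus the cached_property package, a shared import block would look roughly like the sketch below. The module paths for the local helpers and apps are hypothetical guesses about this repository's layout.

# Assumed imports for these application modules; the isabl_cli names follow its
# public API, while the "myapps" paths are hypothetical placeholders.
import csv
import subprocess
import sys
from os.path import dirname, isfile, join, realpath

from cached_property import cached_property
from isabl_cli import AbstractApplication, api, exceptions, options

from myapps import constants, utils           # hypothetical local modules
from myapps.utils import get_docker_command   # hypothetical helper location
from myapps.apps import Delly, Gridss, Merge  # hypothetical local app imports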
Example #2
class Circos(AbstractApplication):

    NAME = "CIRCOS"
    VERSION = "0.1"
    ASSEMBLY = "hg19_2"

    SPECIES = "HUMAN"
    cli_help = "Circos plot."
    cli_options = [options.TARGETS]

    application_description = cli_help

    application_results = {
        "circos_png": {
            "frontend_type": "image",
            "description": "Circos image.",
            "verbose_name": "Circos image",
            "external_link": None,
        }
    }
    application_settings = {
        "docker_pysam": "docker run -it --entrypoint '' -v /mnt/efs/myisabl:/mnt/efs/myisabl danielrbroad/pysamdocker /bin/bash ",
        "make_circos": "/mnt/efs/myisabl/circosplot/make_circos.r",
        "circos_prep": "python /mnt/efs/myisabl/circosplot/circos_prep.py",
        "cores": "1",
        "docker_circos": get_docker_command("danielavarelat/circosr"),
        "docker_py": get_docker_command("danielrbroad/pysamdocker"),
        "bed": "/mnt/efs/myisabl/circosplot/circos_genes.bed",
        "cns": "/mnt/efs/myisabl/circosplot/empty.cns",
    }

    @cached_property
    def _apps(self):
        return {
            "merge": Merge(),
        }

    @cached_property
    def analyses_dependencies(self):
        return [
            {"app": self._apps["merge"], "result": "svs", "name": "merge"},
        ]

    def get_dependencies(self, targets, references, settings):
        inputs = {}
        analyses = []
        for dependency in self.analyses_dependencies:
            input_name = dependency["name"]
            inputs[input_name], key = self.get_result(
                targets[0],
                application_key=dependency["app"].primary_key,
                application_name=dependency["app"].NAME,
                result_key=dependency["result"],
                targets=targets,
                references=references,
            )
            analyses.append(key)
        return analyses, inputs

    def get_experiments_from_cli_options(self, **cli_options):
        return [([i], []) for i in cli_options["targets"]]

    def validate_experiments(self, targets, references):
        self.validate_one_target_no_references(targets, references)
        
    def command_prep(self, analysis, inputs, settings):
        inp = inputs["merge"]
        outdir = analysis.storage_url
        return " ".join(
            map(
                str,
                [
                    settings.docker_py,
                    settings.circos_prep,
                    "-cns",
                    settings.cns,
                    "-vcf",
                    inp,
                    "-o",
                    outdir,
                    "&&",
                ],
            )
        )

    def get_command(self, analysis, inputs, settings):
        cmd = self.command_prep(analysis, inputs, settings)
        outdir = analysis.storage_url
        outcircos = join(outdir, "circos.png")
        circostsv = join(outdir, "circos_svs.tsv")
        circossegs = join(outdir, "segs.csv")
        return " ".join(
            map(
                str,
                [
                    cmd,
                    settings.make_circos,
                    circostsv,
                    settings.bed,
                    circossegs,
                    outcircos,
                    "&& sudo chown -R ec2-user {}".format(outdir),               
                ],
            )
        )

    def get_analysis_results(self, analysis):
        results = {
            "circos_png": join(analysis.storage_url, "circos.png"),
        }

        for i in results.values():
            assert isfile(i), f"Missing result file {i}"
        return results
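
Each app wraps its executables with get_docker_command, which is not shown here. The literal docker_pysam string in the Circos settings above, and the two-argument calls in the QC app below, suggest it returns a docker run prefix that mounts the shared EFS path and optionally appends the executable to invoke inside the container. A minimal sketch under those assumptions (the mount path and flags are guesses, not the real implementation):

# Minimal sketch of get_docker_command, modeled on the literal docker command in
# the Circos settings above; the mount path and flags are assumptions.
def get_docker_command(image, executable=""):
    return (
        "docker run -i --entrypoint '' "
        "-v /mnt/efs/myisabl:/mnt/efs/myisabl "  # assumed shared EFS mount
        f"{image} {executable}"
    ).strip()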
Example #3
class QualityControl(AbstractApplication):

    """See https://quay.io/repository/biocontainers/rna-seqc."""

    NAME = "QC_DATA"
    VERSION = "0.1.0"
    ASSEMBLY = "GRCh37"
    SPECIES = "HUMAN"


    cli_help = "Get Quality Control metrics for NGS data."
    cli_options = [options.TARGETS]
    application_project_level_results = BASE_APPLICATION_RESULTS
    application_individual_level_results = BASE_APPLICATION_RESULTS
    application_results = APPLICATION_RESULTS
    application_description = "Quality Control metrics for NGS data."
    application_settings = {
        "reference": "reference_data_id:genome_fasta",
        "java_args": "-Xmx4g -XX:-UsePerfData",
        # RNA settings
        "gc_gencode": None,
        "fasta_rrna": None,
        "gtf": None,
        # executables
        "multiqc": get_docker_command("ewels/multiqc:v1.5", "multiqc"),
        "picard": get_docker_command("leukgen/docker-picard"),
        "fastqc": get_docker_command("biocontainers/fastqc:v0.11.5", "fastqc"),
        "rna_seqc": get_docker_command(
            "quay.io/biocontainers/rna-seqc:1.1.8--2", "rna-seqc"
        ),
    }

    def get_experiments_from_cli_options(self, **cli_options):
        return [([i], []) for i in cli_options["targets"]]

    def validate_experiments(self, targets, references):
        self.validate_bams(targets + references)
        self.validate_one_target_no_references(targets, references)

        if targets[0].technique.category == "DNA":
            self.validate_bedfiles(targets + references)

        # msk specific validation
        for i in targets[0]["raw_data"]:
            if i["file_type"].startswith("FASTQ") and i["file_url"]:
                assert not i["file_url"].startswith(
                    "/warm"
                ), "fastq in /warm, can't process"

    def validate_settings(self, settings):
        self.validate_reference_genome(settings.reference)

    def get_command(self, analysis, inputs, settings):
        target = analysis["targets"][0]
        outdir = analysis["storage_url"]
        system_id = target["system_id"]
        fastqc_dir = join(outdir, "fastqc")
        picard_dir = join(outdir, "picard")
        multiqc_dir = join(outdir, "multiqc")
        rna_seqc_dir = join(outdir, "rna_seqc")
        commands = [f"mkdir -p {multiqc_dir}"]
        bampath = self.get_bam(target)

        # build fastqc commands; it is possible some samples don't have FASTQ files
        fastqc_cmds = []
        fastqc_input = join(fastqc_dir, f"{system_id}.fastq.gz")

        for i in target["raw_data"]:
            if i["file_type"].startswith("FASTQ"):
                fastq = i["file_url"]

                if fastq.endswith(".gz"):
                    # append so multiple FASTQ files are concatenated into one input
                    fastqc_cmds.append(f"cat {fastq} >> {fastqc_input}")
                else:
                    fastqc_cmds.append(f"gzip -c {fastq} >> {fastqc_input}")

        if fastqc_cmds:
            commands += [f"mkdir -p {fastqc_dir}"] + fastqc_cmds
            commands += [f"{settings.fastqc} -o {fastqc_dir} {fastqc_input}"]
            commands += [f"rm {fastqc_input}"]

        # run picard for DNA data, bedfiles can't have a chr prefix
        if target["technique"]["category"] == "DNA":
            commands.append(f"mkdir -p {picard_dir}")
            picard_cmd = f'{settings.picard} -j "{settings.java_args}" '
            picard_kwargs = dict(
                bampath=bampath,
                reference=settings.reference,
                outbase=join(picard_dir, system_id),
                bedfile=self.get_bedfile(target),
            )

            for i in PICARD_BASE_COMMANDS:
                commands.append(picard_cmd + i.format(**picard_kwargs))

            if target["technique"]["method"] in ["WE", "TD"]:
                for i in PICARD_TARGETED_COMMANDS:
                    commands.append(picard_cmd + i.format(**picard_kwargs))

        # run RNA-SeQC for RNA data
        if target["technique"]["category"] == "RNA":
            if not settings.gtf or not settings.fasta_rrna:
                raise exceptions.ConfigurationError(
                    "Settings 'gtf' and 'fasta_rrna' must be set"
                )

            commands.append("mkdir -p {0}".format(rna_seqc_dir))
            strat_args = ""

            if settings.gc_gencode:
                strat_args = f"-strat gc -gc {settings.gc_gencode}"

            commands.append(
                f"{settings.rna_seqc} {settings.java_args} "
                f"-o {join(rna_seqc_dir, system_id)} "
                f"-r {settings.reference} "
                f"-t {settings.gtf} "
                f"-BWArRNA {settings.fasta_rrna} "
                f"-s \"'{system_id}|{bampath}|NA'\" "
                f"{strat_args}"
            )

        # run multiqc on current sample
        commands.append(f"{settings.multiqc} -f -p -o {multiqc_dir} {outdir}")

        return " && ".join(commands)

    def get_analysis_results(self, analysis):
        target = analysis["targets"][0]
        outdir = analysis["storage_url"]
        multiqc = join(outdir, "multiqc")
        multiqc_data = join(multiqc, "multiqc_data")

        results = {
            "multiqc_html": join(multiqc, "multiqc_report.html"),
            "multiqc_data": join(multiqc_data, "multiqc_data.json"),
            "multiqc_stats": join(multiqc_data, "multiqc_general_stats.txt"),
            "read_length": None,
        }

        for key, i in results.items():
            if key == "multiqc_data":
                continue
            assert i is None or isfile(i), f"Missing result {i}"

        if target["technique"]["category"] == "DNA":
            read_length_column = "MEAN_READ_LENGTH"
            read_length_path = "multiqc_picard_AlignmentSummaryMetrics.txt"
            read_length_path = join(multiqc_data, read_length_path)
        else:
            read_length_column = "Read Length"
            read_length_path = join(multiqc_data, "multiqc_rna_seqc.txt")

        with open(read_length_path) as f:
            row = next(csv.DictReader(f, delimiter="\t"))
            results["read_length"] = float(row[read_length_column])

            if "read_length" in target:
                api.patch_instance(
                    endpoint="experiments",
                    instance_id=target["pk"],
                    read_length=results["read_length"],
                )

        return results

    def validate_project_analyses(self, project, analyses):
        assert (
            len(analyses) <= 500
        ), "Project level QC only valid for projects with fewer than 500 samples"

    def get_project_analysis_results(self, analysis):
        return self.get_merged_results(analysis)

    def get_individual_analysis_results(self, analysis):
        return self.get_merged_results(analysis)

    def merge_project_analyses(self, analysis, analyses):
        return self.merge_analyses(
            analysis,
            analyses,
            f"QC report for project {analysis['project_level_analysis']['pk']} "
            f"created using {len(analyses)} samples.",
        )

    def merge_individual_analyses(self, analysis, analyses):
        return self.merge_analyses(
            analysis,
            analyses,
            f"QC report for individual {analysis['individual_level_analysis']['pk']} "
            f"created using {len(analyses)} samples.",
        )

    def get_merged_results(self, analysis):
        outdir = analysis["storage_url"]
        multiqc_data = join(outdir, "multiqc_data")
        stats = join(multiqc_data, "multiqc_general_stats.txt")
        results = {
            "multiqc_html": join(outdir, "multiqc_report.html"),
            "multiqc_data": join(multiqc_data, "multiqc_data.json"),
            # the general stats table is optional in merged reports
            "multiqc_stats": stats if isfile(stats) else None,
        }

        for key, path in results.items():
            if key != "multiqc_stats":
                assert isfile(path), f"Missing result {path}"

        return results

    def merge_analyses(self, analysis, analyses, comment):
        subprocess.check_call(
            [i for i in self.settings.multiqc.split(" ") if i]
            + [
                "--comment",
                comment,
                "--outdir",
                analysis["storage_url"],
                "--data-dir",
                "--force",
            ]
            + [i["storage_url"] for i in analyses],
            stdout=sys.stdout,
            stderr=sys.stderr,
        )
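
PICARD_BASE_COMMANDS and PICARD_TARGETED_COMMANDS are constants defined outside this snippet; each entry is a Picard invocation template formatted with bampath, reference, outbase and bedfile, and the MultiQC parsing above expects a multiqc_picard_AlignmentSummaryMetrics.txt table, so CollectAlignmentSummaryMetrics is presumably among them. A hypothetical sketch of their shape (the tool list and output names are assumptions):

# Hypothetical shape of the Picard command templates used by QualityControl;
# the real constants live elsewhere and may list different tools and outputs.
PICARD_BASE_COMMANDS = [
    "CollectAlignmentSummaryMetrics I={bampath} R={reference} "
    "O={outbase}.alignment_summary_metrics",
    "CollectInsertSizeMetrics I={bampath} "
    "O={outbase}.insert_size_metrics H={outbase}.insert_size_histogram.pdf",
]

PICARD_TARGETED_COMMANDS = [
    # targeted methods (WE, TD) additionally get hybrid-selection metrics
    "CollectHsMetrics I={bampath} R={reference} "
    "BAIT_INTERVALS={bedfile} TARGET_INTERVALS={bedfile} O={outbase}.hs_metrics",
]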
Example #4
class BwaMem(AbstractApplication):
    NAME = "BWA_MEM"
    #VERSION = "0.7.17-r1188"
    VERSION = "1"
    #ASSEMBLY = "GRCh38"
    ASSEMBLY = "hg19_2"

    SPECIES = "HUMAN"
    URL = "https://github.com/cancerit/PCAP-core/wiki/Scripts-Reference-implementations"

    cli_help = "Align DNA data with bwa-mem."
    cli_options = [options.TARGETS]
    application_description = constants.APPLICATION_DESCRIPTION
    application_results = constants.APPLICATION_RESULTS
    application_settings = {
        "cores": {
            "WG": 32,
            "WE": 32,
            "TD": 32
        },
        "reference": "reference_data_id:genome_fasta",
        "bwa_mem_pl": get_docker_command("leukgen/docker-pcapcore:v0.1.1"),
    }

    def get_experiments_from_cli_options(self, **cli_options):
        return [([i], []) for i in cli_options["targets"]]

    def validate_experiments(self, targets, references):
        self.validate_dna_only(targets + references)
        self.validate_single_data_type(targets + references)
        self.validate_one_target_no_references(targets, references)
        self.validate_methods(targets, ["WG", "WE", "TD"])
        #assert not targets[0].is_pdx, "Use Disambiguate for PDX experiments"

    def validate_settings(self, settings):
        self.validate_reference_genome(settings.reference)

    def get_command(self, analysis, inputs, settings):
        #sequencing_data = analysis.targets[0].sequencing_data
        return self.get_bwa_mem_command(analysis, inputs, settings)

    def get_bwa_mem_command(self, analysis, inputs, settings):
        target = analysis["targets"][0]
        method = target["technique"]["method"]
        sample_name = target["system_id"]
        outdir = analysis["storage_url"]
        groupinfo, sequencing_data = utils.write_groupinfo(
            target, outdir, sample_name)
        command = [
            settings.bwa_mem_pl,
            "-fragment",
            10,
            "-reference",
            settings.reference,
            "-threads",
            settings.cores[method],
            "-map_threads",
            settings.cores[method],
            "-sample",
            sample_name,
            "-outdir",
            outdir,
        ]

        if target.technique["custom_fields"].get("nomarkdup"):
            command += ["-nomarkdup"]

        if groupinfo:
            command += ["-groupinfo", groupinfo]

        return (" ".join(map(str, command + sequencing_data)) +
                f" && sudo chown -R ec2-user {outdir}"
                # make sure index is older than bam
                + f" && touch {outdir}/{sample_name}.bam.bai")

    def get_analysis_results(self, analysis):
        results = utils.get_bwa_mem_pl_results(analysis)

        self.update_experiment_bam_file(
            experiment=analysis["targets"][0],
            bam_url=results["bam"],
            analysis_pk=analysis["pk"],
        )
        return results
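
BwaMem delegates to two helpers that are not shown: utils.write_groupinfo, which appears to write an optional read-group description file and return it together with the raw sequencing files to align, and utils.get_bwa_mem_pl_results. Since get_command touches {outdir}/{sample}.bam.bai and get_analysis_results reads results["bam"], the latter presumably maps the bwa_mem.pl outputs to result keys, roughly as sketched below (file names and keys are assumptions):

# Hypothetical sketch of utils.get_bwa_mem_pl_results; the real helper is not
# shown and may register additional results (logs, duplicate-marking metrics).
from os.path import isfile, join

def get_bwa_mem_pl_results(analysis):
    outdir = analysis["storage_url"]
    sample = analysis["targets"][0]["system_id"]
    results = {
        "bam": join(outdir, f"{sample}.bam"),
        "bam_bai": join(outdir, f"{sample}.bam.bai"),
    }
    for path in results.values():
        assert isfile(path), f"Missing result file {path}"
    return results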
Example #5
class Gridss(AbstractApplication):

    NAME = "GRIDSS"
    VERSION = "2.2.2"
    #ASSEMBLY = "GRCh38"
    ASSEMBLY = "hg19_2"

    SPECIES = "HUMAN"
    cli_help = "Find structural variants with GRIDSS."
    cli_options = [options.PAIRS, options.PAIRS_FROM_FILE]
    application_description = cli_help

    application_results = {
        "svs": {
            "frontend_type": "tsv-file",
            "description": "GRIDSS somatic SVS VCF.",
            "verbose_name": "Somatic SVS VCF",
            "external_link": None,
        },
        "assembly_bam": {
            "frontend_type": "igv_bam:assembly_bam_bai",
            "description": "Gridss Assembled Bam",
            "verbose_name": "Assembly Bam",
            "external_link": None,
        },
        "assembly_bam_bai": {
            "frontend_type": None,
            "description": "Gridss Assembled Bam Index",
            "verbose_name": "Assembly Bam Index",
            "external_link": None,
        },
    }
    application_settings = {
        "config": "/mnt/efs/myisabl/config.txt",
        "blacklist":
        "/mnt/efs/myisabl/wgEncodeDacMapabilityConsensusExcludable.bed",
        "gridss": get_docker_command("papaemmelab/docker-gridss"),
        "reference": "reference_data_id:genome_fasta",
        "cores": "1",
    }

    def get_experiments_from_cli_options(self, **cli_options):
        return cli_options["pairs"] + cli_options["pairs_from_file"]

    def validate_experiments(self, targets, references):
        self.validate_bams(targets + references)
        self.validate_dna_pairs(targets, references)
        self.validate_same_technique(targets, references)
        #self.validate_methods(targets + references, ["WG"])

    def validate_settings(self, settings):
        self.validate_reference_genome(settings.reference)

    def get_command(self, analysis, inputs, settings):
        outdir = analysis.storage_url
        tumor = analysis.targets[0]
        normal = analysis.references[0]
        #x="/home/danielavt/cli2/myapps/myapps/apps/gridss/config.txt"
        return (
            f"cd {outdir} && "
            f"{settings.gridss} "
            f"CONFIGURATION_FILE={settings.config} "
            f"WORKING_DIR={outdir} "
            f"REFERENCE_SEQUENCE={settings.reference} "
            f"INPUT={self.get_bam(normal)} "
            f"INPUT_LABEL={normal.system_id} "
            f"INPUT={self.get_bam(tumor)} "
            f"INPUT_LABEL={tumor.system_id} "
            f"OUTPUT={join(outdir, 'somatic.sv.vcf')} "
            f"ASSEMBLY={join(outdir, 'somatic.gridss.assembly.bam')} "
            f"BLACKLIST={settings.blacklist} "
            f"WORKER_THREADS={settings.cores} "
            f"&& sudo chown -R ec2-user {outdir} "
            f"&& rm -rf {join(outdir, f'{normal.system_id}.bam.gridss.working')} "
            f"&& rm -rf {join(outdir, f'{tumor.system_id}.bam.gridss.working')}"
        )

    def get_analysis_results(self, analysis):
        results = {
            "svs":
            join(analysis.storage_url, "somatic.sv.vcf"),
            "assembly_bam":
            join(analysis.storage_url, "somatic.gridss.assembly.bam"),
            "assembly_bam_bai":
            join(analysis.storage_url, "somatic.gridss.assembly.bai"),
        }

        for i in results.values():
            assert isfile(i), f"Missing result file {i}"

        return results
Example #6
class Delly(AbstractApplication):

    NAME = "DELLY"
    VERSION = "2"
    ASSEMBLY = "hg19_2"

    #ASSEMBLY = "GRCh38"
    SPECIES = "HUMAN"
    cli_help = "Find structural variants with DELLY."
    cli_options = [options.PAIRS, options.PAIRS_FROM_FILE]
    application_description = cli_help

    application_results = {
        "svs": {
            "frontend_type": "tsv-file",
            "description": "DELLY somatic SVS VCF.",
            "verbose_name": "Somatic SVS VCF",
            "external_link": None,
        }
    }
    application_settings = {
        "delly": get_docker_command("dellytools/delly"),
        "bcftools": get_docker_command("dceoy/bcftools"),
        "reference": "reference_data_id:genome_fasta",
        "cores": "2",
    }

    def get_experiments_from_cli_options(self, **cli_options):
        return cli_options["pairs"] + cli_options["pairs_from_file"]

    def validate_experiments(self, targets, references):
        self.validate_bams(targets + references)
        self.validate_same_technique(targets, references)

    def validate_settings(self, settings):
        self.validate_reference_genome(settings.reference)

    def get_command(self, analysis, inputs, settings):
        out_bcf = join(analysis.storage_url, "delly.bcf")
        out_vcf = join(analysis.storage_url, "delly.vcf")

        target = analysis.targets[0]
        reference = analysis.references[0]
        command = [
            settings.delly,
            "delly",
            "call",
            "-o",
            out_bcf,
            "-g",
            settings.reference,
            self.get_bam(reference),
            self.get_bam(target),
        ]
        command2 = [
            settings.bcftools,
            "view",
            out_bcf,
            ">",
            out_vcf,
        ]
        cmd1 = " ".join(command)
        cmd2 = " ".join(command2)
        return (cmd1 + f" && sudo touch {out_vcf}" +
                f" && sudo chown -R ec2-user {out_vcf}" + f" && {cmd2}")

    def get_analysis_results(self, analysis):
        results = {
            "svs": join(analysis.storage_url, "delly.vcf"),
        }

        for i in results.values():
            assert isfile(i), f"Missing result file {i}"
        return results
Example #7
class Svaba(AbstractApplication):
    #ASSEMBLY = "GRCh37"
    #SPECIES = "HUMAN"
    NAME = "SVABA"
    VERSION = "0.2.1"

    cli_help = "Find structural variants with Svaba."
    cli_options = [options.PAIRS, options.PAIRS_FROM_FILE]
    application_description = cli_help
    application_results = {
        "svs": {
            "frontend_type": "tsv-file",
            "description": "Svaba somatic SVS VCF.",
            "verbose_name": "Somatic SVS VCF",
            "external_link": None,
        }
    }
    application_settings = {
        "svab": get_docker_command("papaemmelab/docker-svaba:v1.0.0"),
        "reference": "reference_data_id:genome_fasta",
        "cores": "16",
    }

    def get_experiments_from_cli_options(self, **cli_options):
        return cli_options["pairs"] + cli_options["pairs_from_file"]

    def validate_experiments(self, targets, references):
        #self.validate_dna_pairs(targets, references)
        self.validate_same_technique(targets, references)

    def validate_settings(self, settings):
        self.validate_reference_genome(settings.reference)

    def get_command(self, analysis, inputs, settings):
        target = analysis.targets[0]
        reference = analysis.references[0]
        command = [
            settings.svab,
            "run",
            "-z",
            "-a",
            target.system_id,
            "-G",
            settings.reference,
            "-t",
            self.get_bam(target),
            "-n",
            self.get_bam(reference),
            "-p",
            settings.cores,
        ]
        return " ".join(map(str, command))

    def get_analysis_results(self, analysis):
        results = {
            "svs":
            join(
                analysis["storage_url"],
                analysis["targets"][0]["system_id"] +
                ".svaba.somatic.sv.vcf.gz",
            )
        }
        for i in results.values():
            assert isfile(i), f"Missing result file {i}"

        return results
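
The analysis and settings objects handed to get_command expose attribute access (analysis.targets, settings.reference) in some apps and dict access in others. A hypothetical smoke test for the Svaba command above illustrates the expected shapes using SimpleNamespace stand-ins and a stubbed get_bam; none of the stub paths or IDs are real data.

# Hypothetical smoke test for Svaba.get_command; the stubs only mimic the
# attribute access the method performs, and all values below are made up.
from types import SimpleNamespace

def test_svaba_command_shape():
    app_stub = SimpleNamespace(
        get_bam=lambda experiment: f"/data/{experiment.system_id}.bam",  # stubbed BAM lookup
    )
    analysis = SimpleNamespace(
        targets=[SimpleNamespace(system_id="TUMOR_1")],
        references=[SimpleNamespace(system_id="NORMAL_1")],
    )
    settings = SimpleNamespace(
        svab="docker run papaemmelab/docker-svaba:v1.0.0",
        reference="/ref/genome.fasta",
        cores="16",
    )
    command = Svaba.get_command(app_stub, analysis, inputs={}, settings=settings)
    assert "-t /data/TUMOR_1.bam" in command
    assert "-n /data/NORMAL_1.bam" in command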