Python Pipeline 예제들, daisy.Pipeline Python 예제들

예제 #1

0

파일 보기

파일: VariantCallers.py 프로젝트: AndreasHegerGenomics/cgat-bench

    def run(self, outfile, params):
        """Run the caller once per variant type and merge the results.

        For every entry of the comma-separated ``params.variant_types`` a
        shell statement is built that runs ``params.path``, compresses the
        resulting VCF with bgzip and indexes it with tabix. The per-type
        files are then concatenated, sorted, compressed into `outfile`
        and indexed.

        Arguments
        ---------
        outfile : string
            Name of the merged output file; also used as prefix for the
            per-type ``.vcf``/``.vcf.gz``/``.log`` files.
        params : object
            Reads ``bam``, ``variant_types``, ``path``, ``options``,
            ``path_vcf_concat`` and ``path_vcf_sort``.

        Returns
        -------
        list
            The values returned by ``P.run`` for the per-type statements
            followed by the value returned for the merge statement.
        """

        bam = resolve_argument(params.bam, sep=" ")
        reference_fasta = get_reference(params)

        stments, retvals = [], []
        variant_types = [x.strip() for x in params.variant_types.split(",")]

        # The templates below are filled from locals(), so the local
        # variable names in this method are part of the command template.
        for variant_type in variant_types:
            stments.append(
                "{params.path} "
                "--type {variant_type} "
                "--genome {reference_fasta} "
                "--outfile {outfile}.{variant_type}.vcf "
                "{params.options} "
                "{bam} "
                ">& {outfile}.{variant_type}.log; "
                "bgzip -f {outfile}.{variant_type}.vcf; "
                "tabix -f -p vcf {outfile}.{variant_type}.vcf.gz".format(
                    **locals()))

        # NOTE(review): extend() assumes P.run returns an iterable when
        # given a list of statements - confirm against P.run.
        retvals.extend(P.run(stments))

        # Merge the per-type compressed VCFs into the final output.
        vcf_files = " ".join(
            [outfile + "." + x + ".vcf.gz" for x in variant_types])
        retvals.append(
            P.run("{params.path_vcf_concat} "
                  "{vcf_files} "
                  "| {params.path_vcf_sort} "
                  "| bgzip "
                  "> {outfile}; "
                  "tabix -fp vcf {outfile}".format(**locals())))

        return retvals

예제 #2

0

파일 보기

def check_unique(tool_functions,
                 input_combos=None,
                 input_regex=None,
                 input_alias=None,
                 is_test=False):
    """Report whether the generated task names contain duplicates.

    Each tool function is copied and has its input registered (once per
    input combination if `input_combos` is given, otherwise once without
    input files). The ``__name__`` of every copy is collected.

    Returns
    -------
    bool
        True if at least one task name occurs more than once, in which
        case a debug message is logged for the first duplicate found.
        False otherwise.
    """

    def _registered_name(tool, *input_args):
        # work on a copy so that the caller's tool function is left alone
        clone = copy.copy(tool)
        clone.register_input(*input_args,
                             regex=input_regex,
                             alias=input_alias,
                             is_test=is_test)
        return clone.__name__

    if input_combos:
        task_names = [
            _registered_name(tool, combo)
            for tool, combo in itertools.product(tool_functions,
                                                 input_combos)]
    else:
        task_names = [_registered_name(tool) for tool in tool_functions]

    for task_name, occurrences in collections.Counter(task_names).items():
        if occurrences > 1:
            P.get_logger().debug(
                "adding hash identifier because of duplicate name: {}={}".format(task_name, occurrences))
            return True

    return False

예제 #3

0

파일 보기

    def run(self, infiles, outfile, params):
        """Merge several compressed VCF files into one sorted, indexed file.

        The header of the first input file (cut to the first ten columns)
        is written to `outfile`. All non-header records that do not
        have ``<NON_REF>`` in column 5 are then stripped of their
        per-record information, sorted by chromosome and position,
        de-duplicated and appended. The result is indexed with tabix.

        Arguments
        ---------
        infiles : list
            Input VCF files; the first one supplies the header.
        outfile : string
            Output filename, also used as prefix for the log files.
        params : object
            Reads ``job_threads``.

        Returns
        -------
        The value returned by ``P.run`` (previously the value was computed
        but dropped, so callers always received ``None``).
        """

        files = " ".join(infiles)

        # Used in the template below (sort --parallel).
        # NOTE(review): P.run may additionally pick up job_threads from
        # this frame - confirm before renaming.
        job_threads = params.job_threads

        # todo:
        # 1. add header.
        # 2. do batch+merge sort in order to avoid hitting temporary space limits.
        # 3. remove unnecessary info fields while sorting, add them later.

        # temporary directory for sort; created and removed by the statement
        tmpdir = P.get_temp_filename()
        return P.run(
            "mkdir {tmpdir}; "
            "bcftools view -h {infiles[0]} "
            "| cut -f 1-10 "
            "| bgzip > {outfile}; "
            "zcat {files} "
            "| awk -v OFS='\\t' "
            "'!/^#/ && $5 != \"<NON_REF>\" "
            "{{$8=\".\";$9=\".\";$6=\".\";$7=\"GT\";$10=\".\"; print}}' "
            "2> {outfile}.filter.log "
            "| sort -k1,1V -k2,2n "
            "--parallel {job_threads} "
            "-T {tmpdir} "
            "2> {outfile}.sort.log "
            "| uniq "
            "| bgzip "
            ">> {outfile}; "
            "tabix -p vcf {outfile}; "
            "rm -rf {tmpdir} ".format(**locals()))

예제 #4

0

파일 보기

파일: GATKRunner.py 프로젝트: AndreasHegerGenomics/cgat-bench

    def run_statements(self, stmnts, **kwargs):
        """Join a list of statements and run them as a single job.

        Empty statements are dropped, the remainder is combined through
        ``P.join_statements`` and the resulting main and post statements
        are chained with ``;``. The number of threads is derived from the
        ``--num_threads`` and ``--num_cpu_threads_per_data_thread``
        options found in the statement (the larger of the two, at least 1).

        Arguments
        ---------
        stmnts : list
            Shell statements; falsy entries are ignored.
        kwargs :
            Passed on to ``P.run``.

        Returns
        -------
        The value returned by ``P.run``.
        """

        stmnts = [x for x in stmnts if x]

        _filename, main_statement, post_statement = P.join_statements(
            stmnts, infile=None)

        stmnt = " ; ".join([x for x in [main_statement, post_statement] if x])

        # Raw strings: "\s" and "\d" are invalid escape sequences in a
        # normal string literal and trigger warnings on recent Pythons.
        job_threads1, job_threads2 = 1, 1
        match = re.search(r"--num_threads\s*(\d+)", stmnt)
        if match:
            job_threads1 = int(match.group(1))

        match = re.search(r"--num_cpu_threads_per_data_thread\s*(\d+)", stmnt)
        if match:
            job_threads2 = int(match.group(1))

        # NOTE(review): job_threads is not passed explicitly; presumably
        # P.run reads it from this frame - confirm before renaming/removing.
        job_threads = max(job_threads1, job_threads2)
        return P.run(stmnt, **kwargs)

예제 #5

0

파일 보기

파일: AssemblyTools.py 프로젝트: AndreasHegerGenomics/cgat-bench

    def run(self, outfile, params):
        """Run a canu assembly and move the contigs to `outfile`.

        The canu command line is assembled from ``self.path``, the cluster
        settings returned by ``P.get_parameters_as_namedtuple()`` and the
        tool parameters. After canu finishes, ``canu.contigs.fasta`` in
        the output directory is moved to `outfile`.

        Arguments
        ---------
        outfile : string
            Output filename; its directory is used as canu working
            directory and ``{outfile}.log`` receives canu's output.
        params : object
            Reads ``genome_size``, ``path_java``, ``options``,
            ``assembly_mode`` and ``fasta``.

        Returns
        -------
        The value returned by ``P.run``.
        """

        # the current PATH is forwarded to the grid jobs via "-v PATH=..."
        path = os.environ["PATH"]
        gp = P.get_parameters_as_namedtuple()
        cluster_queue = gp.cluster["queue"]
        cluster_memory_resource = gp.cluster["memory_resource"]
        cluster_parallel_environment = gp.cluster["parallel_environment"]
        outdir = os.path.dirname(outfile)
        # the grid job name is the name of the output directory
        outname = os.path.basename(outdir)
        # -sync y forces qsub to wait until job completes before
        # continuing.

        # The template is filled from locals(), so the local variable
        # names above are part of the command template.
        statement = (
            "{self.path} "
            "-p canu "
            "-d {outdir} "
            "-genomeSize={params.genome_size} "
            "gridOptionsJobName={outname} "
            "java={params.path_java} "
            "gridOptions=\"-q {cluster_queue} -v PATH={path} -sync y \" "
            "gridEngineMemoryOption=\"-l {cluster_memory_resource}=MEMORY\" "
            "gridEngineThreadsOption=\"-pe {cluster_parallel_environment} THREADS\" "
            "{params.options} "
            "{params.assembly_mode} "
            "{params.fasta} "
            ">& {outfile}.log; "
            "mv {outdir}/canu.contigs.fasta {outfile}".format(**locals()))

        # NOTE(review): without_cluster=True presumably runs the canu
        # driver locally while canu submits its own grid jobs - confirm.
        return P.run(statement, without_cluster=True)

예제 #6

0

파일 보기

    def run(self, infile, outfile, params):
        """Intersect a test BED file with a reference BED file and count.

        Both files are standardised through ``standardise_bed_files`` into
        temporary files, then three bgzip-compressed files are written:
        ``{outfile}.shared.bed.gz`` (test intervals overlapping truth),
        ``{outfile}.unique_test.bed.gz`` and
        ``{outfile}.unique_truth.bed.gz``. Every
        ``{outfile}.{section}.bed.gz`` for the sections in
        ``self.sections`` is indexed with tabix, and the number of
        intervals per section is written as a table to `outfile`.

        Arguments
        ---------
        infile : string
            BED file with the test intervals.
        outfile : string
            Output table; also prefix of the per-section BED files.
        params : object
            Reads ``reference_bed`` and ``path``.

        Returns
        -------
        The value returned by ``P.run``.

        Raises
        ------
        ValueError
            If ``params.reference_bed`` is not set.
        """

        if params.reference_bed is None:
            raise ValueError("{} requires reference_bed to be set".format(
                self.name))

        # requires a consistent sort order, so sort both files.
        # It also requires the chromosome content to be identical,
        # so restrict output to common sets.
        tmpf = P.get_temp_filename(clear=True)

        tmpf_test, tmpf_truth = tmpf + "_a.bed.gz", tmpf + "_b.bed.gz"
        stmnt = standardise_bed_files(tmpf_test, tmpf_truth, infile,
                                      params.reference_bed)

        # These templates are filled from locals() in one go further
        # down, so the local names used in them must not be renamed.
        statements = [stmnt]
        statements.append("{params.path} intersect "
                          "-a {tmpf_test} "
                          "-b {tmpf_truth} "
                          "-wa "
                          "| bgzip "
                          "> {outfile}.shared.bed.gz")
        statements.append("{params.path} intersect "
                          "-a {tmpf_test} "
                          "-b {tmpf_truth} "
                          "-wa -v"
                          "| bgzip "
                          "> {outfile}.unique_test.bed.gz")
        statements.append("{params.path} intersect "
                          "-b {tmpf_test} "
                          "-a {tmpf_truth} "
                          "-wa -v"
                          "| bgzip "
                          "> {outfile}.unique_truth.bed.gz")
        statements.append("rm -f {tmpf_test} {tmpf_truth}")

        for section in self.sections:
            statements.append(
                "tabix -p bed {outfile}.{section}.bed.gz".format(**locals()))

        statement = "; ".join(statements)
        retval = P.run(statement.format(**locals()))

        # these are small files, so doing it here. Implement tabix.count()
        # method
        counts = dict()
        for section in self.sections:
            inf = pysam.Tabixfile(outfile + "." + section + ".bed.gz")
            try:
                # count lazily instead of materialising every record
                counts[section] = sum(1 for _ in inf.fetch())
            finally:
                # always release the handle, even if fetch() fails
                inf.close()

        with IOTools.open_file(outfile, "w") as outf:
            outf.write("section\tcounts\n")
            outf.write("\n".join(
                ["\t".join(map(str, x)) for x in counts.items()]) + "\n")

        return retval

예제 #7

0

파일 보기

파일: BAMTools.py 프로젝트: AndreasHegerGenomics/cgat-bench

    def run(self, outfile, params):
        """Map reads with the aligner at ``self.path`` and sort the output.

        The aligner output is converted to BAM, sorted with samtools into
        `outfile` and indexed. Depending on the number of FASTQ files,
        reads are passed as unpaired (``-U``) or paired (``-1``/``-2``).

        Arguments
        ---------
        outfile : string
            Output BAM file; also prefix for the log files.
        params : namedtuple
            Reads ``options``, ``fastq``, ``index`` (if present, otherwise
            ``reference_fasta``), ``set_readgroup`` and
            ``readgroup_id_regex``. All fields are forwarded to ``P.run``
            as keyword arguments.

        Returns
        -------
        The value returned by ``P.run``.
        """

        # NOTE(review): job_threads is only used implicitly; presumably
        # P.run reads it from this frame - confirm before renaming.
        if "--threads" in params.options or "-t " in params.options:
            job_threads = int(re.search(r"(-t|--threads)\s*(\d+)",
                                        params.options).groups()[1])

        fastq = resolve_argument(params.fastq, ",").split(",")
        if len(fastq) == 1:
            # format the filename, not the list: the list would render
            # as -U "['reads.fastq']"
            fastq = '-U "{}"'.format(fastq[0])
        else:
            fastq = '-1 "{}" -2 "{}"'.format(*fastq)

        tmpdir = P.get_temp_filename(clear=True)

        if "index" in params._fields:
            index = params.index
        else:
            index = params.reference_fasta

        if params.set_readgroup or params.readgroup_id_regex is not None:
            readgroup_string, readgroup_id, readgroup_sample = build_readgroup_string(
                outfile, params)

            readgroup_option = "--rg-id {}".format(readgroup_id)

            # The ID is passed via --rg-id, so remove "@RG\tID:...\t" and
            # pass every remaining tab-separated field as its own --rg.
            readgroup_string = re.sub(r"@RG\tID:\S+\t", "", readgroup_string)
            readgroup_string = " ".join(["--rg {}".format(x)
                                         for x in readgroup_string.split("\t")])
        else:
            readgroup_option = ""
            readgroup_string = ""

        # The template is filled from locals(), so the local variable
        # names above are part of the command template.
        return P.run(
            "mkdir {tmpdir}; "
            "{self.path} "
            "{readgroup_option} "
            "{readgroup_string} "
            "{params.options} "
            "-x {index} "
            "{fastq} "
            "2> {outfile}.log "
            "| samtools view -b /dev/stdin "
            "2> {outfile}.view.log "
            "| samtools sort -T {tmpdir} -O bam /dev/stdin "
            "2> {outfile}.sort.log "
            "> {outfile}; "
            "samtools index {outfile}; "
            "rm -rf {tmpdir}".format(**locals()),
            **params._asdict())

예제 #8

0

파일 보기

    def run(self, infile, outfile, params):
        """Intersect BED files and test the sections for enrichment.

        First runs ``run_metric_bedtools_intersection.run`` with the same
        arguments. Its count table is then moved to
        ``{outfile}.bedtools_intersect_and_annotate_counts.tsv``. For each
        section in ``self.sections`` a temporary copy of
        ``{outfile}.{section}.bed.gz`` is written in which column 4 is
        replaced by the section name. These copies are passed as segment
        files to ``params.gat_path``, whose output goes to
        ``{outfile}.bedtools_intersect_and_annotate_enrichment.tsv`` and
        whose log goes to `outfile`.

        Arguments
        ---------
        infile : string
            Passed through to the intersection metric.
        outfile : string
            Output filename and prefix of the result files.
        params : object
            Reads ``annotations_bed``, ``workspace_bed``, ``gat_path``
            and ``options``.

        Returns
        -------
        list
            The return value of the intersection run followed by the
            value returned by ``P.run`` for the enrichment statement.

        Raises
        ------
        ValueError
            If ``annotations_bed`` or ``workspace_bed`` is not set.
        """

        if params.annotations_bed is None:
            raise ValueError("{} requires annotations_bed to be set".format(
                self.name))

        if params.workspace_bed is None:
            raise ValueError("{} requires workspace_bed to be set".format(
                self.name))

        retval = run_metric_bedtools_intersection.run(self, infile, outfile,
                                                      params)
        retvals = [retval]

        # free up `outfile`, which is re-used as log file further down
        statements = [
            "mv {outfile} {outfile}.bedtools_intersect_and_annotate_counts.tsv"
            .format(**locals())
        ]
        bed_files = []
        for section in self.sections:
            tmpf = P.get_temp_filename(clear=True) + "-" + section + ".gz"
            # label every interval with its section name in column 4
            statements.append(
                "zcat {outfile}.{section}.bed.gz "
                "| awk -v OFS='\\t' '{{ $4 = \"{section}\"; print }}' "
                "| bgzip > {tmpf}".format(**locals()))

            bed_files.append(tmpf)

        segment_files = " ".join(
            ["--segment-bed-file={}".format(x) for x in bed_files])

        statements.append(
            "{params.gat_path} "
            "{segment_files} "
            "--with-segment-tracks "
            "--annotation-bed-file={params.annotations_bed} "
            "--workspace-bed-file={params.workspace_bed} "
            "--log={outfile} "
            "{params.options} "
            "> {outfile}.bedtools_intersect_and_annotate_enrichment.tsv".
            format(**locals()))

        # remove the temporary per-section files
        for f in bed_files:
            statements.append("rm -f {}".format(f))

        statement = "; ".join(statements)
        retvals.append(P.run(statement))

        return retvals

예제 #9

0

파일 보기

파일: BAMTools.py 프로젝트: AndreasHegerGenomics/cgat-bench

    def run(self, infiles, outfile, params):
        """Merge BAM files into `outfile` and index the result.

        If a read group is requested (``params.set_readgroup`` or
        ``params.readgroup_id_regex``), a header file with the read group
        is written to ``{outfile}.header.sam``, the merged stream is
        rewritten so that only the ``@RG`` line containing the read group
        id is kept and every ``RG:Z:`` tag is set to that id. Otherwise
        the files are merged directly into `outfile`.

        The job thread count is 1, replaced by the ``--threads`` value in
        ``params.options`` if given, plus the ``--threads`` value in
        ``params.view_options`` if given.

        Arguments
        ---------
        infiles : list
            BAM files to merge.
        outfile : string
            Output BAM file; also prefix for log and header files.
        params : object
            Reads ``options``, ``view_options``, ``path``,
            ``set_readgroup`` and ``readgroup_id_regex``.

        Returns
        -------
        The value returned by ``P.run``.
        """

        files = " ".join(infiles)
        job_threads = 1
        # Raw strings: "\s" and "\d" are invalid escape sequences in a
        # normal string literal and trigger warnings on recent Pythons.
        if "--threads" in params.options:
            job_threads = int(re.search(r"--threads[= ]\s*(\d+)",
                                        params.options).groups()[0])

        if "--threads" in params.view_options:
            job_threads += int(re.search(r"--threads[= ]\s*(\d+)",
                                         params.view_options).groups()[0])

        if params.set_readgroup or params.readgroup_id_regex is not None:
            readgroup_string, readgroup_id, readgroup_sample = build_readgroup_string(
                outfile, params)

            with open(outfile + ".header.sam", "w") as outf:
                outf.write(readgroup_string + "\n")

            # "\\@" and "\\S" spell out the backslashes that the shell
            # command needs; the resulting command text is unchanged.
            retval = P.run(
                "{params.path} merge "
                "{params.options} "
                "-f "
                "-h {outfile}.header.sam "
                "-r "
                "- "
                "{files} "
                "2> {outfile}.log "
                "| samtools view -h - "
                "| perl -p -e 's/^.*\\n// if (/^\\@RG/ && !/{readgroup_id}/); "
                "   s/RG:Z:\\S+/RG:Z:{readgroup_id}/' "
                "| samtools view -bS "
                "{params.view_options} "
                "- "
                "> {outfile}; "
                "samtools index {outfile} 2> {outfile}.index.log"
                .format(**locals()), job_threads=job_threads)

        else:
            retval = P.run(
                "{params.path} merge "
                "-f "
                "{params.options} "
                "{outfile} "
                "{files} "
                "2> {outfile}.log; "
                "samtools index {outfile} 2> {outfile}.index.log"
                .format(**locals()), job_threads=job_threads)
        return retval

예제 #10

0

파일 보기

    def run(self, infiles, outfiles, params):
        """Intersect two VCF files and distribute the four result files.

        Runs ``{params.path} isec`` on the two input files with the
        output directory as prefix, then hands the four numbered result
        files and their ``.tbi`` indices to ``self.distribute_results``,
        paired with the names in ``self.output``.

        Arguments
        ---------
        infiles : list
            Exactly two VCF files.
        outfiles : string or list
            Either a glob-like expression (string) or a list of output
            filenames from which the output directory is derived.
        params : object
            Reads ``path`` and ``options``.

        Returns
        -------
        The value returned by ``P.run``.

        Raises
        ------
        ValueError
            If the number of input files is not two.
        """

        if len(infiles) != 2:
            raise ValueError(
                "expected 2 VCF files, received {}".format(infiles))
        vcf = " ".join(infiles)

        # A string means the files are not known to ruffus, so expect a
        # glob expression such as \2.dir/*.dir/*.vcf.gz
        outdir = (os.path.dirname(os.path.dirname(outfiles))
                  if isinstance(outfiles, str)
                  else os.path.commonprefix(outfiles))

        outfile = os.path.join(outdir, "result.log")

        # the template is filled from locals(); keep the names used in it
        statement = ("{params.path} isec "
                     "{params.options} "
                     "--output-type z "
                     "--prefix {outdir} "
                     "{vcf} "
                     "&> {outfile} ").format(**locals())
        retval = P.run(statement)

        numbers = range(4)

        vcf_names = ["000{}.vcf.gz".format(x) for x in numbers]
        self.distribute_results(outdir, list(zip(vcf_names, self.output)))

        index_names = ["000{}.vcf.gz.tbi".format(x) for x in numbers]
        index_targets = [x + ".tbi" for x in self.output]
        self.distribute_results(outdir,
                                list(zip(index_names, index_targets)))

        return retval

예제 #11

0

파일 보기

파일: Runner.py 프로젝트: AndreasHegerGenomics/cgat-bench

    def distribute_results(self, workdir, pairs, statement=None):
        """Move or transform task results into their output locations.

        Arguments
        ---------
        workdir : string
            Directory that both the input and the output names of each
            pair are relative to.
        pairs : list
            Tuples of (input filename, output filename).
        statement : string
            Optional command template, filled from the local variables
            (e.g. ``{infile}``, ``{outfile}``) for every pair. When
            omitted, each input file is simply moved to its output name.

        Returns
        -------
        The value returned by ``P.run`` if statements were collected,
        otherwise None.

        Raises
        ------
        ValueError
            If an input file does not exist.
        """
        statements = []
        for infile, outfile in pairs:
            infile, outfile = (os.path.join(workdir, infile),
                               os.path.join(workdir, outfile))

            if not os.path.exists(infile):
                raise ValueError(
                    "expected file {} does not exist".format(infile))

            # make sure that the destination directory is present
            if not os.path.exists(os.path.dirname(outfile)):
                os.makedirs(os.path.dirname(outfile))

            if statement is not None:
                statements.append(statement.format(**locals()))
                continue

            shutil.move(infile, outfile)

        return P.run(statements) if statements else None

예제 #12

0

파일 보기

    def run(self, infiles, outfile, params):
        """Run somatic variant detection on a two-sample VCF file.

        The VCF header must list exactly two samples; the first one is
        passed to the tool as normal sample (``-n``), the second one as
        tumour sample (``-t``).

        Arguments
        ---------
        infiles : string, list or tuple
            A single VCF filename, or a collection holding one.
        outfile : string
            Output filename; ``{outfile}.log`` receives stderr.
        params : object
            Reads ``path`` and ``options``.

        Returns
        -------
        The value returned by ``P.run``.

        Raises
        ------
        NotImplementedError
            If more than one input file is given.
        ValueError
            If the VCF does not contain exactly two samples.
        """

        if isinstance(infiles, (list, tuple)):
            if len(infiles) > 1:
                raise NotImplementedError(
                    "collated somatic variant detection of multiple VCF files not implemented"
                )
            infile = infiles[0]
        else:
            infile = infiles

        # only the sample names are needed from the file
        with pysam.VariantFile(infile) as inf:
            samples = list(inf.header.samples)

        if len(samples) != 2:
            raise ValueError(
                "expected only two samples in VCF, got {}: {}".format(
                    len(samples), ",".join(samples)))
        normal_sample_id, tumour_sample_id = samples

        # the template is filled from locals(); keep the names used in it
        return P.run(("{params.path} "
                      "{params.options} "
                      "-i {infile} "
                      "-o {outfile} "
                      "-n {normal_sample_id} "
                      "-t {tumour_sample_id} "
                      "2> {outfile}.log ").format(**locals()))

예제 #13

0

파일 보기

파일: VariantCallers.py 프로젝트: AndreasHegerGenomics/cgat-bench

    def run(self, outfile, params):
        """Call variants with a samtools mpileup | bcftools call pipeline.

        A warning is issued if ``params.options`` contains none of the
        caller-selection flags. The compressed output is written to
        `outfile` and indexed with tabix.

        Arguments
        ---------
        outfile : string
            Output VCF file; also prefix for the log files.
        params : object
            Reads ``bam``, ``options``, ``samtools_options``,
            ``path_samtools`` and ``path``.

        Returns
        -------
        The value returned by ``P.run``.
        """

        bam = resolve_argument(params.bam, sep=" ")
        reference_fasta = get_reference(params)

        # warning: requires -m or -c in the options
        caller_flags = ("--multiallelic-caller", "-m", "-c",
                        "--consensus-caller")
        if not any(flag in params.options for flag in caller_flags):
            E.warn("bcftools call requires -m or -c, got {}".format(
                params.options))

        # limit number of jobs to node to limit I/O
        # NOTE(review): not referenced in the template; presumably read
        # by P.run from this frame - confirm before renaming/removing.
        job_threads = 4

        # the template is filled from locals(); keep the names used in it
        statement = ("{params.path_samtools} mpileup "
                     "-ug "
                     "-f {reference_fasta} "
                     "{params.samtools_options} "
                     "{bam} "
                     "2> {outfile}.pileup.log "
                     "| {params.path} call "
                     "--variants-only "
                     "--output-type z "
                     "{params.options} "
                     "2> {outfile}.call.log "
                     "> {outfile}; "
                     "tabix -p vcf {outfile} ").format(**locals())
        return P.run(statement)

예제 #14

0

파일 보기

 def run(self, infile, outfile, params):
     """Run the tool at ``params.path`` on `infile`.

     Standard output goes to `outfile`, standard error to
     ``{outfile}.err`` and the tool's own log to ``{outfile}.log``.

     Returns
     -------
     The value returned by ``P.run``.
     """
     # the template is filled from locals(); keep the names used in it
     statement = (
         "{params.path} "
         "{params.options} "
         "-I {infile} "
         "--log {outfile}.log "
         "2> {outfile}.err "
         "> {outfile} ").format(**locals())
     return P.run(statement)

예제 #15

0

파일 보기

파일: VCFTools.py 프로젝트: AndreasHegerGenomics/cgat-bench

    def run(self, outfile, params):
        """Restrict a primary VCF to the samples and sites of a filter VCF.

        Inside the output directory, the sample list of the filter VCF is
        extracted, the primary VCF is subset to these samples, and the
        subset is intersected with the filter VCF (``-n=2``). The first
        intersection file is moved to `outfile` and indexed. All output
        of the command chain goes to ``{outfile}.log``.

        Arguments
        ---------
        outfile : string
            Output filename (converted to an absolute path).
        params : object
            Reads ``primary_vcf``, ``filter_vcf``, ``path`` and
            ``options``.

        Returns
        -------
        The value returned by ``P.run``.

        Raises
        ------
        ValueError
            If ``primary_vcf`` or ``filter_vcf`` is not set.
        """

        # absolute paths are needed because the statement changes
        # into the output directory
        outfile = os.path.abspath(outfile)

        if params.primary_vcf is None:
            raise ValueError(
                "expected primary_vcf, received {}".format(
                    params.primary_vcf))
        if params.filter_vcf is None:
            raise ValueError(
                "expected filter_vcf, received {}".format(
                    params.filter_vcf))

        primary_vcf, filter_vcf = (os.path.abspath(params.primary_vcf),
                                   os.path.abspath(params.filter_vcf))
        outdir = os.path.dirname(outfile)

        # the template is filled from locals(); keep the names used in it
        statement = (
            "( "
            "cd {outdir} && "
            "{params.path} query -l {filter_vcf} > subset_samples "
            "&& {params.path} view {params.options} --force-samples "
            "-S subset_samples {primary_vcf} -Ob -o test.subset_samples.bcf "
            "&& {params.path} index test.subset_samples.bcf "
            "&& {params.path} isec {params.options} -n=2 "
            "--prefix isec "
            "test.subset_samples.bcf {filter_vcf} "
            "--output-type z "
            "&& mv -f isec/0000.vcf.gz {outfile} "
            "&& tabix {outfile} "
            ") &> {outfile}.log ").format(**locals())

        return P.run(statement)

예제 #16

0

파일 보기

파일: VCFSplit.py 프로젝트: AndreasHegerGenomics/cgat-bench

    def run(self, infile, outfiles, params):
        """Split a VCF file into one compressed, indexed file per contig.

        For every contig listed in the header of `infile`, the output
        filename is built with ``outfiles.format(contig)``, its directory
        is created and the records of that contig are extracted with
        tabix. Output directories whose VCF cannot be opened by pysam
        (empty files) are removed again.

        Arguments
        ---------
        infile : string
            Indexed VCF file to split.
        outfiles : string
            Filename template with one positional placeholder for the
            contig name.
        params : object
            Not used by this method.

        Returns
        -------
        The value returned by ``P.run`` for the list of statements
        (previously the value was computed but dropped).
        """

        # Only the contig names are needed; the context manager makes
        # sure that the file is closed even if reading the header fails.
        with pysam.VariantFile(infile) as tbxfile:
            chroms = list(tbxfile.header.contigs)

        statements = []
        for chrom in chroms:
            output_file = outfiles.format(chrom)
            output_dir = os.path.dirname(output_file)
            # the template is filled from locals(); keep the names
            statements.append(
                "mkdir {output_dir}; "
                "tabix -h {infile} {chrom} | bgzip > {output_file}; "
                "tabix -p vcf {output_file} ".format(**locals()))

        retvals = P.run(statements)

        # clean up empty vcfs, opening empty VCF in pysam throws
        # ValueError
        for chrom in chroms:
            output_file = outfiles.format(chrom)
            output_dir = os.path.dirname(output_file)
            try:
                f = pysam.VariantFile(output_file)
                f.close()
            except ValueError:
                E.warn("removing empty VCF {}".format(output_file))
                shutil.rmtree(output_dir)

        return retvals

예제 #17

0

파일 보기

파일: VCFTools.py 프로젝트: AndreasHegerGenomics/cgat-bench

    def run(self, outfiles, params):
        """Intersect the two VCF files in ``params.vcf``.

        Runs ``{params.path} isec`` with the common prefix of `outfiles`
        as output prefix, then hands the four numbered result files and
        their ``.tbi`` indices to ``self.distribute_results``, paired
        with the names in ``self.output``.

        Arguments
        ---------
        outfiles : list
            Output filenames; their common prefix is the output directory.
        params : object
            Reads ``vcf``, ``path`` and ``options``.

        Returns
        -------
        The value returned by ``P.run``.

        Raises
        ------
        ValueError
            If ``params.vcf`` does not resolve to exactly two files.
        """

        vcf = resolve_argument(params.vcf, sep=" ")
        vcfs = vcf.split(" ")
        if len(vcfs) != 2:
            raise ValueError(
                "expected 2 VCF files, received {}".format(vcfs))

        outdir = os.path.commonprefix(outfiles)
        outfile = os.path.join(outdir, "result.log")

        # the template is filled from locals(); keep the names used in it
        statement = ("{params.path} isec "
                     "{params.options} "
                     "--output-type z "
                     "--prefix {outdir} "
                     "{vcf} "
                     "&> {outfile} ").format(**locals())
        retval = P.run(statement)

        numbers = range(4)

        vcf_names = ["000{}.vcf.gz".format(x) for x in numbers]
        self.distribute_results(outdir, list(zip(vcf_names, self.output)))

        index_names = ["000{}.vcf.gz.tbi".format(x) for x in numbers]
        index_targets = [x + ".tbi" for x in self.output]
        self.distribute_results(outdir,
                                list(zip(index_names, index_targets)))

        return retval

예제 #18

0

파일 보기

파일: VariantCallers.py 프로젝트: AndreasHegerGenomics/cgat-bench

    def run(self, outfile, params):
        """Convert structural variant calls into an indexed BED file.

        The VCF file is expected next to `outfile` (``.bed.gz`` replaced
        by ``.vcf.gz``). If it does not exist, it is created first via
        ``run_tool_delly.run``. Chromosome, position, end and SV type are
        then extracted with bcftools query, post-processed with awk,
        compressed into `outfile` and indexed with tabix.

        Arguments
        ---------
        outfile : string
            Output filename ending in ``.bed.gz``.
        params : object
            Reads ``bcftools_options``; also passed to
            ``run_tool_delly.run``.

        Returns
        -------
        list
            Return values of the variant calling run (if it was needed)
            followed by the value returned by ``P.run``.
        """

        retvals = []
        prefix = IOTools.snip(outfile, ".bed.gz")
        vcffile = prefix + ".vcf.gz"
        # only call variants if the VCF is not present yet
        if not os.path.exists(vcffile):
            retvals.extend(run_tool_delly.run(self, vcffile, params))

        statements = []

        # awk: skip records without an end coordinate ("."), set column 5
        # to 0 for DEL and to 3 for DUP, drop INS records; any other type
        # is printed unchanged.
        # NOTE(review): "%%" is left as-is by str.format; presumably
        # P.run applies %-interpolation that turns it into "%" - confirm.
        # NOTE(review): "switch" is not part of POSIX awk - presumably
        # gawk is required; confirm.
        statements.append("{self.path_bcftools} query "
                          "{params.bcftools_options} "
                          "-f \"%%CHROM\\t%%POS\\t%%END\\t%%SVTYPE\\n\" "
                          "{vcffile} "
                          "| awk -v OFS='\\t' '$3 != \".\" {{ switch ($4) {{"
                          "case \"DEL\": $5=0; break; "
                          "case \"DUP\": $5=3; break; "
                          "case \"INS\": next; break; "
                          "}}; print }}' "
                          "| bgzip "
                          "> {outfile}".format(**locals()))
        statements.append("tabix -f -p bed {outfile}".format(**locals()))

        statement = "; ".join(statements)
        retvals.append(P.run(statement))

        return retvals

예제 #19

0

파일 보기

파일: BAMMetrics.py 프로젝트: AndreasHegerGenomics/cgat-bench

    def run(self, infile, outfile, params):
        """Write the header of a BAM file as a table.

        The header is dumped with ``{params.path} view -H`` into
        ``{outfile}.tmp`` and converted into a tab-separated table with
        the columns ``header_tag``, ``tag``, ``lineno`` and ``value``.
        Comment lines (``CO``) are kept as a single value with an empty
        tag; all other lines yield one row per ``TAG:value`` field.

        Arguments
        ---------
        infile : string
            BAM file.
        outfile : string
            Output table; also prefix for the log and temporary files.
        params : object
            Reads ``path``.

        Returns
        -------
        The value returned by ``P.run``, or None if the command raised
        an OSError (a warning is issued in that case).
        """

        # Initialise up front: if P.run raises OSError, retval would
        # otherwise be unbound at the return statement below.
        retval = None
        try:
            retval = P.run("{params.path} view -H  "
                           "{infile} "
                           "2> {outfile}.log "
                           "> {outfile}.tmp; ".format(**locals()))
        except OSError as e:
            E.warn("input file {} gave the following errors: {}".format(
                infile, str(e)))

        with open(outfile, "w") as outf, open(outfile + ".tmp") as inf:
            outf.write("header_tag\ttag\tlineno\tvalue\n")
            for lineno, line in enumerate(inf):
                # strip the leading "@" and the trailing newline
                fields = line[1:-1].split("\t")
                header_tag = fields[0]
                if header_tag == "CO":
                    # Do not split comment lines
                    outf.write("\t".join((header_tag, "", str(lineno),
                                          "\t".join(fields[1:]))) + "\n")
                else:
                    for field in fields[1:]:
                        # split at the first colon only, values may
                        # contain further colons
                        sub_tag, content = field.split(":", 1)
                        outf.write("\t".join((header_tag, sub_tag, str(lineno),
                                              content)) + "\n")

        os.unlink(outfile + ".tmp")
        return retval

예제 #20

0

파일 보기

 def run(self, infile, outfile, params):
     """Write name, size and modification times of `infile` as a table.

     Runs ``params.path`` with a ``--printf`` format that emits a header
     line followed by one data line, redirected to `outfile`.

     Returns
     -------
     The value returned by ``P.run``.
     """
     # the template is filled from locals(); keep the names used in it
     statement = (
         "{params.path} "
         "--printf='filename\\tsize\\tepoch_modified"
         "\\tmodified"
         "\\n"
         "%%n\\t%%s\\t%%Y\\t%%y\\n' "
         "{infile} > {outfile}").format(**locals())
     return P.run(statement)

예제 #21

0

파일 보기

    def run(self, infiles, outfile, params):
        """Concatenate compressed BED files into one sorted, indexed file.

        The files are decompressed, sorted by chromosome and start
        position, compressed with bgzip into `outfile` and indexed with
        tabix.

        Returns
        -------
        The value returned by ``P.run``.
        """

        files = " ".join(infiles)

        # the template is filled from locals(); keep the names used in it
        statement = ("zcat {files} "
                     "| sort -k 1,1 -k2,2n "
                     "| bgzip > {outfile}; "
                     "tabix -p bed {outfile} ").format(**locals())
        return P.run(statement)

예제 #22

0

파일 보기

파일: BAMTools.py 프로젝트: AndreasHegerGenomics/cgat-bench

    def run(self, outfile, params):
        """Map reads with the aligner at ``params.path`` and add read groups.

        Reads are truncated to 5999 characters per line, mapped, sorted
        with samtools, passed through picard AddOrReplaceReadGroups
        into `outfile` and indexed.

        Arguments
        ---------
        outfile : string
            Output BAM file; also prefix for the log files.
        params : object
            Reads ``options`` (must contain ``threads=<number>``),
            ``fastq``, ``path``, ``reference_fasta``, ``path_picard``,
            ``library``, ``platform`` and ``sample``.

        Returns
        -------
        The value returned by ``P.run``.

        Raises
        ------
        ValueError
            If the options do not specify an explicit number of threads.
        """

        # An explicit thread count is required so that the matching
        # number of slots can be requested. "threads=auto" previously
        # slipped through the check (which looked for "job_threads=auto")
        # and failed with an AttributeError on the regular expression.
        if "threads" not in params.options:
            raise ValueError("please specify the number of threads to use")

        match = re.search(r"threads=(\d+)", params.options)
        if "threads=auto" in params.options or match is None:
            raise ValueError(
                "please specify the number of threads "
                "to use explicitely")
        job_threads = int(match.group(1))

        # NOTE(review): job_threads and job_memory are not referenced in
        # the template; presumably P.run reads them from this frame -
        # confirm before renaming/removing.
        job_memory = "32G"

        fastq = resolve_argument(params.fastq, " ")

        tmpdir = P.get_temp_filename(clear=True)

        # the template is filled from locals(); keep the names used in it
        return P.run(
            "mkdir {tmpdir}; "
            "zcat {fastq} "
            "| cut -c -5999 "
            "| gzip > {tmpdir}/in.fastq.gz; "
            "{params.path} "
            "{params.options} "
            "in={tmpdir}/in.fastq.gz "
            "ref={params.reference_fasta} "
            "out={tmpdir}/result.bam "
            ">& {outfile}.log; "
            "samtools sort -o {tmpdir}/sorted.bam {tmpdir}/result.bam; "
            "java -Xmx8000m -jar {params.path_picard} "
            "AddOrReplaceReadGroups "
            "INPUT={tmpdir}/sorted.bam "
            "OUTPUT={outfile} "
            "VALIDATION_STRINGENCY=LENIENT "
            "RGID=1 "
            "RGLB={params.library} "
            "RGPL={params.platform} "
            "RGPU=unknown "
            "RGSM={params.sample} "
            ">& {outfile}.picard.log; "
            "samtools index {outfile} "
            ">& {outfile}.index.log; "
            "rm -rf {tmpdir}".format(**locals()))

예제 #23

0

파일 보기

파일: BAMMetrics.py 프로젝트: AndreasHegerGenomics/cgat-bench

    def run(self, infile, outfile, params):
        """Compute the per-position depth of `infile`.

        Runs ``{params.path} depth -a`` with ``params.options`` and
        redirects the output to `outfile`.

        Returns
        -------
        The value returned by ``P.run``.
        """
        # the template is filled from locals(); keep the names used in it
        return P.run(("{params.path} depth "
                      "-a "
                      "{params.options} "
                      "{infile} "
                      "> {outfile}").format(**locals()))

예제 #24

0

파일 보기

    def run(self, infiles, outfile, params):
        """Combine gVCF files in blocks and genotype the combined files.

        The input files are processed in blocks of ``self.block_size``.
        Each block is combined with CombineGVCFs into
        ``block_<start>.vcf.gz`` in the directory of `outfile`; blocks
        whose file already exists are skipped. The block files are then
        genotyped together with GenotypeGVCFs into `outfile`.

        Arguments
        ---------
        infiles : list
            gVCF files to combine.
        outfile : string
            Output filename; ``.log`` and ``.err`` files are written
            alongside.
        params : object
            Reads ``path``, ``reference_fasta`` and ``options``
            (``options`` is passed to both tool invocations).

        Returns
        -------
        list
            Return values of the block statements followed by the value
            returned for the genotyping statement.
        """

        statements = []
        outdir = os.path.dirname(outfile)
        temp_files = []
        for start in range(0, len(infiles), self.block_size):

            fn_vcf = os.path.join(outdir, "block_{}.vcf.gz".format(start))
            temp_files.append(fn_vcf)

            # block was already computed, do not re-compute
            if os.path.exists(fn_vcf):
                continue

            end = start + self.block_size
            files = " ".join(
                ["--variant {}".format(x) for x in infiles[start:end]])

            # NOTE(review): %(tmpdir)s is left untouched by str.format;
            # presumably P.run fills it in via %-interpolation - confirm.
            statements.append("java "
                              "-Djava.io.tmpdir=%(tmpdir)s "
                              "-jar {params.path} "
                              "-T CombineGVCFs "
                              "-R {params.reference_fasta} "
                              "{params.options} "
                              "{files} "
                              "--out {fn_vcf} "
                              "--log_to_file {fn_vcf}.log "
                              ">& {fn_vcf}.err; ".format(**locals()))

        # NOTE(review): the append() below assumes P.run returns a list
        # when given a list of statements (also when the list is empty
        # because all blocks exist) - confirm against P.run.
        retvals = P.run(statements, job_memory="28G")
        # all block files (existing and new) are genotyped together
        files = " ".join(["--variant {}".format(x) for x in temp_files])

        statement = ("java "
                     "-Djava.io.tmpdir=%(tmpdir)s "
                     "-jar {params.path} "
                     "-T GenotypeGVCFs "
                     "-R {params.reference_fasta} "
                     "{params.options} "
                     "{files} "
                     "--out {outfile} "
                     "--log_to_file {outfile}.log "
                     ">& {outfile}.err; ".format(**locals()))

        retvals.append(P.run(statement, job_memory="28G"))

        return retvals

예제 #25

0

파일 보기

파일: BAMMetrics.py 프로젝트: AndreasHegerGenomics/cgat-bench

    def run(self, infile, outfile, params):
        """Run picard CollectMultipleMetrics and extract the result tables.

        For every name in ``self.tablenames`` the corresponding picard
        output file (``histogram`` replaced by ``metrics``, ``picard_``
        prefix removed) is read and written to
        ``{outfile}.{tablename}.tsv``. For table names ending in
        ``metrics`` or ``histogram`` only the respective section of the
        picard file is kept. Missing picard files are skipped with a
        warning.

        Arguments
        ---------
        infile : string
            BAM file.
        outfile : string
            Prefix of the picard output files; also receives the
            command's output.
        params : namedtuple
            Reads ``path``, ``options`` and, if present,
            ``reference_fasta``.

        Returns
        -------
        The value returned by ``P.run``.
        """

        reference_fasta = (
            "REFERENCE_SEQUENCE={}".format(params.reference_fasta)
            if "reference_fasta" in params._fields else "")

        # command can fail when no output is produced, but still produce output
        # 12G is required for java overhead
        statement = ("java -Xmx8000m -jar {params.path} "
                     "CollectMultipleMetrics "
                     "{reference_fasta} "
                     "INPUT={infile} "
                     "TMP_DIR=%(tmpdir)s "
                     "{params.options} "
                     "OUTPUT={outfile} "
                     ">& {outfile} ").format(**locals())
        retval = P.run(statement,
                       job_memory="12G",
                       ignore_errors=True)

        def get_section(section, data):
            # collect the lines from the "## <section>" header up to
            # (excluding) the next line starting with "##"
            pattern = "## {}".format(section)
            inside = False
            selected = []
            for line in data:
                if line.startswith("##"):
                    inside = line.startswith(pattern)
                if inside:
                    selected.append(line)
            return selected

        for tablename in self.tablenames:
            raw = re.sub("histogram", "metrics", tablename)[len("picard_"):]
            src = "{}.{}".format(outfile, raw)

            if not os.path.exists(src):
                E.warn("no file {}, ignored".format(src))
                continue

            with IOTools.open_file(src) as inf:
                lines = inf.readlines()

            if tablename.endswith("metrics"):
                lines = get_section("METRICS", lines)
            elif tablename.endswith("histogram"):
                lines = get_section("HISTOGRAM", lines)

            dest = "{}.{}.tsv".format(outfile, tablename)
            with IOTools.open_file(dest, "w") as outf:
                outf.write("".join(lines))

        return retval

예제 #26

0

파일 보기

파일: BAMTools.py 프로젝트: AndreasHegerGenomics/cgat-bench

    def run(self, outfile, params):
        """Map reads with ``{self.path} mem`` and sort the output.

        The aligner output is converted to BAM, sorted with samtools into
        `outfile` and indexed. If a read group is requested
        (``params.set_readgroup`` or ``params.readgroup_id_regex``), it
        is passed to the aligner via ``-R``.

        Arguments
        ---------
        outfile : string
            Output BAM file; also prefix for the log files.
        params : namedtuple
            Reads ``options`` (``-t <number>`` sets the thread count,
            default 1), ``fastq``, ``reference_fasta``,
            ``set_readgroup`` and ``readgroup_id_regex``. All fields
            are forwarded to ``P.run`` as keyword arguments.

        Returns
        -------
        The value returned by ``P.run``.
        """
        # Local import: shlex.quote replaces pipes.quote, whose module
        # has been removed from recent Python versions.
        import shlex

        # Fall back to a single thread if no "-t <number>" is present
        # (previously a "-t" without a number raised AttributeError).
        match = re.search(r"-t\s*(\d+)", params.options)
        job_threads = int(match.group(1)) if match else 1

        # BWA requires at least 6Gb of memory, but is also correlated
        # with the number of threads, so use 5Gb + 1Gb per thread
        # NOTE(review): job_memory is not referenced in the template;
        # presumably P.run reads it from this frame - confirm.
        job_memory = "{}G".format(5.0 + 1.0 * job_threads)

        # quote each filename separately
        fastq = resolve_argument(params.fastq, ",")
        fastq = '"{}"'.format('" "'.join(fastq.split(",")))

        tmpdir = P.get_temp_filename(clear=True)

        if params.set_readgroup or params.readgroup_id_regex is not None:
            readgroup_string, readgroup_id, readgroup_sample = build_readgroup_string(
                outfile, params)

            readgroup_option = "-R {}".format(shlex.quote(readgroup_string))
            # add additional level of quoting: replace each tab
            # character by the two characters backslash + "t"
            readgroup_option = re.sub("\\t", "\\\\t", readgroup_option)
        else:
            readgroup_option = ""

        # the template is filled from locals(); keep the names used in it
        return P.run(
            "mkdir {tmpdir}; "
            "{self.path} mem "
            "{readgroup_option} "
            "{params.options} "
            "{params.reference_fasta} "
            "{fastq} "
            "2> {outfile}.log "
            "| samtools view -bu /dev/stdin "
            "2> {outfile}.view.log "
            "| samtools sort --threads {job_threads} -T {tmpdir} -O bam /dev/stdin "
            "2> {outfile}.sort.log "
            "> {outfile}; "
            "samtools index {outfile} >& {outfile}.index.log; "
            "rm -rf {tmpdir}".format(**locals()),
            **params._asdict())

예제 #27

0

파일 보기

    def run(self, infiles, outfile, params):
        """Run ``daisy plot-variant-stats`` over all input files.

        ``params.add_glob`` is appended to every entry of `infiles`
        before the names are placed on the command line. The command's
        standard output goes to `outfile`; the plot file names are
        derived from `outfile` through the ``.%%s.png`` pattern.

        :return: the result of ``P.run`` for the statement.
        """
        infiles = " ".join(name + params.add_glob for name in infiles)

        # the doubled percent sign is kept as is by str.format
        parts = [
            "daisy plot-variant-stats ",
            "{params.options} ",
            "--output-filename-pattern={outfile}.%%s.png ",
            "{infiles} ",
            "> {outfile}",
        ]
        statement = "".join(parts).format(
            params=params, outfile=outfile, infiles=infiles)
        return P.run(statement)

예제 #28

0

파일 보기

파일: FASTAMetrics.py 프로젝트: AndreasHegerGenomics/cgat-bench

    def run(self, infile, outfile, params):
        """Run the ``fasta2stats`` sub-command of ``params.path`` on `infile`.

        The command's log is written to `outfile` itself. The
        per-sequence table and the summary (standard output) go to
        ``<outfile>.daisy_fasta2stats_sequences.tsv`` and
        ``<outfile>.daisy_fasta2stats_summary.tsv`` respectively.

        :return: the result of ``P.run`` for the statement.
        """
        parts = [
            "{params.path} fasta2stats ",
            "--output-filename-sequences={outfile}.daisy_fasta2stats_sequences.tsv ",
            "--log {outfile} ",
            "{infile} ",
            "> {outfile}.daisy_fasta2stats_summary.tsv ",
        ]
        statement = "".join(parts).format(
            params=params, outfile=outfile, infile=infile)
        return P.run(statement)

예제 #29

0

파일 보기

    def run(self, infile, outfile, params):
        """Compare the intervals in `infile` against an annotation BED file.

        `infile` and ``params.annotations_bed`` are first handed to
        ``standardise_bed_files``, which supplies a statement writing to
        two temporary ``.bed.gz`` files. The tool at ``params.path`` is
        then run with these as segment and annotation files. Its output
        goes to `outfile` and its log to ``<outfile>.log``.

        :return: the result of ``P.run`` for the combined statement.
        """
        tmpf = P.get_temp_filename(clear=True)
        tmpf_test = tmpf + "_a.bed.gz"
        tmpf_truth = tmpf + "_b.bed.gz"

        stmnt = standardise_bed_files(tmpf_test, tmpf_truth, infile,
                                      params.annotations_bed)
        statements = [
            stmnt,
            "{params.path} "
            "--segment-bed-file={tmpf_test} "
            "--ignore-segment-tracks "
            "--annotation-bed-file={tmpf_truth} "
            "--workspace-bed-file={params.workspace_bed} "
            "--log={outfile}.log "
            "{params.options} "
            "> {outfile}",
        ]

        # The joined text, including the part supplied by the helper,
        # is formatted against this method's local variables, so the
        # local names above must stay as they are.
        statement = "; ".join(statements)
        return P.run(statement.format(**locals()))

예제 #30

0

파일 보기

파일: ONT.py 프로젝트: AndreasHegerGenomics/cgat-bench

    def run(self, infile, outfile, params):
        """Genotype `infile` with freebayes at the alleles of a reference VCF.

        If ``params.ref_sample_size`` is set, ``daisy fasta2vcf`` is run
        on the reference fasta with that sample size and its output is
        concatenated with ``params.reference_vcf`` into
        ``<outfile>.ref_sample.vcf.gz``, which then serves as the
        variant input. Otherwise ``params.reference_vcf`` is used
        directly. The genotyped calls are written to
        ``<outfile>.genotyped.vcf.gz``.

        NOTE(review): the active statements never write `outfile`
        itself; the steps that did are commented out below.

        :raises ValueError: if ``params.reference_fasta`` or
            ``params.reference_vcf`` is None.
        :return: the result of ``P.run`` for the joined statement.
        """
        # both reference inputs are mandatory
        for required in ("reference_fasta", "reference_vcf"):
            if getattr(params, required) is None:
                raise ValueError(
                    "ont_variant_depth_ratio requires {} to be set".format(
                        required))

        statement = []
        if params.ref_sample_size is None:
            reference_vcf = params.reference_vcf
        else:
            # extend the reference VCF with the fasta2vcf output
            reference_vcf = outfile + ".ref_sample.vcf.gz"
            sampling = (
                "daisy fasta2vcf "
                "--log={outfile}.fasta2vcf.log "
                "--sample-size={params.ref_sample_size} {params.reference_fasta} "
                "| bgzip "
                "> {outfile}.fasta2vcf.vcf.gz; "
                "tabix -p vcf {outfile}.fasta2vcf.vcf.gz; "
                "bcftools concat --allow-overlap "
                "{params.reference_vcf} "
                "{outfile}.fasta2vcf.vcf.gz "
                "| bgzip "
                "> {reference_vcf}; "
                "tabix -p vcf {reference_vcf} ")
            statement.append(sampling.format(
                outfile=outfile, params=params, reference_vcf=reference_vcf))

        genotyping = (
            "{params.path_freebayes} "
            "-f {params.reference_fasta} "
            "--variant-input {reference_vcf} "
            "--only-use-input-alleles "
            "{params.options_freebayes} "
            "{infile} "
            "| bgzip "
            "> {outfile}.genotyped.vcf.gz; ")
        statement.append(genotyping.format(
            params=params, reference_vcf=reference_vcf,
            infile=infile, outfile=outfile))

        # Disabled downstream steps, retained for reference:
        # "tabix -p vcf {outfile}.genotyped.vcf.gz; "
        # "{params.path_bcftools} view {params.options_bcftools} "
        # "{reference_vcf} "
        # "| bgzip > {outfile}.ref.vcf.gz; "
        # "tabix -p vcf {outfile}.ref.vcf.gz; "
        # "{params.path_bcftools} query -f \"%%CHROM\\t%%POS\\t[%%GT]\\t[%%DPR]\\n\" "
        # "{outfile}.genotyped.vcf.gz > {outfile}.genotyped.tsv; "
        # "{params.path_bcftools} query -f \"%%CHROM\\t%%POS\\t[%%GT]\\n\" "
        # "{outfile}.ref.tsv; "
        # "join -1 2 -2 2 {outfile}.ref.tsv {outfile}.genotyped.tsv "
        # "| perl -p -e \"s/[, ]/\\t/g\" "
        # "| cut -f 1,3,5,6,7 "
        # "| grep -v '\.' "
        # "> {outfile}".format(**locals()))

        statement = ";".join(statement)
        return P.run(statement)