Example #1
    def _build_asset(
        genome,
        asset_key,
        tag,
        build_pkg,
        genome_outfolder,
        specific_args,
        specific_params,
        alias,
        **kwargs,
    ):
        """
        Builds assets with pypiper and updates a genome config file.

        This function actually runs the build commands in a given build package,
        and then updates the refgenie config file.

        :param str genome: The assembly key; e.g. 'mm10'.
        :param str asset_key: The unique asset identifier; e.g. 'bowtie2_index'
        :param dict build_pkg: A dict (see examples) specifying lists
            of required input_assets, commands to run, and outputs to register as
            assets.
        """

        log_outfolder = os.path.abspath(
            os.path.join(genome_outfolder, asset_key, tag, BUILD_STATS_DIR))
        _LOGGER.info("Saving outputs to:\n- content: {}\n- logs: {}".format(
            genome_outfolder, log_outfolder))
        if args.docker:
            # Set up the volume mounts for the docker container; the genome
            # output folder must always be mounted.
            if args.volumes:
                volumes = list(args.volumes)
                volumes.append(genome_outfolder)
            else:
                volumes = genome_outfolder

        if not _writeable(genome_outfolder):
            _LOGGER.error(
                "Insufficient permissions to write to output folder: {}".
                format(genome_outfolder))
            return

        pm = pypiper.PipelineManager(name="refgenie",
                                     outfolder=log_outfolder,
                                     args=args)
        tk = pypiper.NGSTk(pm=pm)
        if args.docker:
            pm.get_container(build_pkg[CONT], volumes)
        _LOGGER.debug("Asset build package: " + str(build_pkg))
        # create a bundle list to simplify calls below
        gat = [genome, asset_key, tag]
        # collect variables required to populate the command templates
        asset_vars = get_asset_vars(
            genome,
            asset_key,
            tag,
            genome_outfolder,
            specific_args,
            specific_params,
            **kwargs,
        )
        # populate command templates
        # prior to populating, remove any seek_key parts from the keys, since these are not supported by format method
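        # (e.g. a hypothetical key "fasta.chrom_sizes" becomes "fasta" here)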
        command_list_populated = [
            x.format(**{k.split(".")[0]: v
                        for k, v in asset_vars.items()})
            for x in build_pkg[CMD_LST]
        ]
        # create output directory
        tk.make_dir(asset_vars["asset_outfolder"])

        target = os.path.join(log_outfolder,
                              TEMPLATE_TARGET.format(genome, asset_key, tag))
        # add target command
        command_list_populated.append("touch {target}".format(target=target))
        _LOGGER.debug("Command populated: '{}'".format(
            " ".join(command_list_populated)))
        try:
            # run build command
            signal.signal(signal.SIGINT, _handle_sigint(gat))
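            # _handle_sigint(gat) is assumed to return a SIGINT handler closed
            # over the genome/asset/tag bundle, so an interrupted build can be
            # cleaned up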
            pm.run(command_list_populated, target, container=pm.container)
        except pypiper.exceptions.SubprocessError:
            _LOGGER.error("asset '{}' build failed".format(asset_key))
            return False
        else:
            # save build recipe to the JSON-formatted file
            recipe_file_name = TEMPLATE_RECIPE_JSON.format(asset_key, tag)
            with open(os.path.join(log_outfolder, recipe_file_name),
                      "w") as outfile:
                json.dump(build_pkg, outfile)
            # since the assets are always built to a standard dir structure, we
            # can just stitch a path together for asset digest calculation
            asset_dir = os.path.join(rgc.data_dir, *gat)
            if not os.path.exists(asset_dir):
                raise OSError("Could not compute asset digest. Path does not "
                              "exist: {}".format(asset_dir))
            digest = get_dir_digest(asset_dir)
            _LOGGER.info("Asset digest: {}".format(digest))
            # add updates to config file
            with rgc as r:
                if asset_key == "fasta":
                    r.update_genomes(genome,
                                     data={CFG_ALIASES_KEY: [alias]},
                                     force_digest=genome)
                r.update_assets(
                    *gat[0:2],
                    data={CFG_ASSET_DESC_KEY: build_pkg[DESC]},
                    force_digest=genome,
                )
                r.update_tags(
                    *gat,
                    force_digest=genome,
                    data={
                        CFG_ASSET_PATH_KEY: asset_key,
                        CFG_ASSET_CHECKSUM_KEY: digest,
                    },
                )
                r.update_seek_keys(
                    *gat,
                    force_digest=genome,
                    keys={
                        k: v.format(**asset_vars)
                        for k, v in build_pkg[ASSETS].items()
                    },
                )
                r.set_default_pointer(*gat, force_digest=genome)
        pm.stop_pipeline()
        return True
Example #2
# Initialize
outfolder = os.path.abspath(os.path.join(args.output_parent, args.sample_name))
pm = pypiper.PipelineManager(name="rnaNucSeq", outfolder=outfolder, args=args)

# Tools
# pm.config.tools.scripts_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "tools")

# Resources
# pm.config.resources.ref_genome = os.path.join(pm.config.resources.genomes, args.genome_assembly)
# pm.config.resources.ref_genome_fasta = os.path.join(pm.config.resources.genomes, args.genome_assembly, args.genome_assembly + ".fa")
# pm.config.resources.chrom_sizes = os.path.join(pm.config.resources.genomes, args.genome_assembly, args.genome_assembly + ".chromSizes")

# Output
pm.config.parameters.pipeline_outfolder = outfolder

ngstk = pypiper.NGSTk(pm=pm)

tools = pm.config.tools
param = pm.config.parameters
resources = pm.config.resources

raw_folder = os.path.join(param.pipeline_outfolder, "raw/")
fastq_folder = os.path.join(param.pipeline_outfolder, "fastq/")

# Merge/Link sample input and Fastq conversion
# These commands merge (if multiple) or link (if single) input files,
# then convert (if necessary, for bam, fastq, or gz format) files to fastq.
################################################################################
pm.timestamp("### Merge/link and fastq conversion: ")

local_input_files = ngstk.merge_or_link([args.input, args.input2], raw_folder,
                                        args.sample_name)
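# The original snippet breaks off here; a sketch of how this step typically
# continues in the sibling pipelines (mirroring Examples #4 and #7 -- the
# calls below are an assumption, not part of this snippet):
cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq(
    local_input_files, args.sample_name, args.paired_end, fastq_folder)
pm.run(cmd,
       unaligned_fastq,
       follow=ngstk.check_fastq(local_input_files, unaligned_fastq,
                                args.paired_end))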
Example #3
def refgenie_build(rgc, args):
    """
    Runs the refgenie build recipe.
    
    :param refgenconf.RefGenConf rgc: genome configuration instance
    :param argparse.Namespace args: parsed command-line options/arguments
    """

    # Build specific args

    specific_args = {k: getattr(args, k) for k in BUILD_SPECIFIC_ARGS}

    if args.genome:
        genome = args.genome
    else:
        # This can probably be eliminated now, with flexible building.
        genome = os.path.basename(args.input)
        # eliminate extensions to get canonical genome name.
        for strike in [
                ".fasta.gz$", ".fa.gz$", ".fasta$", ".fa$", ".gz$", ".2bit$"
        ]:
            genome = re.sub(strike, "", genome)

    _LOGGER.info("Using genome name: {}".format(genome))

    if not hasattr(args, "outfolder") or not args.outfolder:
        # Default to genome_folder
        _LOGGER.debug("No outfolder provided, using genome config.")
        args.outfolder = rgc.genome_folder

    outfolder = os.path.abspath(os.path.join(args.outfolder, genome))
    if not _writeable(outfolder):
        _LOGGER.error(
            "Insufficient permissions to write to output folder: {}".format(
                outfolder))
        return

    _LOGGER.info("Output to: {} {} {}".format(genome, args.outfolder,
                                              outfolder))
    _LOGGER.debug("Default config file: {}".format(default_config_file()))

    if args.config_file and not os.path.isfile(args.config_file):
        _LOGGER.debug("Config file path isn't a file: {}".format(
            args.config_file))
        args.config_file = default_config_file()

    def path_data(root, c):
        return {"path": os.path.relpath(root, c.genome_folder)}

    def build_asset(genome, asset_key, asset_build_package, outfolder,
                    specific_args):
        """
        Builds assets with pypiper and updates a genome config file.

        This function actually runs the build commands in a given build package,
        and then updates the refgenie config file.

        :param str genome: The assembly key; e.g. 'mm10'.
        :param str asset_key: The unique asset identifier; e.g. 'bowtie2_index'
        :param dict asset_build_package: A dict (see examples) specifying lists
            of required inputs, commands to run, and outputs to register as
            assets.
        """
        _LOGGER.debug("Asset build package: " + str(asset_build_package))
        asset_vars = get_asset_vars(genome, asset_key, outfolder, specific_args)

        # asset_outfolder is assumed to be provided by get_asset_vars, as in
        # Example #1's asset_vars["asset_outfolder"]
        asset_outfolder = asset_vars["asset_outfolder"]
        tk.make_dir(asset_outfolder)
        target = os.path.join(asset_outfolder, "build_complete.flag")
        command_list_populated = [
            x.format(**asset_vars) for x in asset_build_package["command_list"]
        ]

        touch_target = "touch {target}".format(target=target)
        command_list_populated.append(touch_target)

        _LOGGER.debug("Command list populated: " + str(command_list_populated))

        pm.run(command_list_populated, target, container=pm.container)
        # Add index information to rgc
        for asset_key, relative_path in asset_build_package["assets"].items():
            rgc.update_genomes(genome, asset_key,
                               {"path": relative_path.format(**asset_vars)})

        # Write the updated refgenie genome configuration
        rgc.write()

    pm = pypiper.PipelineManager(name="refgenie",
                                 outfolder=outfolder,
                                 args=args)
    tk = pypiper.NGSTk(pm=pm)
    tools = pm.config.tools  # Convenience alias
    index = pm.config.index
    param = pm.config.param

    container = None
    if args.docker:
        # Set up the volume mounts for the docker container; the output
        # folder must always be mounted.
        if args.volumes:
            volumes = list(args.volumes)
            volumes.append(outfolder)
        else:
            volumes = outfolder
        pm.get_container("nsheff/refgenie", volumes)

    for asset_key in args.asset:
        if asset_key in asset_build_packages.keys():
            asset_build_package = asset_build_packages[asset_key]
            _LOGGER.debug(specific_args)
            required_inputs = ", ".join(asset_build_package["required_inputs"])
            _LOGGER.info("Inputs required to build '{}': {}".format(
                asset_key, required_inputs))
            for required_input in asset_build_package["required_inputs"]:
                if not specific_args[required_input]:
                    raise ValueError(
                        "Argument '{}' is required to build asset '{}', but not provided"
                        .format(required_input, asset_key))

            for required_asset in asset_build_package["required_assets"]:
                try:
                    if not rgc.get_asset(args.genome, required_asset):
                        raise ValueError(
                            "Asset '{}' is required to build asset '{}', but not provided"
                            .format(required_asset, asset_key))
                except refgenconf.exceptions.MissingGenomeError:
                    raise ValueError(
                        "Asset '{}' is required to build asset '{}', but not provided"
                        .format(required_asset, asset_key))
            build_asset(args.genome, asset_key, asset_build_package, outfolder,
                        specific_args)
        else:
            _LOGGER.warning(
                "Recipe does not exist for asset '{}'".format(asset_key))

    # if False:
    #     # pm.make_sure_path_exists(outfolder)
    #     conversions = {}
    #     conversions[".2bit"] = "twoBitToFa {INPUT} {OUTPUT}"
    #     conversions[".gz"] = tk.ziptool + " -cd {INPUT} > {OUTPUT}"

    #     # Copy fasta file to genome folder structure
    #     local_raw_fasta = genome + ".fa"
    #     raw_fasta = os.path.join(outfolder, local_raw_fasta)

    #     input_fasta, cmd = copy_or_download_file(args.fasta, outfolder)
    #     pm.run(cmd, input_fasta)

    #     cmd = convert_file(input_fasta, raw_fasta, conversions)
    #     if cmd:
    #         pm.run(cmd, raw_fasta, container=pm.container)

    # # Copy annotation file (if any) to folder structure
    # if args.gtf:
    #     annotation_file_unzipped = os.path.join(outfolder, genome + ".gtf")
    #     annotation_file, cmd = copy_or_download_file(args.gtf, outfolder)
    #     pm.run(cmd, annotation_file)

    #     cmd = convert_file(annotation_file, annotation_file_unzipped, conversions)
    #     pm.run(cmd, annotation_file_unzipped)

    # #   cmd = "cp " + args.gtf + " " + annotation_file
    # #   cmd2 = tk.ziptool + " -d " + annotation_file
    # #   pm.run([cmd, cmd2], annotation_file_unzipped)

    # else:
    #     _LOGGER.debug("* No GTF gene annotations provided. Skipping this step.")

    # # Bowtie indexes
    # if index.bowtie2:
    #     asset_key = "indexed_bowtie2"
    #     folder = os.path.join(outfolder, asset_key)
    #     tk.make_dir(folder)
    #     target = os.path.join(folder, "completed.flag")
    #     cmd1 = "ln -sf ../" + local_raw_fasta + " " + folder
    #     cmd2 = tools.bowtie2build + " " + raw_fasta + " " + os.path.join(folder, genome)
    #     cmd3 = "touch " + target
    #     pm.run([cmd1, cmd2, cmd3], target, container=pm.container)
    #     # Add index information to rgc
    #     rgc.update_genomes(genome, asset_key, path_data(folder, rgc))

    #     # Write the updated refgenie genome configuration
    #     rgc.write()

    # # Bismark index - bowtie2
    # if index.bismark_bt2:
    #     asset_key = "indexed_bismark_bt2"
    #     folder = os.path.join(outfolder, asset_key)
    #     tk.make_dir(folder)
    #     target = os.path.join(folder, "completed.flag")
    #     cmd1 = "ln -sf ../" + local_raw_fasta + " " + folder
    #     cmd2 = tools.bismark_genome_preparation + " --bowtie2 " + folder
    #     cmd3 = "touch " + target
    #     pm.run([cmd1, cmd2, cmd3], target, container=pm.container)
    #     rgc.update_genomes(genome, asset_key, path_data(folder, rgc))
    #     rgc.write()

    # # Bismark index - bowtie1
    # if index.bismark_bt1:
    #     asset_key = "indexed_bismark_bt1"
    #     folder = os.path.join(outfolder, asset_key)
    #     tk.make_dir(folder)
    #     target = os.path.join(folder, "completed.flag")
    #     cmd1 = "ln -sf ../" + local_raw_fasta + " " + folder
    #     cmd2 = tools.bismark_genome_preparation + " " + folder
    #     cmd3 = "touch " + target
    #     pm.run([cmd1, cmd2, cmd3], target, container=pm.container)
    #     rgc.update_genomes(genome, asset_key, path_data(folder, rgc))
    #     rgc.write()

    # # Epilog meth calling
    # if index.epilog:
    #     asset_key = "indexed_epilog"
    #     folder = os.path.join(outfolder, asset_key)
    #     tk.make_dir(folder)
    #     target = os.path.join(folder, "completed.flag")
    #     cmd1 = "ln -sf ../" + local_raw_fasta + " " + folder
    #     cmd2 = tools.epilog_indexer + " -i " + raw_fasta
    #     cmd2 += " -o " + os.path.join(folder, genome + "_" + param.epilog.context + ".tsv")
    #     cmd2 += " -s " + param.epilog.context  # context
    #     cmd2 += " -t"
    #     cmd3 = "touch " + target
    #     pm.run([cmd1, cmd2, cmd3], target, container=pm.container)
    #     rgc.update_genomes(genome, asset_key, path_data(folder, rgc))
    #     rgc.write()

    # if index.hisat2:
    #     asset_key = "indexed_hisat2"
    #     folder = os.path.join(outfolder, asset_key)
    #     tk.make_dir(folder)
    #     target = os.path.join(folder, "completed.flag")
    #     cmd1 = "ln -sf ../" + local_raw_fasta + " " + folder
    #     cmd2 = tools.hisat2build + " " + raw_fasta + " " + os.path.join(folder, genome)
    #     cmd3 = "touch " + target
    #     pm.run([cmd1, cmd2, cmd3], target, container=pm.container)
    #     rgc.update_genomes(genome, asset_key, path_data(folder, rgc))
    #     rgc.write()

    # # Kallisto should index transcriptome
    # # So it doesn't make sense to run these at the same time as the others.
    # if index.kallisto:
    #     asset_key = "indexed_kallisto"
    #     folder = os.path.join(outfolder, asset_key)
    #     tk.make_dir(folder)
    #     target = os.path.join(folder, "completed.flag")
    #     cmd2 = tools.kallisto + " index -i " + os.path.join(folder, genome + "_kallisto_index.idx")
    #     cmd2 += " " + raw_fasta
    #     cmd3 = "touch " + target
    #     pm.run([cmd2, cmd3], target, container=pm.container)
    #     rgc.update_genomes(genome, asset_key, path_data(folder, rgc))
    #     rgc.write()

    pm.stop_pipeline()
Example #4
def main(cmdl):

    args = _parse_args(cmdl)

    # Merging
    ################################################################################
    # If 2 input files are given, then these are to be merged.
    # Must be done here to initialize the sample name correctly
    if len(args.input) > 1:
        if args.sample_name == "default":
            args.sample_name = "merged"
    else:
        if args.sample_name == "default":
            # Default sample name is derived from the input file
            args.sample_name = os.path.splitext(os.path.basename(
                args.input[0]))[0]

    # Create a PipelineManager object and start the pipeline
    outfolder = os.path.abspath(
        os.path.join(args.output_parent, args.sample_name))
    pm = pypiper.PipelineManager(name="WGBS",
                                 outfolder=outfolder,
                                 args=args,
                                 version=__version__)

    # Set up a few additional paths not in the config file
    pm.config.tools.scripts_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "tools")
    pm.config.resources.ref_genome_fasta = os.path.join(
        pm.config.resources.genomes, args.genome_assembly,
        args.genome_assembly + ".fa")
    pm.config.resources.chrom_sizes = os.path.join(
        pm.config.resources.genomes, args.genome_assembly,
        args.genome_assembly + ".chromSizes")
    pm.config.resources.genomes_split = os.path.join(
        pm.config.resources.resources, "genomes_split")
    try:
        pm.config.resources.bismark_spikein_genome = os.path.join(
            pm.config.resources.genomes, pm.config.resources.spikein_genome,
            "indexed_bismark_bt1")
    except Exception:
        # no spike-in genome configured; spike-in steps are skipped downstream
        pm.config.resources.bismark_spikein_genome = None

    pm.config.resources.bismark_indexed_genome = os.path.join(
        pm.config.resources.genomes, args.genome_assembly,
        "indexed_bismark_bt2")

    # Epilog indexes
    pm.config.resources.methpositions = os.path.join(
        pm.config.resources.genomes, args.genome_assembly, "indexed_epilog",
        args.genome_assembly + "_cg.tsv.gz")

    if pm.config.resources.bismark_spikein_genome:
        pm.config.resources.spikein_methpositions = os.path.join(
            pm.config.resources.genomes, pm.config.resources.spikein_genome,
            "indexed_epilog",
            pm.config.resources.spikein_genome + "_index.tsv.gz")

    pm.config.parameters.pipeline_outfolder = outfolder

    print(pm.config)
    tools = pm.config.tools  # Convenience alias
    param = pm.config.parameters
    resources = pm.config.resources

    # Create a ngstk object
    ngstk = pypiper.NGSTk(pm=pm)

    raw_folder = os.path.join(param.pipeline_outfolder, "raw/")
    fastq_folder = os.path.join(param.pipeline_outfolder, "fastq/")

    # Merge/Link sample input and Fastq conversion
    # These commands merge (if multiple) or link (if single) input files,
    # then convert (if necessary, for bam, fastq, or gz format) files to fastq.
    ################################################################################
    pm.timestamp("### Merge/link and fastq conversion: ")

    local_input_files = ngstk.merge_or_link([args.input, args.input2],
                                            raw_folder, args.sample_name)
    cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq(
        local_input_files, args.sample_name, args.paired_end, fastq_folder)
    pm.run(cmd,
           unaligned_fastq,
           follow=ngstk.check_fastq(local_input_files, unaligned_fastq,
                                    args.paired_end))
    pm.clean_add(out_fastq_pre + "*.fastq", conditional=True)

    pm.report_result("File_mb", ngstk.get_file_size(local_input_files))
    pm.report_result("Read_type", args.single_or_paired)
    pm.report_result("Genome", args.genome_assembly)

    # Adapter trimming
    ################################################################################
    pm.timestamp("### Adapter trimming: ")

    # We need to detect the quality encoding type of the fastq.
    if isinstance(unaligned_fastq, list):
        example_fq = unaligned_fastq[0]
    else:
        example_fq = unaligned_fastq

    cmd = tools.python + " -u " + os.path.join(
        tools.scripts_dir, "detect_quality_code.py") + " -f " + example_fq
    encoding_string = pm.checkprint(cmd)
    if encoding_string.find("phred33") != -1:
        encoding = "phred33"
    elif encoding_string.find("phred64") != -1:
        encoding = "phred64"
    else:
        raise Exception("Unknown quality encoding type: " + encoding_string)

    trimmed_fastq = out_fastq_pre + "_R1_trimmed.fq"
    trimmed_fastq_R2 = out_fastq_pre + "_R2_trimmed.fq"

    cmd = tools.java + " -Xmx" + str(pm.mem) + " -jar " + tools.trimmomatic
    if args.paired_end:
        cmd += " PE"
    else:
        cmd += " SE"
    cmd += " -" + encoding
    cmd += " -threads " + str(pm.cores) + " "
    #cmd += " -trimlog " + os.path.join(fastq_folder, "trimlog.log") + " "
    if args.paired_end:
        cmd += out_fastq_pre + "_R1.fastq "
        cmd += out_fastq_pre + "_R2.fastq "
        cmd += out_fastq_pre + "_R1_trimmed.fq "
        cmd += out_fastq_pre + "_R1_unpaired.fq "
        cmd += out_fastq_pre + "_R2_trimmed.fq "
        cmd += out_fastq_pre + "_R2_unpaired.fq "
    else:
        cmd += out_fastq_pre + "_R1.fastq "
        cmd += out_fastq_pre + "_R1_trimmed.fq "
    cmd += " " + param.trimmomatic.trimsteps
    cmd += " ILLUMINACLIP:" + resources.adapter_file + param.trimmomatic.illuminaclip

    pm.run(cmd,
           trimmed_fastq,
           follow=ngstk.check_trim(trimmed_fastq,
                                   args.paired_end,
                                   trimmed_fastq_R2,
                                   fastqc_folder=os.path.join(
                                       param.pipeline_outfolder, "fastqc/")))

    pm.clean_add(os.path.join(fastq_folder, "*.fastq"), conditional=True)
    pm.clean_add(os.path.join(fastq_folder, "*.fq"), conditional=True)
    pm.clean_add(os.path.join(fastq_folder, "*.log"), conditional=True)
    pm.clean_add(fastq_folder, conditional=True)

    # WGBS alignment with bismark.
    ################################################################################
    pm.timestamp("### Bismark alignment: ")
    # Bismark will start multiple instances of bowtie, so we have to split
    # the allotted cores among the instances. Otherwise we would use 2x or 4x
    # the number of cores we are supposed to. It will start 2 threads in
    # normal mode, and 4 in --non-directional mode.

    if param.bismark.nondirectional:
        bismark_bowtie_threads = 4
    else:
        bismark_bowtie_threads = 2

    bismark_cores = int(pm.cores) // bismark_bowtie_threads

    if int(pm.cores) % bismark_bowtie_threads != 0:
        print("inefficient core request; make divisible by " +
              str(bismark_bowtie_threads))
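    # e.g. with 8 allotted cores in --non-directional mode (4 bowtie threads
    # per instance), bismark_cores works out to 8 // 4 = 2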

    bismark_folder = os.path.join(param.pipeline_outfolder,
                                  "bismark_" + args.genome_assembly)
    ngstk.make_sure_path_exists(bismark_folder)
    bismark_temp = os.path.join(bismark_folder, "bismark_temp")
    ngstk.make_sure_path_exists(bismark_temp)

    if args.paired_end:
        out_bismark = os.path.join(bismark_folder,
                                   args.sample_name + "_pe.bam")
    else:
        out_bismark = os.path.join(bismark_folder, args.sample_name + ".bam")

    cmd = tools.bismark + " " + resources.bismark_indexed_genome + " "
    if args.paired_end:
        cmd += " --1 " + out_fastq_pre + "_R1_trimmed.fq"
        cmd += " --2 " + out_fastq_pre + "_R2_trimmed.fq"
    else:
        cmd += out_fastq_pre + "_R1_trimmed.fq"
    cmd += " --bam --unmapped"
    # Bowtie may be specified in raw form to indicate presence on path.
    if tools.bowtie2 != "bowtie2":
        cmd += " --path_to_bowtie " + tools.bowtie2
    cmd += " --bowtie2"
    cmd += " --temp_dir " + bismark_temp
    cmd += " --output_dir " + bismark_folder
    if args.paired_end:
        cmd += " --minins 0"
        cmd += " --maxins " + str(param.bismark.maxins)
    cmd += " -p " + str(bismark_cores)  # Number of processors
    cmd += " --basename=" + args.sample_name

    # By default, BS-seq libraries are directional, but this can be turned off
    # in bismark for non-directional protocols
    if param.bismark.nondirectional:
        cmd += " --non_directional"

    def check_bismark():
        ar = ngstk.count_mapped_reads(out_bismark, args.paired_end)
        pm.report_result("Aligned_reads", ar)
        rr = float(pm.get_stat("Raw_reads"))
        tr = float(pm.get_stat("Trimmed_reads"))
        pm.report_result("Alignment_rate",
                         round(float(ar) * 100 / float(tr), 2))
        pm.report_result("Total_efficiency",
                         round(float(ar) * 100 / float(rr), 2))

        mr = ngstk.count_multimapping_reads(out_bismark, args.paired_end)
        pm.report_result("Multimap_reads", mr)
        pm.report_result("Multimap_rate",
                         round(float(mr) * 100 / float(tr), 2))

    pm.run(cmd, out_bismark, follow=check_bismark)

    # Secondary single mode:
    # align unmapped in single end mode?
    if args.paired_end and args.single2:
        pm.timestamp("### Bismark secondary single-end alignment: ")
        out_bismark_se = []
        for read_n in ["1", "2"]:  # Align each read in single end mode
            read_string = "R" + str(read_n)
            bismark2_folder = os.path.join(bismark_folder,
                                           "se" + str(read_string))
            ngstk.make_sure_path_exists(bismark2_folder)
            bismark2_temp = os.path.join(bismark2_folder, "bismark2_temp")
            ngstk.make_sure_path_exists(bismark2_temp)
            out_bismark2 = os.path.join(
                bismark2_folder, args.sample_name + read_string + ".bam")

            unmapped_reads_pre = os.path.join(bismark_folder, args.sample_name)

            cmd = tools.bismark + " " + resources.bismark_indexed_genome + " "
            cmd += unmapped_reads_pre + "_unmapped_reads_" + str(
                read_n) + ".fq"
            cmd += " --bam --unmapped"
            # Bowtie may be specified in raw form to indicate presence on path.
            if tools.bowtie2 != "bowtie2":
                cmd += " --path_to_bowtie " + tools.bowtie2
            cmd += " --bowtie2"
            cmd += " --temp_dir " + bismark2_temp
            cmd += " --output_dir " + bismark2_folder
            cmd += " --basename=" + args.sample_name + read_string
            cmd += " -p " + str(bismark_cores)
            if param.bismark.nondirectional:
                cmd += " --non_directional"

            pm.run(cmd, out_bismark2)
            out_bismark_se.append(out_bismark2)

        # Now merge, sort, and analyze the single-end data
        merged_bismark = args.sample_name + "_SEmerged.bam"
        output_merge = os.path.join(bismark_folder, merged_bismark)
        cmd = ngstk.merge_bams(out_bismark_se,
                               output_merge,
                               in_sorted="FALSE",
                               tmp_dir=resources.tmp_dir)

        pm.run(cmd, output_merge)
        # Sort by read name
        sorted_bismark = args.sample_name + "_SEsorted.bam"
        output_sort = os.path.join(bismark_folder, sorted_bismark)

        cmd = tools.samtools + " sort -n -o " + output_merge + " " + output_sort
        pm.run(cmd, output_sort)

        cmd = tools.python + " -u " + os.path.join(tools.scripts_dir,
                                                   "rematch_pairs.py")
        cmd += " -i " + output_sort

        pm.run(cmd, lock_name="rematch")

    pm.timestamp("### PCR duplicate removal: ")
    # Bismark's deduplication forces output naming, how annoying.
    #out_dedup = os.path.join(bismark_folder, args.sample_name + "_pe.deduplicated.bam")
    cmd, out_dedup = get_dedup_bismark_cmd(paired=args.paired_end,
                                           infile=out_bismark,
                                           prog=tools.deduplicate_bismark)
    with FolderContext(bismark_folder):
        pm.run(cmd,
               out_dedup,
               follow=lambda: pm.report_result(
                   "Deduplicated_reads",
                   ngstk.count_reads(out_dedup, args.paired_end)))
    if not os.path.isfile(out_dedup):
        pm.fail_pipeline(
            IOError("Missing deduplication target: {}".format(out_dedup)))

    pm.timestamp("### Aligned read filtering: ")

    # convert bam file into sam file and sort again to
    # compensate for a sorting issue of "deduplicate_bismark"
    sam_temp = os.path.join(bismark_folder, "sam_temp")
    ngstk.make_sure_path_exists(sam_temp)
    out_sam = os.path.join(bismark_folder,
                           args.sample_name + ".aln.deduplicated.sam")
    #Is this an old version of samtools?
    #cmd = tools.samtools + " sort -n -o " + out_dedup + " " + out_dedup.replace(".bam", "_sorted") + " | " + tools.samtools + " view -h - >" + out_sam
    #cmd = tools.samtools + " sort -n " + out_dedup + " " + " | " + tools.samtools + " view -h - >" + out_sam
    cmd = tools.samtools + " sort -n " + out_dedup + " -o " + out_sam
    pm.run(cmd, out_sam, shell=True)

    #sorted file same size as presorted?
    #pm.report_result("Filtered_reads", ngstk.count_reads(out_sam_filter, args.paired_end)) = ngstk.count_reads(out_sam, args.paired_end)
    #if sorted_reads != deduplicated_reads:
    #	raise Exception("Sorted size doesn't match deduplicated size.")

    out_sam_filter = os.path.join(bismark_folder,
                                  args.sample_name + ".aln.dedup.filt.sam")

    headerLines = subprocess.check_output(tools.samtools + " view -SH " +
                                          out_sam + " | wc -l",
                                          shell=True).decode().strip()
    cmd = tools.python + " " + os.path.join(
        tools.scripts_dir, "bisulfiteReadFiltering_forRNA.py")
    cmd += " --infile=" + out_sam
    cmd += " --outfile=" + out_sam_filter
    cmd += " --skipHeaderLines=" + headerLines
    cmd += " --genome=" + args.genome_assembly
    cmd += " --genomeDir=" + resources.genomes
    cmd += " --minNonCpgSites=3"
    cmd += " --minConversionRate=0.9"
    if args.paired_end:
        cmd = cmd + " --pairedEnd"

    pm.run(cmd,
           out_sam_filter,
           follow=lambda: pm.report_result(
               "Filtered_reads",
               ngstk.count_reads(out_sam_filter, args.paired_end)))

    # Clean up all intermediates
    pm.clean_add(out_bismark)  # initial mapped bam file
    pm.clean_add(os.path.join(bismark_folder, "*.fastq"))
    pm.clean_add(os.path.join(bismark_folder, "*.fq"))
    pm.clean_add(out_dedup)  # deduplicated bam file
    pm.clean_add(out_sam)  # dedup conversion to sam
    pm.clean_add(out_sam_filter)  # after filtering

    # Epilog analysis
    ################################################################################

    # Create the program specification, in scope both for ordinary and spike-in.
    if args.epilog:
        try:
            epilog_prog_spec = ProgSpec(jar=tools.epilog,
                                        memory=pm.mem,
                                        cores=pm.cores)
        except MissingEpilogError as e:
            print("ERROR: {} --  skipping epilog".format(str(e)))
            epilog_prog_spec = None
    else:
        epilog_prog_spec = None

    if epilog_prog_spec:

        # Sort and index the deduplicated alignments.
        out_dedup_sorted = re.sub(r'.bam$', "_sort.bam", out_dedup)
        cmd2 = tools.samtools + " sort -@ " + str(
            pm.cores) + " -o " + out_dedup_sorted + " " + out_dedup
        cmd3 = tools.samtools + " index " + out_dedup_sorted
        pm.run([cmd2, cmd3], out_dedup_sorted + ".bai")

        # Separate output subfolder for epilog
        epilog_output_dir = os.path.join(param.pipeline_outfolder,
                                         "epilog_" + args.genome_assembly)
        ngstk.make_sure_path_exists(epilog_output_dir)

        pm.timestamp("### Epilog Methcalling: ")
        run_main_epi_pipe(pm,
                          epiconf=param.epilog,
                          prog_spec=epilog_prog_spec,
                          readsfile=out_dedup_sorted,
                          sitesfile=resources.methpositions,
                          outdir=epilog_output_dir,
                          rrbs_fill=0)
        pm.timestamp("### COMPLETE: epilog")

    # Methylation extractor
    ################################################################################
    # REMARK NS:
    # Bismark methylation extractor produces various outputs, but unfortunately none
    # are great. The default "coverage" (.bismark.cov) file looks like this:
    # chr	start	stop	meth	methylated	unmethylated
    # chr17	4890653	4890653	100	1	0
    # chr17	5334751	5334751	100	1	0
    # This output lacks strand information, so you don't know if the coordinate is
    # pointing to a C or G on the + strand unless you look it up in the reference genome.
    # The "cytosine_report" file has all the info, but includes an entry for every
    # CpG, covered or not:
    # chr17	3000204	+	0	0	CG	CGT
    # chr17	3000205	-	0	0	CG	CGA
    # chr17	4890653	-	1	0	CG	CGA
    # Solution: Use the cytosine_report file, and filter out any uncovered positions.

    pm.timestamp("### Methylation calling (bismark extractor): ")

    extract_dir = os.path.join(bismark_folder, "extractor")
    ngstk.make_sure_path_exists(extract_dir)
    out_extractor = os.path.join(
        extract_dir,
        re.sub(r'.sam$', '.bismark.cov', os.path.basename(out_sam_filter)))
    out_cpg_report = re.sub(r'.bismark.cov$', '.CpG_report.txt.gz',
                            out_extractor)

    cmd = tools.bismark_methylation_extractor
    if args.paired_end:
        cmd += " --paired-end --no_overlap"
    else:
        cmd += " --single-end"
    cmd += " --report"
    cmd += " --bedGraph"
    cmd += " --merge_non_CpG"
    cmd += " --cytosine_report"
    cmd += " --genome_folder " + resources.bismark_indexed_genome
    cmd += " --gzip"
    cmd += " --output " + extract_dir
    cmd += " " + out_sam_filter

    pm.run(cmd, out_cpg_report)

    # TODO: make these boolean flags options to the pipeline
    keep_bismark_report = True
    keep_non_standard_chromosomes = False
    adjust_minus_strand = True

    # prepare outputs:
    out_cpg_report_filt = re.sub(r'.CpG_report.txt.gz$',
                                 '.CpG_report_filt.txt', out_cpg_report)
    out_cpg_report_filt_cov = re.sub(r'.CpG_report.txt.gz$',
                                     '.CpG_report_filt.cov', out_cpg_report)

    # remove uncovered regions:
    # Update to Bismark version 17 now gzips this output.
    cmd = ngstk.ziptool + " -c -d"
    cmd += " " + out_cpg_report
    cmd += " | awk '{ if ($4+$5 > 0) print; }'"
    cmd += " > " + out_cpg_report_filt
    pm.run(cmd, out_cpg_report_filt, shell=True)

    # convert the bismark report to the simpler coverage format and adjust the
    # coordinates of CpGs on the reverse strand while doing so (by subtracting
    # 1 from the start):
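    # e.g. the "-" strand CpG reported at chr17:3000205 in the cytosine_report
    # above would appear with start 3000204 in the .cov output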
    if os.path.getsize(out_cpg_report_filt) == 0:
        print("Methylation report () is empty -- skipping conversion".format(
            out_cpg_report_filt))
    else:
        cmd = tools.Rscript + " " + os.path.join(
            tools.scripts_dir, "convertBismarkReport.R"
        )  # disable coverage filter, because we have already used `awk` to achieve this result
        cmd += " --formats=cov,min"
        cmd += " --noCovFilter"
        if keep_non_standard_chromosomes:
            cmd += " --noChromFilter"
        if not adjust_minus_strand:
            cmd += " --noAdjustMinusStrand"
        cmd += " -i " + out_cpg_report_filt
        pm.run(cmd, out_cpg_report_filt_cov, nofail=True)

    # tidy up:
    if not keep_bismark_report:
        pm.clean_add(out_cpg_report_filt)

    # Make bigwig
    ################################################################################
    pm.timestamp("### Make bigwig: ")

    bedGraph = re.sub(".bismark.cov$", ".bedGraph", out_extractor)
    sort_bedGraph = re.sub(".bedGraph$", ".sort.bedGraph", bedGraph)
    out_bigwig = re.sub(".bedGraph$", ".bw", bedGraph)
    cmd1 = ngstk.ziptool + " -c -d"
    cmd1 += " " + bedGraph
    cmd1 += " | sed '1d' " + " | LC_COLLATE=C sort -k1,1 -k2,2n - " + " > " + sort_bedGraph
    cmd2 = tools.bedGraphToBigWig + " " + sort_bedGraph + " " + resources.chrom_sizes
    cmd2 += " " + out_bigwig

    pm.run([cmd1, cmd2], out_bigwig)

    # Spike-in alignment
    ################################################################################
    # currently using bowtie1 instead of bowtie2
    if resources.bismark_spikein_genome:
        pm.timestamp("### Bismark spike-in alignment: ")
        spikein_folder = os.path.join(param.pipeline_outfolder,
                                      "bismark_spikein")
        ngstk.make_sure_path_exists(spikein_folder)
        spikein_temp = os.path.join(spikein_folder, "bismark_temp")
        ngstk.make_sure_path_exists(spikein_temp)
        out_spikein_base = args.sample_name + ".spikein.aln"

        #out_spikein = spikein_folder + args.sample_name + "_R1_trimmed.fastq_unmapped_reads_1.fq_bismark_pe.bam"

        unmapped_reads_pre = os.path.join(bismark_folder, args.sample_name)
        if args.paired_end:
            out_spikein = os.path.join(spikein_folder,
                                       out_spikein_base + "_pe.bam")
        else:
            out_spikein = os.path.join(spikein_folder,
                                       out_spikein_base + ".bam")
        cmd = tools.bismark + " " + resources.bismark_spikein_genome + " "
        if args.paired_end:
            cmd += " --1 " + unmapped_reads_pre + "_unmapped_reads_1.fq"
            cmd += " --2 " + unmapped_reads_pre + "_unmapped_reads_2.fq"
        else:
            cmd += unmapped_reads_pre + "_unmapped_reads.fq"
        cmd += " --bam --unmapped"
        # Bowtie may be specified in raw form to indicate presence on path.
        if tools.bowtie1 != "bowtie":
            cmd += " --path_to_bowtie " + tools.bowtie1
        #cmd += " --bowtie2"
        cmd += " --temp_dir " + spikein_temp
        cmd += " --output_dir " + spikein_folder
        if args.paired_end:
            cmd += " --minins 0"
            cmd += " --maxins " + str(param.bismark.maxins)
        cmd += " --basename=" + out_spikein_base
        if param.bismark.nondirectional:
            cmd += " --non_directional"

        pm.run(cmd, out_spikein, nofail=True)
        # Clean up the unmapped file which is copied from the parent
        # bismark folder to here:
        pm.clean_add(os.path.join(spikein_folder, "*.fq"), conditional=False)
        pm.clean_add(spikein_temp)

        pm.timestamp("### PCR duplicate removal (Spike-in): ")
        # Bismark's deduplication forces output naming, how annoying.
        #out_spikein_dedup = spikein_folder + args.sample_name + ".spikein.aln.deduplicated.bam"
        cmd, out_spikein_dedup = get_dedup_bismark_cmd(
            paired=args.paired_end,
            infile=out_spikein,
            prog=tools.deduplicate_bismark)
        out_spikein_sorted = re.sub(r'.deduplicated.bam$',
                                    '.deduplicated.sorted.bam',
                                    out_spikein_dedup)
        cmd2 = tools.samtools + " sort " + out_spikein_dedup + " -o " + out_spikein_sorted
        cmd3 = tools.samtools + " index " + out_spikein_sorted
        cmd4 = "rm " + out_spikein_dedup
        pm.run([cmd, cmd2, cmd3, cmd4],
               out_spikein_sorted + ".bai",
               nofail=True)

        # Spike-in methylation calling
        ################################################################################
        pm.timestamp("### Methylation calling (testxmz) Spike-in: ")
        spike_chroms = ngstk.get_chrs_from_bam(out_spikein_sorted)

        for chrom in spike_chroms:
            cmd1 = tools.python + " -u " + os.path.join(
                tools.scripts_dir, "testxmz.py")
            cmd1 += " " + out_spikein_sorted + " " + chrom
            cmd1 += " >> " + pm.pipeline_stats_file
            pm.callprint(cmd1, nofail=True)

        # spike in conversion efficiency calculation with epilog
        if epilog_prog_spec:
            ngstk.make_sure_path_exists(spikein_folder)
            pm.timestamp("### Spike-in Epilog Methcalling: ")
            spikein_epiconf = copy.deepcopy(param.epilog)
            spikein_epiconf.context = "C"
            spikein_epiconf.no_epi_stats = True  # Always skip stats for spike-in.
            try:
                run_main_epi_pipe(pm,
                                  epiconf=spikein_epiconf,
                                  prog_spec=epilog_prog_spec,
                                  readsfile=out_spikein_sorted,
                                  sitesfile=resources.spikein_methpositions,
                                  outdir=spikein_folder,
                                  rrbs_fill=0)
            except Exception as e:
                print("WARNING -- Could not run epilog -- {}".format(e))
        """
		epilog_spike_outfile=os.path.join(
				spikein_folder, args.sample_name + "_epilog.bed")
		epilog_spike_summary_file=os.path.join(
				spikein_folder, args.sample_name + "_epilog_summary.bed")
		
		cmd = tools.epilog
		cmd += " call"
		cmd += " --infile=" + out_spikein_sorted  # absolute path to the bsmap aligned bam
		cmd += " --positions=" + resources.spikein_methpositions
		cmd += " --outfile=" + epilog_spike_outfile
		cmd += " --summary=" + epilog_spike_summary_file
		cmd += " --cores=" + str(pm.cores)
		cmd += " --qual-threshold=30"
		cmd += " --read-length-threshold=30"
		cmd += " --wgbs"    # No RRBS "fill-in"
		
		pm.run(cmd, epilog_spike_outfile, nofail=True)
		
		# Now parse some results for pypiper result reporting.
	
		for chrom in spike_chroms:
			cmd = tools.python + " -u " + os.path.join(tools.scripts_dir, "tsv_parser.py")
			cmd += " -i " + os.path.join(spikein_folder, epilog_spike_summary_file)
			cmd += " -r context=C chr=" + chrom
	
			cmd_total = cmd + " -c " + "total"
			x = pm.checkprint(cmd_total, shell=True)
			pm.report_result(chrom+'_count_EL', x)
			cmd_rate = cmd + " -c " + "rate"
			x = pm.checkprint(cmd_rate, shell=True)
			pm.report_result(chrom+'_meth_EL', x)
		"""

    # Final sorting and indexing
    ################################################################################
    # create sorted and indexed BAM files for visualization and analysis
    pm.timestamp("### Final sorting and indexing: ")

    #out_header = bismark_folder + args.sample_name + ".reheader.bam"
    out_final = os.path.join(bismark_folder, args.sample_name + ".final.bam")
    # temp_folder = os.path.join(bismark_folder, "tmp")

    # # Sort
    # cmd = tools.java + " -Xmx" + str(pm.mem)
    # # This sort can run out of temp space on big jobs; this puts the temp to a
    # # local spot.
    # cmd += " -Djava.io.tmpdir=" + str(temp_folder)
    # cmd += " -jar " + tools.picard + " SortSam"
    # cmd += " I=" + out_sam_filter
    # cmd += " O=" + out_final
    # cmd += " SORT_ORDER=coordinate"
    # cmd += " VALIDATION_STRINGENCY=SILENT"
    # cmd += " CREATE_INDEX=true"
    # pm.run(cmd, out_final, lock_name="final_sorting")

    cmd = tools.samtools + " sort -@ " + str(
        pm.cores) + " " + out_sam_filter + " -o " + out_final
    cmd2 = tools.samtools + " index " + out_final
    pm.run([cmd, cmd2], out_final + ".bai")

    # Cleanup
    ################################################################################
    # remove temporary folders
    pm.clean_add(bismark_temp)
    pm.clean_add(sam_temp)
    pm.stop_pipeline()
Example #5
def run_pipeline():
    # A good practice is to make an output folder for each sample, housed under
    # the parent output folder, like this:
    outfolder = os.path.abspath(
        os.path.join(args.output_parent, args.sample_name))

    # Create a PipelineManager object and start the pipeline
    pm = pypiper.PipelineManager(name="logmuse-test",
                                 outfolder=outfolder,
                                 args=args)
    pm.info("Getting started!")
    # NGSTk is a "toolkit" that comes with pypiper, providing some functions
    # for dealing with genome sequence data. You can read more about toolkits in the
    # documentation

    files = [str(x) + ".tmp" for x in range(1, 20)]

    pm.run("touch " + " ".join(files), target=files, clean=True)

    # Create a ngstk object
    ngstk = pypiper.NGSTk(pm=pm)

    raw_folder = os.path.join(outfolder, "raw/")
    fastq_folder = os.path.join(outfolder, "fastq/")

    # Merge/Link sample input and Fastq conversion
    # These commands merge (if multiple) or link (if single) input files,
    # then convert (if necessary, for bam, fastq, or gz format) files to fastq.

    # We'll start with a timestamp that will provide a division for this section
    # in the log file
    pm.timestamp("### Merge/link and fastq conversion: ")

    # Now we'll rely on 2 NGSTk functions that can handle inputs of various types
    # and convert these to fastq files.

    local_input_files = ngstk.merge_or_link([args.input, args.input2],
                                            raw_folder, args.sample_name)

    cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq(
        local_input_files, args.sample_name, args.paired_end, fastq_folder)

    # Now we'll use another NGSTk function to grab the file size from the input files
    #
    pm.report_result("File_mb", ngstk.get_file_size(local_input_files))

    # And then count the number of reads in the file

    n_input_files = len(list(filter(bool, local_input_files)))

    raw_reads = sum([
        int(ngstk.count_reads(input_file, args.paired_end))
        for input_file in local_input_files
    ]) / n_input_files
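    # i.e. the average read count across the input files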

    # Finally, we use the report_result() function to print the output and
    # log the key-value pair in the standard stats.tsv file
    pm.report_result("Raw_reads", str(raw_reads))

    # Cleanup
    pm.stop_pipeline()
Example #6
def process(sample, pipeline_config, args):
    """
	This takes unmapped Bam files and makes trimmed, aligned, duplicate marked
	and removed, indexed, shifted Bam files along with a UCSC browser track.
	Peaks are called and filtered.
	"""

    print("Start processing sample %s." % sample.sample_name)

    # for path in ["sample_root"] + sample.paths.__dict__.keys():
    # 	if not os.path.exists(sample.paths[path]):
    # 		try:
    # 			os.mkdir(sample.paths[path])
    # 		except OSError("Cannot create '%s' path: %s" % (path, sample.paths[path])):
    # 			raise

    # Start Pypiper object
    pm = pypiper.PipelineManager("rnaKallisto",
                                 sample.paths.sample_root,
                                 args=args)

    print "\nPipeline configuration:"
    print(pm.config)
    tools = pm.config.tools  # Convenience alias
    param = pm.config.parameters
    resources = pm.config.resources

    raw_folder = os.path.join(sample.paths.sample_root, "raw")
    fastq_folder = os.path.join(sample.paths.sample_root, "fastq")

    sample.paired = False
    if args.single_or_paired == "paired":
        sample.paired = True

    # Create a ngstk object
    ngstk = pypiper.NGSTk(pm=pm)

    # Convert bam to fastq
    pm.timestamp("Converting to Fastq format")

    local_input_files = ngstk.merge_or_link([args.input, args.input2],
                                            raw_folder, args.sample_name)
    cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq(
        local_input_files, args.sample_name, sample.paired, fastq_folder)
    pm.run(cmd,
           unaligned_fastq,
           follow=ngstk.check_fastq(local_input_files, unaligned_fastq,
                                    sample.paired))
    pm.clean_add(out_fastq_pre + "*.fastq", conditional=True)

    pm.report_result("File_mb", ngstk.get_file_size(local_input_files))
    pm.report_result("Read_type", args.single_or_paired)
    pm.report_result("Genome", args.genome_assembly)

    sample.fastq = out_fastq_pre + "_R1.fastq"
    sample.trimmed = out_fastq_pre + "_R1_trimmed.fastq"
    sample.fastq1 = out_fastq_pre + "_R1.fastq" if sample.paired else None
    sample.fastq2 = out_fastq_pre + "_R2.fastq" if sample.paired else None
    sample.trimmed1 = out_fastq_pre + "_R1_trimmed.fastq" if sample.paired else None
    sample.trimmed1Unpaired = out_fastq_pre + "_R1_unpaired.fastq" if sample.paired else None
    sample.trimmed2 = out_fastq_pre + "_R2_trimmed.fastq" if sample.paired else None
    sample.trimmed2Unpaired = out_fastq_pre + "_R2_unpaired.fastq" if sample.paired else None

    #if not sample.paired:
    #	pm.clean_add(sample.fastq, conditional=True)
    #if sample.paired:
    #	pm.clean_add(sample.fastq1, conditional=True)
    #	pm.clean_add(sample.fastq2, conditional=True)
    #	pm.clean_add(sample.fastqUnpaired, conditional=True)

    # Trim reads
    pm.timestamp("Trimming adapters from sample")
    if pipeline_config.parameters.trimmer == "trimmomatic":

        inputFastq1 = sample.fastq1 if sample.paired else sample.fastq
        inputFastq2 = sample.fastq2 if sample.paired else None
        outputFastq1 = sample.trimmed1 if sample.paired else sample.trimmed
        outputFastq1unpaired = sample.trimmed1Unpaired if sample.paired else None
        outputFastq2 = sample.trimmed2 if sample.paired else None
        outputFastq2unpaired = sample.trimmed2Unpaired if sample.paired else None

        PE = sample.paired
        pe = "PE" if PE else "SE"
        cmd = tools.java + " -Xmx" + str(pm.mem) + " -jar " + tools.trimmomatic
        cmd += " {0} -threads {1} {2}".format(pe, args.cores, inputFastq1)
        if PE:
            cmd += " {0}".format(inputFastq2)
        cmd += " {0}".format(outputFastq1)
        if PE:
            cmd += " {0} {1} {2}".format(outputFastq1unpaired, outputFastq2,
                                         outputFastq2unpaired)
        if args.quantseq:
            cmd += " HEADCROP:6"
        cmd += " ILLUMINACLIP:" + resources.adapters + ":2:10:4:1:true"
        if args.quantseq:
            cmd += " ILLUMINACLIP:" + "/data/groups/lab_bsf/resources/trimmomatic_adapters/PolyA-SE.fa" + ":2:30:5:1:true"
        cmd += " SLIDINGWINDOW:4:1"
        cmd += " MAXINFO:16:0.40"
        cmd += " MINLEN:21"

        pm.run(cmd,
               sample.trimmed1 if sample.paired else sample.trimmed,
               shell=True,
               nofail=True,
               follow=ngstk.check_trim(sample.trimmed,
                                       sample.paired,
                                       sample.trimmed2,
                                       fastqc_folder=os.path.join(
                                           sample.paths.sample_root,
                                           "fastqc/")))
        if not sample.paired:
            pm.clean_add(sample.trimmed, conditional=True)
        else:
            pm.clean_add(sample.trimmed1, conditional=True)
            pm.clean_add(sample.trimmed1Unpaired, conditional=True)
            pm.clean_add(sample.trimmed2, conditional=True)
            pm.clean_add(sample.trimmed2Unpaired, conditional=True)

    elif pipeline_config.parameters.trimmer == "skewer":
        skewer_dirpath = os.path.join(sample.paths.sample_root, "skewer")
        ngstk.make_dir(skewer_dirpath)
        sample.trimlog = os.path.join(skewer_dirpath, "trim.log")
        cmd = ngstk.skewer(
            inputFastq1=sample.fastq1 if sample.paired else sample.fastq,
            inputFastq2=sample.fastq2 if sample.paired else None,
            outputPrefix=os.path.join(sample.paths.sample_root, "fastq/",
                                      sample.sample_name),
            outputFastq1=sample.trimmed1 if sample.paired else sample.trimmed,
            outputFastq2=sample.trimmed2 if sample.paired else None,
            log=sample.trimlog,
            cpus=args.cores,
            adapters=pipeline_config.resources.adapters)
        pm.run(cmd,
               sample.trimmed1 if sample.paired else sample.trimmed,
               shell=True,
               nofail=True,
               follow=ngstk.check_trim(sample.trimmed,
                                       sample.paired,
                                       sample.trimmed2,
                                       fastqc_folder=os.path.join(
                                           sample.paths.sample_root,
                                           "fastqc/")))
        if not sample.paired:
            pm.clean_add(sample.trimmed, conditional=True)
        else:
            pm.clean_add(sample.trimmed1, conditional=True)
            pm.clean_add(sample.trimmed2, conditional=True)

    # With kallisto from unmapped reads
    pm.timestamp("Quantifying read counts with kallisto")

    inputFastq = sample.trimmed1 if sample.paired else sample.trimmed
    inputFastq2 = sample.trimmed2 if sample.paired else None
    transcriptomeIndex = os.path.join(
        pm.config.resources.genomes, sample.transcriptome, "indexed_kallisto",
        sample.transcriptome + "_kallisto_index.idx")

    bval = 0  # Number of bootstrap samples (default: 0)
    size = 50  # Estimated average fragment length
    sdev = 20  # Estimated standard deviation of fragment length
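    # kallisto only needs -l/-s for single-end input; for paired-end reads it
    # estimates the fragment length distribution from the data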
    sample.paths.quant = os.path.join(sample.paths.sample_root, "kallisto")
    sample.kallistoQuant = os.path.join(sample.paths.quant, "abundance.h5")
    cmd1 = tools.kallisto + " quant -b {0} -l {1} -s {2} -i {3} -o {4} -t {5}".format(
        bval, size, sdev, transcriptomeIndex, sample.paths.quant, args.cores)
    if not sample.paired:
        cmd1 += " --single {0}".format(inputFastq)
    else:
        cmd1 += " {0} {1}".format(inputFastq, inputFastq2)
    cmd2 = tools.kallisto + " h5dump -o {0} {0}/abundance.h5".format(
        sample.paths.quant)

    pm.run([cmd1, cmd2], sample.kallistoQuant, shell=True, nofail=True)

    pm.stop_pipeline()
    print("Finished processing sample %s." % sample.sample_name)
Example #7
def main(cmdl):
    args = _parse_args(cmdl)
    # Create a PipelineManager object and start the pipeline
    outfolder = os.path.abspath(
        os.path.join(args.output_parent, args.sample_name))
    pm = pypiper.PipelineManager(name="RRBS",
                                 outfolder=outfolder,
                                 args=args,
                                 version=__version__)

    # Set up a few additional paths not in the config file
    pm.config.tools.scripts_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "tools")
    pm.config.resources.ref_genome_fasta = os.path.join(
        pm.config.resources.genomes, args.genome_assembly,
        args.genome_assembly + ".fa")
    pm.config.resources.chrom_sizes = os.path.join(
        pm.config.resources.genomes, args.genome_assembly,
        args.genome_assembly + ".chromSizes")
    pm.config.resources.genomes_split = os.path.join(
        pm.config.resources.resources, "genomes_split")
    pm.config.resources.bismark_spikein_genome = os.path.join(
        pm.config.resources.genomes, pm.config.resources.spikein_genome,
        "indexed_bismark_bt1")

    # Epilog indexes
    pm.config.resources.methpositions = os.path.join(
        pm.config.resources.genomes, args.genome_assembly, "indexed_epilog",
        args.genome_assembly + "_cg.tsv.gz")
    pm.config.resources.spikein_methpositions = os.path.join(
        pm.config.resources.genomes, pm.config.resources.spikein_genome,
        "indexed_epilog", pm.config.resources.spikein_genome + "_index.tsv.gz")

    pm.config.parameters.pipeline_outfolder = outfolder

    print(pm.config)
    tools = pm.config.tools  # Convenience alias
    param = pm.config.parameters
    resources = pm.config.resources

    # Create a ngstk object
    ngstk = pypiper.NGSTk(pm=pm)

    raw_folder = os.path.join(param.pipeline_outfolder, "raw/")
    fastq_folder = os.path.join(param.pipeline_outfolder, "fastq/")

    # Merge/Link sample input and Fastq conversion
    # These commands merge (if multiple) or link (if single) input files,
    # then convert (if necessary, for bam, fastq, or gz format) files to fastq.
    ################################################################################
    pm.timestamp("### Merge/link and fastq conversion: ")

    local_input_files = ngstk.merge_or_link([args.input, args.input2],
                                            raw_folder, args.sample_name)
    cmd, out_fastq_pre, unaligned_fastq = ngstk.input_to_fastq(
        local_input_files, args.sample_name, args.paired_end, fastq_folder)
    pm.run(cmd,
           unaligned_fastq,
           follow=ngstk.check_fastq(local_input_files, unaligned_fastq,
                                    args.paired_end))
    pm.clean_add(out_fastq_pre + "*.fastq", conditional=True)

    pm.report_result("File_mb", ngstk.get_file_size(local_input_files))
    pm.report_result("Read_type", args.single_or_paired)
    pm.report_result("Genome", args.genome_assembly)

    if args.dark_bases:
        pm.timestamp("### Dark sequencing mode: ")
        cmd = (tools.scripts_dir + "/darkSeqCombineReads.pl " +
               out_fastq_pre + "_R1.fastq " +
               out_fastq_pre + "_R2.fastq " +
               out_fastq_pre + "_undark_R1.fastq " +
               str(args.dark_bases))
        out_fastq_pre = out_fastq_pre + "_undark"
        unaligned_fastq = out_fastq_pre + "_R1.fastq"
        pm.run(cmd, unaligned_fastq)
        args.paired_end = False

    ################################################################################
    pm.timestamp("### Adapter trimming: ")

    # We need to detect the quality encoding type of the fastq.

    if args.paired_end:
        # Just look at the first read
        cmd = tools.python + " -u " + os.path.join(
            tools.scripts_dir,
            "detect_quality_code.py") + " -f " + unaligned_fastq[0]
    else:
        cmd = tools.python + " -u " + os.path.join(
            tools.scripts_dir,
            "detect_quality_code.py") + " -f " + unaligned_fastq

    encoding_string = pm.checkprint(cmd)
    if "phred33" in encoding_string:
        encoding = "phred33"
    elif "phred64" in encoding_string:
        encoding = "phred64"
    else:
        raise Exception("Unknown quality encoding type: " + encoding_string)
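    # [Illustration] A minimal sketch of how such detection typically works
    # (hypothetical helper; detect_quality_code.py itself is not shown in this
    # snippet). Phred+33 quality strings use characters from '!' (ord 33)
    # upward; Phred+64 uses '@' (ord 64) upward, so the range of observed
    # ordinals is diagnostic:
    def _guess_phred_encoding(fastq_path, max_records=1000):
        lo, hi = 255, 0
        with open(fastq_path) as fh:
            for i, line in enumerate(fh):
                if i // 4 >= max_records:
                    break
                if i % 4 == 3:  # every 4th line of a fastq is the quality string
                    codes = [ord(c) for c in line.rstrip("\n")]
                    lo, hi = min([lo] + codes), max([hi] + codes)
        if lo < 59:
            return "phred33"  # ordinals below ';' only occur in Phred+33
        if hi > 74:
            return "phred64"  # ordinals above 'J' only occur in Phred+64
        return "unknown"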

    if args.trimgalore:
        # Trim galore requires biopython, cutadapt modules. RSeQC as well (maybe?)
        #   --- $trim_galore -q $q --phred33 -a $a --stringency $s -e $e --length $l --output_dir $output_dir $input_fastq

        raise NotImplementedError("TrimGalore no longer supported")
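        # NOTE: everything below this raise is unreachable; it is kept only as
        # a reference for how the trim_galore invocation used to be built.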

        if args.paired_end:
            raise NotImplementedError("TrimGalore for PE RRBS not implemented")
        input_fastq = out_fastq_pre + "_R1.fastq "

        # With trimgalore, the output file is predetermined.
        trimmed_fastq = out_fastq_pre + "_R1_trimmed.fq"

        output_dir = fastq_folder

        # Adapter
        a = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"

        cmd = tools.trimgalore
        cmd += " -q 20"  # quality trimming
        cmd += " --" + encoding
        cmd += " -a " + a
        cmd += " --stringency 1"  # stringency: Overlap with adapter sequence required to trim a sequence
        cmd += " -e 0.1"  # Maximum allowed error rate
        cmd += " --length 16"  # Minimum Read length
        # by unchangeable default, Trimmomatic discards reads of length 0 (produced by ILLUMINACLIP):
        cmd += " --output_dir " + output_dir + " " + input_fastq

    else:
        # Trimmomatic

        trimmed_fastq = out_fastq_pre + "_R1_trimmed.fq"
        trimmed_fastq_R2 = out_fastq_pre + "_R2_trimmed.fq"

        # REMARK AS: instead of trim_galore we use Trimmomatic for now
        # - it is more compatible with the other pipelines
        # - better code base, not a Python wrapper around a Perl script (as trim_galore is)
        # - rrbs-mode not needed because biseq has the same functionality

        # REMARK NS:
        # The -Xmx4000m restricts heap memory allowed to java, and is necessary
        # to prevent java from allocating lots of memory willy-nilly
        # if it's on a machine with lots of memory, which can lead
        # to jobs getting killed by a resource manager. By default, java will
        # use more memory on systems that have more memory, leading to node-dependent
        # killing effects that are hard to trace.

        cmd = tools.java + " -Xmx" + str(
            pm.mem) + " -jar " + tools.trimmomatic_epignome
        if args.paired_end:
            cmd += " PE"
        else:
            cmd += " SE"
        cmd += " -" + encoding
        cmd += " -threads " + str(pm.cores) + " "
        #cmd += " -trimlog " + os.path.join(fastq_folder, "trimlog.log") + " "
        if args.paired_end:
            cmd += out_fastq_pre + "_R1.fastq "
            cmd += out_fastq_pre + "_R2.fastq "
            cmd += out_fastq_pre + "_R1_trimmed.fq "
            cmd += out_fastq_pre + "_R1_unpaired.fq "
            cmd += out_fastq_pre + "_R2_trimmed.fq "
            cmd += out_fastq_pre + "_R2_unpaired.fq "
        else:
            cmd += out_fastq_pre + "_R1.fastq "
            cmd += out_fastq_pre + "_R1_trimmed.fq "
        cmd += "ILLUMINACLIP:" + resources.adapter_file + param.trimmomatic.illuminaclip

    # The trimming command has now been constructed for whichever trimmer was
    # selected; the code to run it is the same either way:

    pm.run(cmd,
           trimmed_fastq,
           follow=ngstk.check_trim(trimmed_fastq,
                                   args.paired_end,
                                   trimmed_fastq_R2,
                                   fastqc_folder=os.path.join(
                                       param.pipeline_outfolder, "fastqc/")))

    pm.clean_add(os.path.join(fastq_folder, "*.fastq"), conditional=True)
    pm.clean_add(os.path.join(fastq_folder, "*.fq"), conditional=True)
    pm.clean_add(os.path.join(fastq_folder, "*.log"), conditional=True)
    pm.clean_add(fastq_folder, conditional=True)

    # RRBS alignment with BSMAP.
    ################################################################################
    pm.timestamp("### BSMAP alignment: ")
    bsmap_folder = os.path.join(param.pipeline_outfolder, "bsmap_" +
                                args.genome_assembly)  # e.g. bsmap_hg19
    ngstk.make_sure_path_exists(bsmap_folder)
    # no tmp folder needed for BSMAP alignment

    out_bsmap = os.path.join(bsmap_folder, args.sample_name + ".bam")

    cmd = tools.bsmap
    cmd += " -a " + out_fastq_pre + "_R1_trimmed.fq"
    if args.paired_end:
        cmd += " -b " + out_fastq_pre + "_R2_trimmed.fq"
    cmd += " -d " + resources.ref_genome_fasta
    cmd += " -o " + out_bsmap
    cmd += " " + str(param.bsmap.rrbs_mapping_mode)
    cmd += " -w " + str(param.bsmap.equal_best_hits)
    cmd += " -v " + str(param.bsmap.mismatch_rate)
    cmd += " -r " + str(param.bsmap.report_repeat)
    cmd += " -p " + str(param.bsmap.processors)
    cmd += " -n " + str(param.bsmap.map_to_strands)
    cmd += " -s " + str(param.bsmap.seed_size)
    cmd += " -S " + str(param.bsmap.random_number_seed)
    cmd += " -f " + str(param.bsmap.filter)
    cmd += " -q " + str(param.bsmap.quality_threshold)
    cmd += " -u"  # report unmapped reads (into same bam file)
    cmd += " -V 1"  # set verbosity level
    if args.paired_end:
        cmd += " -m " + str(param.bsmap.minimal_insert_size)
        cmd += " -x " + str(param.bsmap.maximal_insert_size)

    def check_bsmap():
        # BSMap apparently stores all the reads (mapped and unmapped) in
        # its output bam; to count aligned reads, then, we have to use
        # a -F4 flag (with count_mapped_reads instead of count_reads).
        ar = ngstk.count_mapped_reads(out_bsmap, args.paired_end)
        pm.report_result("Aligned_reads", ar)
        rr = float(pm.get_stat("Raw_reads"))
        tr = float(pm.get_stat("Trimmed_reads"))
        pm.report_result("Alignment_rate",
                         round(float(ar) * 100 / float(tr), 2))
        pm.report_result("Total_efficiency",
                         round(float(ar) * 100 / float(rr), 2))

        # In addition, BSMap can (if instructed by parameters) randomly assign
        # multimapping reads. It's useful to know how many in the final bam were such.
        mr = ngstk.count_multimapping_reads(out_bsmap, args.paired_end)
        pm.report_result("Multimap_reads", mr)
        pm.report_result("Multimap_rate",
                         round(float(mr) * 100 / float(tr), 2))
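    # [Illustration] a rough sketch of the counting that ngstk.count_mapped_reads
    # presumably performs (hypothetical stand-in, not pypiper's actual code):
    # "samtools view -c -F 4" counts records without the 0x4 (unmapped) FLAG
    # bit set, which is what the "-F4 flag" remark above refers to.
    def _count_mapped_reads_sketch(bam_path, samtools="samtools"):
        import subprocess
        out = subprocess.check_output(
            [samtools, "view", "-c", "-F", "4", bam_path])
        return int(out.strip())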

    pm.run(cmd, out_bsmap, follow=check_bsmap)

    # Sort and index the BSMAP output (required as of bsmap 2.90):
    cmd2 = tools.samtools + " sort -o " + out_bsmap + " " + out_bsmap
    cmd3 = tools.samtools + " index " + out_bsmap
    pm.run([cmd2, cmd3], out_bsmap + ".bai")

    # Clean up big intermediate files:
    pm.clean_add(os.path.join(bsmap_folder, "*.fastq"))
    pm.clean_add(os.path.join(bsmap_folder, "*.fq"))

    # Run biseq-methcalling:
    ################################################################################
    pm.timestamp("### Biseq methylation calling: ")

    # Python Software Requirements for biseq
    # REMARK AS: all packages are available via "easy_install --user <lib>"
    # pip is also a possibility if available (currently not on CeMM infrastructure)
    #
    # Direct links just in case:
    # - biopython: wget https://pypi.python.org/pypi/biopython or wget http://biopython.org/DIST/biopython-1.63.zip
    # - bitarray: wget https://pypi.python.org/packages/source/b/bitarray/bitarray-0.8.1.tar.gz
    # - guppy: wget https://pypi.python.org/packages/source/g/guppy/guppy-0.1.10.tar.gz
    # - pysam: wget https://code.google.com/p/pysam/downloads/detail?name=pysam-0.7.5.tar.gz

    biseq_output_path = os.path.join(param.pipeline_outfolder,
                                     "biseq_" + args.genome_assembly)
    biseq_output_path_web = os.path.join(biseq_output_path, "web")
    biseq_output_path_temp = os.path.join(biseq_output_path, "temp")

    ngstk.make_sure_path_exists(biseq_output_path)

    cmd = tools.python + " -u " + os.path.join(tools.scripts_dir,
                                               "biseqMethCalling.py")
    cmd += " --sampleName=" + args.sample_name
    cmd += " --alignmentFile=" + out_bsmap  # this is the absolute path to the bsmap aligned bam file
    cmd += " --methodPrefix=RRBS"
    cmd += " --rrbsMode"
    cmd += " --restrictionSite=" + str(
        param.biseq.restrictionSite
    )  # specify the pattern of restriction sites
    cmd += " --checkRestriction"
    cmd += " --minFragmentLength=" + str(param.biseq.minFragmentLength)
    cmd += " --maxFragmentLength=" + str(param.biseq.maxFragmentLength)
    cmd += " --pfStatus=" + str(param.biseq.pfStatus)
    cmd += " --maxMismatches=" + str(param.biseq.maxMismatches)
    cmd += " --baseQualityScoreC=" + str(param.biseq.baseQualityScoreC)
    cmd += " --baseQualityScoreNextToC=" + str(
        param.biseq.baseQualityScoreNextToC)
    cmd += " --laneSpecificStatistics"
    cmd += " --bigBedFormat"
    cmd += " --deleteTemp"
    cmd += " --toolsDir=" + tools.biseq_tools
    cmd += " --outputDir=" + biseq_output_path
    cmd += " --webOutputDir=" + biseq_output_path_web
    cmd += " --tempDir=" + biseq_output_path_temp
    cmd += " --timeDelay=" + str(param.biseq.timeDelay)
    cmd += " --genomeFraction=" + str(param.biseq.genomeFraction)
    cmd += " --smartWindows=" + str(param.biseq.smartWindows)
    cmd += " --maxProcesses=" + str(param.biseq.maxProcesses)
    cmd += " --genomeDir=" + resources.genomes_split
    cmd += " --inGenome=" + args.genome_assembly
    cmd += " --outGenome=" + args.genome_assembly
    # TODO AS: Investigate what happens with biseq in the case of paired-end data

    # The dog genome has 38 autosomes (plus the sex chromosomes), so it is
    # best to check here for such rarely used reference genomes:
    # The default value for includedChromosomes is chr1-30, X, Y, Z (sufficient
    # for human and mouse genomes).
    # REMARK NS: This is a hack to account for the way biseq restricts to the
    # default chroms. This should be fixed in biseq in the future, but for now
    # it lets us run dog samples through the default pipeline. Hack!
    if args.genome_assembly == "canFam3":
        cmd += ' --includedChromosomes="chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,' \
            'chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chr23,chr24,chr25,chr26,chr27,chr28,chr29,chr30,chrX,' \
            'chrY,chrZ,chr31,chr32,chr33,chr34,chr35,chr36,chr37,chr38"'

    # Deactivated options:
    #cmd += " --appendStatisticsOutput=" + stat_output  # TODO AS: I disable this option for now. This is an analysis-global file where every biseq run writes to
    #stat_output = os.path.join(biseq_output_path, "RRBS_biseq_statistics.txt")  # general stats file independent of sample

    biseq_finished_helper = os.path.join(biseq_output_path, "biseq.completed")
    cmd2 = "touch " + biseq_finished_helper

    pm.run([cmd, cmd2], target=biseq_finished_helper)

    # Now parse some results for pypiper result reporting.
    read_variables = [
        'uniqueSeqMotifCount', 'totalSeqMotifCount', 'bisulfiteConversionRate',
        'globalMethylationMean'
    ]
    totalSeqMotifCount = 0.0
    uniqueSeqMotifCount = 0.0
    for var in read_variables:
        cmd = tools.python + " -u " + os.path.join(tools.scripts_dir,
                                                   "tsv_parser.py")
        cmd += " -i " + os.path.join(
            biseq_output_path, "RRBS_statistics_" + args.sample_name + ".txt")
        cmd += " -c " + var
        x = pm.checkprint(cmd, shell=True)

        if var == 'uniqueSeqMotifCount':
            uniqueSeqMotifCount = float(x)
            pm.report_result('Unique_CpGs', x)
        elif var == 'totalSeqMotifCount':
            totalSeqMotifCount = float(x)
            pm.report_result('Total_CpGs', x)
            pm.report_result('meanCoverage',
                             str(totalSeqMotifCount / uniqueSeqMotifCount))
        else:
            pm.report_result(var, x)
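    # [Illustration] what tsv_parser.py's "-c <column>" mode plausibly does
    # (hypothetical sketch; the shipped script is not reproduced here): read
    # the header row, then print the requested column's value.
    def _tsv_column_value(tsv_path, column, sep="\t"):
        import csv
        with open(tsv_path) as fh:
            reader = csv.DictReader(fh, delimiter=sep)
            row = next(reader)  # assumes the stats file has a single data row
            return row[column]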

    ################################################################################
    pm.timestamp("### Make bigbed: ")
    # REMARK AS: "make bigwig" normally uses a bismark output file. For RRBS we
    # don't have the bismark cov file (essentially a bedGraph) that the tool
    # bedGraphToBigWig would need
    # REMARK AS: UCSC tracks are generated by biseq-methcalling

    # First, convert the bed format into the bigBed input style.
    # This is how biseq did it, but it's actually unnecessary; instead we can just go straight off the output file.
    # Left command here for posterity.
    # awk '{ printf "%s\t%s\t%s\t\047%s%[\04720\047]\047\t%s\t%s\n", $1, $2, $3, $5/10, $5, $6 }' RRBS_cpgMethylation_01_2276TU.bed > f

    # bigbed conversion input file is the biseq methylation calls output file
    biseq_methcall_file = os.path.join(
        biseq_output_path, "RRBS_cpgMethylation_" + args.sample_name + ".bed")

    bigbed_output_path = os.path.join(param.pipeline_outfolder,
                                      "bigbed_" + args.genome_assembly)
    bigwig_output_path = os.path.join(param.pipeline_outfolder,
                                      "bigwig_" + args.genome_assembly)

    ngstk.make_sure_path_exists(bigbed_output_path)
    ngstk.make_sure_path_exists(bigwig_output_path)
    bigbed_output_file = os.path.join(bigbed_output_path,
                                      "RRBS_" + args.sample_name + ".bb")
    out_bedGraph = os.path.join(bigwig_output_path,
                                "RRBS_" + args.sample_name + ".bedGraph")
    out_bigwig = os.path.join(bigwig_output_path,
                              "RRBS_" + args.sample_name + ".bw")

    cmd = tools.bedToBigBed
    cmd += " " + biseq_methcall_file
    cmd += " " + resources.chrom_sizes
    cmd += " " + bigbed_output_file

    # REMARK NS: As of June 2015, IGV will load bigBed files for methylation
    # in a special format if the *filename* contains "RRBS_cpgMethylation" -- see
    # https://github.com/igvteam/igv/blob/master/src/org/broad/igv/methyl/MethylTrack.java
    # This is obviously not ideal, but I will create a link with this filename
    # to the original file (even for WGBS tracks) so that these can be loaded
    # into IGV if desired:

    filename_hack_link_file = os.path.join(
        bigbed_output_path, "RRBS_cpgMethylation_" + args.sample_name + ".bb")
    cmd2 = "ln -sf " + os.path.relpath(
        bigbed_output_file, bigbed_output_path) + " " + filename_hack_link_file

    pm.run([cmd, cmd2], bigbed_output_file)

    # Let's also make bigwigs:

    # First convert to bedGraph
    cmd = "awk -v OFS='\t' '{ print $1, $2, $3, $5/10 }'"
    cmd += " " + biseq_methcall_file
    cmd += " > " + out_bedGraph

    pm.clean_add(out_bedGraph, conditional=True)

    cmd2 = tools.bedGraphToBigWig
    cmd2 += " " + out_bedGraph
    cmd2 += " " + resources.chrom_sizes
    cmd2 += " " + out_bigwig

    pm.run([cmd, cmd2], out_bigwig, shell=True)

    ################################################################################

    # Create the program specification, in scope both for ordinary and spike-in.
    if args.epilog:
        try:
            epilog_prog_spec = ProgSpec(jar=tools.epilog,
                                        memory=pm.mem,
                                        cores=pm.cores)
        except MissingEpilogError as e:
            print("ERROR: {} --  skipping epilog".format(str(e)))
            epilog_prog_spec = None
    else:
        epilog_prog_spec = None

    if epilog_prog_spec:
        epilog_output_dir = os.path.join(param.pipeline_outfolder,
                                         "epilog_" + args.genome_assembly)
        ngstk.make_sure_path_exists(epilog_output_dir)
        pm.timestamp("### Epilog Methcalling: ")
        run_main_epi_pipe(pm,
                          epiconf=param.epilog,
                          prog_spec=epilog_prog_spec,
                          readsfile=out_bsmap,
                          sitesfile=resources.methpositions,
                          outdir=epilog_output_dir,
                          rrbs_fill=args.rrbs_fill)
        pm.timestamp("### COMPLETE: epilog processing")
        """
		epilog_outfile = os.path.join(
				epilog_output_dir, args.sample_name + "_epilog.bed")
		epilog_summary_file = os.path.join(
				epilog_output_dir, args.sample_name + "_epilog_summary.bed")
	
		cmd = tools.epilog
		cmd += " call"
		cmd += " --infile=" + out_bsmap  # absolute path to the bsmap aligned bam
		cmd += " --positions=" + resources.methpositions
		cmd += " --outfile=" + epilog_outfile
		cmd += " --summary-filename=" + epilog_summary_file
		cmd += " --cores=" + str(pm.cores)
		cmd += " --qual-threshold=" + str(param.epilog.qual_threshold)
		cmd += " --read-length-threshold=" + str(param.epilog.read_length_threshold)
		cmd += " --rrbs-fill=" + str(args.rrbs_fill)
		cmd += " --use-strand"    # Strand mode required because this isn't a bismark alignment.
	
		pm.run(cmd, epilog_outfile, nofail=True)
		"""

    ################################################################################
    pm.timestamp("### Bismark alignment (spike-in): ")
    # currently using bowtie1 instead of bowtie2

    # get unaligned reads out of BSMAP bam
    bsmap_unalignable_bam = os.path.join(bsmap_folder,
                                         args.sample_name + "_unalignable.bam")
    pm.run(tools.samtools + " view -bh -f 4 -F 128 " + out_bsmap + " > " +
           bsmap_unalignable_bam,
           bsmap_unalignable_bam,
           shell=True)

    # Re-flag the unaligned paired-end reads to make them look like unpaired for Bismark
    if args.paired_end:
        bsmap_unalignable_bam_output = os.path.join(
            bsmap_folder, args.sample_name + "_unalignable_reflagged.bam")
        cmd = tools.python + " -u " + os.path.join(tools.scripts_dir,
                                                   "pe_flag_changer.py")
        cmd += " -i " + bsmap_unalignable_bam
        cmd += " -o " + bsmap_unalignable_bam_output
        pm.run(cmd, bsmap_unalignable_bam_output)
        pm.clean_add(bsmap_unalignable_bam, conditional=True)
        bsmap_unalignable_bam = bsmap_unalignable_bam_output
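        # [Illustration] the re-flagging that pe_flag_changer.py likely performs
        # (hypothetical sketch using pysam, not the shipped script): clear the
        # paired (0x1), first-in-pair (0x40) and second-in-pair (0x80) FLAG
        # bits so bismark treats each mate as an independent single-end read.
        def _reflag_pe_as_se(in_bam, out_bam):
            import pysam
            with pysam.AlignmentFile(in_bam, "rb") as src, \
                    pysam.AlignmentFile(out_bam, "wb", template=src) as dst:
                for read in src:
                    read.flag &= ~(0x1 | 0x40 | 0x80)
                    dst.write(read)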

    # convert BAM to fastq
    bsmap_fastq_unalignable_pre = os.path.join(
        bsmap_folder, args.sample_name + "_unalignable")
    bsmap_fastq_unalignable = bsmap_fastq_unalignable_pre + "_R1.fastq"
    cmd = ngstk.bam_to_fastq(bsmap_unalignable_bam,
                             bsmap_fastq_unalignable_pre, args.paired_end)
    pm.run(cmd, bsmap_fastq_unalignable)

    # actual spike-in analysis
    spikein_folder = os.path.join(param.pipeline_outfolder, "bismark_spikein")
    ngstk.make_sure_path_exists(spikein_folder)
    spikein_temp = os.path.join(spikein_folder, "bismark_temp")
    ngstk.make_sure_path_exists(spikein_temp)
    out_spikein_base = args.sample_name + ".spikein.aln"

    out_spikein = os.path.join(spikein_folder, out_spikein_base + ".bam")
    cmd = tools.bismark + " " + resources.bismark_spikein_genome + " "
    cmd += bsmap_fastq_unalignable_pre + "_R1.fastq"
    cmd += " --bam --unmapped"
    if os.path.isdir(tools.bowtie1):
        cmd += " --path_to_bowtie " + tools.bowtie1
    # If tools.bowtie1 is not a directory, assume bowtie is on the PATH,
    # in which case bismark doesn't need --path_to_bowtie.
    #	cmd += " --bowtie2"
    cmd += " --temp_dir " + spikein_temp
    cmd += " --output_dir " + spikein_folder
    cmd += " --basename=" + out_spikein_base
    #cmd += " -p 4"
    cmd += " -n 0"  #allow no mismatches

    pm.run(cmd, out_spikein, nofail=True)

    # Clean up the unmapped file which is copied from the parent
    # bismark folder to here:
    pm.clean_add(os.path.join(spikein_folder, "*.fastq"), conditional=True)
    pm.clean_add(os.path.join(spikein_folder, "*.fq"), conditional=True)
    pm.clean_add(out_spikein, conditional=True)
    pm.clean_add(spikein_temp)

    ################################################################################
    pm.timestamp("### PCR duplicate removal (spike-in): ")
    # Bismark's deduplication forces output naming, how annoying.
    #out_spikein_dedup = spikein_folder + args.sample_name + ".spikein.aln.deduplicated.bam"
    cmd, out_spikein_dedup = get_dedup_bismark_cmd(
        paired=args.paired_end,
        infile=out_spikein,
        prog=tools.deduplicate_bismark)
    out_spikein_sorted = re.sub(r'\.deduplicated\.bam$',
                                '.deduplicated.sorted.bam', out_spikein_dedup)
    cmd2 = tools.samtools + " sort " + out_spikein_dedup + " -o " + out_spikein_sorted
    cmd3 = tools.samtools + " index " + out_spikein_sorted
    pm.run([cmd, cmd2, cmd3], out_spikein_sorted + ".bai", nofail=True)
    pm.clean_add(out_spikein_dedup, conditional=False)

    # Spike-in methylation calling
    ################################################################################
    pm.timestamp("### Testxmz methylation calling (spike-in): ")
    spike_chroms = ngstk.get_chrs_from_bam(out_spikein_sorted)

    for chrom in spike_chroms:
        cmd1 = tools.python + " -u " + os.path.join(tools.scripts_dir,
                                                    "testxmz.py")
        cmd1 += " " + out_spikein_sorted + " " + chrom
        cmd1 += " >> " + pm.pipeline_stats_file
        pm.run(cmd1, lock_name="spikein", nofail=True)

    if epilog_prog_spec:
        # spike in conversion efficiency calculation with epilog
        ngstk.make_sure_path_exists(spikein_folder)
        pm.timestamp("### Epilog methylation calling (spike-in): ")
        spikein_epiconf = copy.deepcopy(param.epilog)
        spikein_epiconf.context = "C"
        spikein_epiconf.no_epi_stats = True  # Always skip stats for spike-in.
        try:
            run_main_epi_pipe(pm,
                              epiconf=spikein_epiconf,
                              prog_spec=epilog_prog_spec,
                              readsfile=out_spikein_sorted,
                              sitesfile=resources.spikein_methpositions,
                              outdir=spikein_folder,
                              rrbs_fill=args.rrbs_fill)
        except Exception as e:
            print("WARNING -- Could not run epilog -- {}".format(e))
    """
	epilog_spike_outfile=os.path.join(
			spikein_folder, args.sample_name + "_epilog.bed")
	epilog_spike_summary_file=os.path.join(
			spikein_folder, args.sample_name + "_epilog_summary.bed")
	
	cmd = tools.epilog
	cmd += " call"
	cmd += " --infile=" + out_spikein_sorted # absolute path to the bsmap aligned bam
	cmd += " --positions=" + resources.spikein_methpositions
	cmd += " --outfile=" + epilog_spike_outfile
	cmd += " --summary-filename=" + epilog_spike_summary_file
	cmd += " --cores=" + str(pm.cores)
	cmd += " --qual-threshold=30"    # quality_threshold
	cmd += " --read-length-threshold=30"    # read length cutoff
	cmd += " --rrbs-fill=0"
	
	pm.run(cmd, epilog_spike_outfile, nofail=True)
	
	# Now parse some results for pypiper result reporting.
	for chrom in spike_chroms:
		cmd = tools.python + " -u " + os.path.join(tools.scripts_dir, "tsv_parser.py")
		cmd += " -i " + os.path.join(spikein_folder, epilog_spike_summary_file)
		cmd += " -r context=C chr=" + chrom
	
		cmd_total = cmd + " -c " + "total"
		x = pm.checkprint(cmd_total, shell=True)
		pm.report_result(chrom+'_count_EL', x)
		cmd_rate = cmd + " -c " + "rate"
		x = pm.checkprint(cmd_rate, shell=True)
		pm.report_result(chrom+'_meth_EL', x)
	"""

    # PDR calculation:
    ################################################################################

    # PDR not applied to PE case because bisulfiteReadConcordanceAnalysis.py is single-end only
    if not args.paired_end and args.pdr:

        pm.timestamp("### PDR (Partial Disordered Methylation) analysis")

        pdr_output_dir = os.path.join(param.pipeline_outfolder,
                                      "pdr_" + args.genome_assembly)
        ngstk.make_sure_path_exists(pdr_output_dir)

        # convert aligned bam to sam

        pdr_in_samfile = os.path.join(
            pdr_output_dir, args.sample_name +
            ".aligned.sam")  # deleted afterwards; see the cleanup below
        pm.run(tools.samtools + " view " + out_bsmap + " > " + pdr_in_samfile,
               pdr_in_samfile,
               shell=True)

        # PDR calculation:
        #
        # output files:
        pdr_bedfile = os.path.join(pdr_output_dir,
                                   args.sample_name + ".pdr.bed")

        produce_sam = False  # TODO AS: make this an option somewhere
        concordsam = os.path.join(pdr_output_dir,
                                  args.sample_name + ".concordant.sam")
        discordsam = os.path.join(pdr_output_dir,
                                  args.sample_name + ".discordant.sam")

        # command:
        cmd1 = tools.python + " -u " + os.path.join(
            tools.scripts_dir, "bisulfiteReadConcordanceAnalysis.py")
        cmd1 += " --infile=" + pdr_in_samfile
        cmd1 += " --outfile=" + pdr_bedfile
        cmd1 += " --skipHeaderLines=0"
        cmd1 += " --genome=" + args.genome_assembly
        cmd1 += " --genomeDir=" + resources.genomes
        cmd1 += " --minNonCpgSites=3"  # These two parameters are not relevant for PDR analysis
        cmd1 += " --minConversionRate=0.9"

        if produce_sam:
            cmd1 += " --produce_sam"
            cmd1 += " --concordantOutfile=" + concordsam
            cmd1 += " --discordantOutfile=" + discordsam
            #TODO: perhaps convert them to bam *cough*

        # call:
        pm.run(cmd1, pdr_bedfile, nofail=True)

        # delete huge input SAM file
        pm.clean_add(os.path.join(pdr_output_dir, "*.sam"), conditional=True)
        pm.clean_add(pdr_output_dir, conditional=True)

        if os.path.isfile(os.path.join(tools.scripts_dir, "extractPDR.pl")):

            pm.timestamp("### PDR (Perl version by Kendell)")
            pdr_out = os.path.join(pdr_output_dir, args.sample_name + ".pdr")

            cmd = "perl " + os.path.join(tools.scripts_dir, "extractPDR.pl")
            cmd += " " + os.path.join(
                pdr_output_dir,
                args.sample_name) + " " + args.genome_assembly + ""
            cmd += " " + out_bsmap

            pm.run(cmd, target=pdr_out, nofail=True)

    # Final sorting and indexing
    ################################################################################
    # create sorted and indexed BAM files for visualization and analysis
    # bsmap already outputs a sorted and indexed bam file

    # Cleanup
    ################################################################################
    pm.stop_pipeline()
Example #8
def refgenie_build(rgc, args):
    """
    Runs the refgenie build recipe.
    
    :param refgenconf.RefGenConf rgc: genome configuration instance
    :param argparse.Namespace args: parsed command-line options/arguments
    """

    # Build specific args
    specific_args = {k: getattr(args, k) for k in BUILD_SPECIFIC_ARGS}

    if args.genome:
        genome = args.genome
    else:
        # This can probably be eliminated now that building is flexible
        genome = os.path.basename(args.input)
        # eliminate extensions to get the canonical genome name.
        for strike in [
                ".fasta.gz$", ".fa.gz$", ".fasta$", ".fa$", ".gz$", ".2bit$"
        ]:
            genome = re.sub(strike, "", genome)
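        # e.g. "hg38.fa.gz" -> "hg38"; the compound suffixes (".fasta.gz",
        # ".fa.gz") are listed before the bare ".gz" so the whole extension
        # is stripped in one pass.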

    _LOGGER.info("Using genome name: {}".format(genome))

    if not hasattr(args, "outfolder") or not args.outfolder:
        # Default to genome_folder
        _LOGGER.debug("No outfolder provided, using genome config.")
        args.outfolder = rgc.genome_folder

    outfolder = os.path.abspath(os.path.join(args.outfolder, genome))
    if not _writeable(outfolder):
        _LOGGER.error(
            "Insufficient permissions to write to output folder: {}".format(
                outfolder))
        return

    _LOGGER.info("Output to: {} {} {}".format(genome, args.outfolder,
                                              outfolder))
    _LOGGER.debug("Default config file: {}".format(default_config_file()))

    if args.config_file and not os.path.isfile(args.config_file):
        _LOGGER.debug("Config file path isn't a file: {}".format(
            args.config_file))
        args.config_file = default_config_file()

    def path_data(root, c):
        return {"path": os.path.relpath(root, c.genome_folder)}

    def build_asset(genome, asset_key, asset_build_package, outfolder,
                    specific_args):
        """
        Builds assets with pypiper and updates a genome config file.

        This function actually runs the build commands in a given build package,
        and then updates the refgenie config file.

        :param str genome: The assembly key; e.g. 'mm10'.
        :param str asset_key: The unique asset identifier; e.g. 'bowtie2_index'
        :param dict asset_build_package: A dict (see examples) specifying lists
            of required inputs, commands to run, and outputs to register as
            assets.
        """
        _LOGGER.debug("Asset build package: " + str(asset_build_package))
        asset_vars = get_asset_vars(genome, asset_key, outfolder,
                                    specific_args)
        asset_outfolder = os.path.join(outfolder, asset_key)

        tk.make_dir(asset_outfolder)
        target = os.path.join(asset_outfolder, "build_complete.flag")
        # populate the command templates once; the populated list is logged
        # below after the touch-target command is appended
        command_list_populated = [
            x.format(**asset_vars) for x in asset_build_package["command_list"]
        ]

        touch_target = "touch {target}".format(target=target)
        command_list_populated.append(touch_target)

        _LOGGER.debug("Command list populated: " + str(command_list_populated))

        pm.run(command_list_populated, target, container=pm.container)
        # Add index information to rgc
        # (loop variable renamed so it doesn't shadow the asset_key parameter)
        for built_asset_key, relative_path in asset_build_package["assets"].items():
            rgc.update_assets(genome, built_asset_key,
                              {"path": relative_path.format(**asset_vars)})

        # Write the updated refgenie genome configuration
        rgc.write()

    pm = pypiper.PipelineManager(name="refgenie",
                                 outfolder=outfolder,
                                 args=args)
    tk = pypiper.NGSTk(pm=pm)

    if args.docker:
        # Set up some docker stuff
        if args.volumes:
            # args.volumes is assumed to be a list here; list.append returns
            # None, so build a new list instead of assigning its return value
            volumes = args.volumes + [outfolder]
        else:
            volumes = outfolder

    for asset_key in args.asset:
        if asset_key in asset_build_packages:
            asset_build_package = asset_build_packages[asset_key]
            _LOGGER.debug(specific_args)
            required_inputs = ", ".join(asset_build_package["required_inputs"])
            _LOGGER.info("Inputs required to build '{}': {}".format(
                asset_key, required_inputs))
            for required_input in asset_build_package["required_inputs"]:
                if not specific_args[required_input]:
                    raise ValueError(
                        "Argument '{}' is required to build asset '{}', but not provided"
                        .format(required_input, asset_key))

            for required_asset in asset_build_package["required_assets"]:
                try:
                    if not rgc.get_asset(args.genome, required_asset):
                        raise ValueError(
                            "Asset '{}' is required to build asset '{}', but not provided"
                            .format(required_asset, asset_key))
                except refgenconf.exceptions.MissingGenomeError:
                    raise ValueError(
                        "Asset '{}' is required to build asset '{}', but not provided"
                        .format(required_asset, asset_key))
            if args.docker:
                pm.get_container(asset_build_package["container"], volumes)
            build_asset(args.genome, asset_key, asset_build_package, outfolder,
                        specific_args)
            _LOGGER.info("Finished building asset '{}'".format(asset_key))
        else:
            _LOGGER.warning(
                "Recipe does not exist for asset '{}'".format(asset_key))

    pm.stop_pipeline()
Example #9
    def _build_asset(
        genome,
        asset_key,
        tag,
        build_pkg,
        genome_outfolder,
        specific_args,
        specific_params,
        alias,
        **kwargs,
    ):
        """
        Builds assets with pypiper and updates a genome config file.

        This function actually runs the build commands in a given build package,
        and then updates the refgenie config file.

        :param str genome: The assembly key; e.g. 'mm10'.
        :param str asset_key: The unique asset identifier; e.g. 'bowtie2_index'
        :param dict build_pkg: A dict (see examples) specifying lists
            of required input_assets, commands to run, and outputs to register as
            assets.
        """
        if args.map:
            # Performing a build "map" step. A "reduce" step will need to be
            # performed later to merge the built asset metadata into the
            # master config file.
            genome_alias = rgc.get_genome_alias(digest=genome)
            # create an empty config file in the genome directory
            _LOGGER.info(f"Using new map genome config: {locked_map_gencfg}")
            make_sure_path_exists(os.path.dirname(locked_map_gencfg))
            open(locked_map_gencfg, "a").close()
            # initialize a new RefGenConf.
            # Use the master location for data storage,
            # but point the config file path at the asset-dir location
            rgc_map = RefGenConf(
                entries={"genome_folder": rgc.genome_folder},
                filepath=locked_map_gencfg,
            )
            # set the alias first (if available), based on the master file

            rgc_map.set_genome_alias(
                digest=genome,
                genome=genome_alias,
                create_genome=True,
            )

            # copy the genome of interest section to the new RefGenConf,
            # so that possible dependencies can be satisfied
            rgc_map.update_genomes(
                genome=genome_alias,
                data=rgc[CFG_GENOMES_KEY][genome],
            )

        else:
            rgc_map = rgc

        _LOGGER.info(
            f"Saving outputs to:{block_iter_repr(['content: ' + genome_outfolder, 'logs: ' + build_stats_dir])}"
        )
        if args.docker:
            # Set up some docker stuff
            if args.volumes:
                # args.volumes is assumed to be a list here; list.append
                # returns None, so build a new list instead of assigning it
                volumes = args.volumes + [genome_outfolder]
            else:
                volumes = genome_outfolder

        if not _writeable(genome_outfolder):
            _LOGGER.error(
                f"Insufficient permissions to write to output folder: {genome_outfolder}"
            )
            return False, rgc_map

        pm = pypiper.PipelineManager(name=PKG_NAME,
                                     outfolder=build_stats_dir,
                                     args=args)
        tk = pypiper.NGSTk(pm=pm)
        if args.docker:
            pm.get_container(build_pkg[CONT], volumes)
        _LOGGER.debug("Asset build package: " + str(build_pkg))
        # create a bundle list to simplify calls below
        gat = [genome, asset_key, tag]
        # collect variables required to populate the command templates
        asset_vars = get_asset_vars(
            genome,
            asset_key,
            tag,
            genome_outfolder,
            specific_args,
            specific_params,
            **kwargs,
        )
        # populate command templates
        # prior to populating, remove any seek_key parts from the keys, since these are not supported by format method
        command_list_populated = [
            x.format(**{k.split(".")[0]: v
                        for k, v in asset_vars.items()})
            for x in build_pkg[CMD_LST]
        ]
        # create output directory
        tk.make_dir(asset_vars["asset_outfolder"])

        target = os.path.join(build_stats_dir,
                              TEMPLATE_TARGET.format(genome, asset_key, tag))
        # add target command
        command_list_populated.append("touch {target}".format(target=target))
        _LOGGER.debug("Command populated: '{}'".format(
            " ".join(command_list_populated)))
        try:
            # run build command
            signal.signal(signal.SIGINT, _handle_sigint(gat))
            pm.run(command_list_populated, target, container=pm.container)
        except pypiper.exceptions.SubprocessError:
            _LOGGER.error("asset '{}' build failed".format(asset_key))
            return False, rgc_map
        else:
            # save build recipe to the JSON-formatted file
            recipe_file_name = TEMPLATE_RECIPE_JSON.format(asset_key, tag)
            with open(os.path.join(build_stats_dir, recipe_file_name),
                      "w") as outfile:
                json.dump(build_pkg, outfile)
            # since the assets are always built to a standard dir structure, we
            # can just stitch a path together for asset digest calculation
            asset_dir = os.path.join(rgc_map.data_dir, *gat)
            if not os.path.exists(asset_dir):
                raise OSError("Could not compute asset digest. Path does not "
                              "exist: {}".format(asset_dir))
            digest = get_dir_digest(asset_dir)
            _LOGGER.info(f"Asset digest: {digest}")
            # add a 'dir' seek_key that points to the asset directory
            build_pkg[ASSETS].update({"dir": "."})
            # add updates to config file
            with rgc_map as r:
                if asset_key == "fasta":
                    r.update_genomes(genome,
                                     data={CFG_ALIASES_KEY: [alias]},
                                     force_digest=genome)
                r.update_assets(
                    *gat[0:2],
                    data={CFG_ASSET_DESC_KEY: build_pkg[DESC]},
                    force_digest=genome,
                )
                r.update_tags(
                    *gat,
                    force_digest=genome,
                    data={
                        CFG_ASSET_PATH_KEY: asset_key,
                        CFG_ASSET_CHECKSUM_KEY: digest,
                    },
                )
                r.update_seek_keys(
                    *gat,
                    force_digest=genome,
                    keys={
                        k: v.format(**asset_vars)
                        for k, v in build_pkg[ASSETS].items()
                    },
                )
                r.set_default_pointer(*gat, force_digest=genome)
        pm.stop_pipeline()
        return True, rgc_map