Пример #1
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    assert data["analysis"].lower().startswith("wgbs-seq"), "No comparible alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))

    if not ref_file:
        logger.error("bismark index not found. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        data = dd.set_work_bam(data, final_out)
        data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
        data = dd.update_summary_qc(data, "bismark", base=data["bam_report"])
        return data

    bismark = config_utils.get_program("bismark", config)
    # bismark uses 5 threads/sample and ~12GB RAM/sample (hg38)
    resources = config_utils.get_resources("bismark", data["config"])
    max_cores = dd.get_num_cores(data)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) / (1024.0 * 1024.0)
    instances = calculate_bismark_instances(max_cores, max_mem * max_cores)
    # override instances if specified in the config
    if resources and resources.get("bismark_threads"):
        instances = resources.get("bismark_threads")
        logger.info(f"Using {instances} bismark instances - overriden by resources")
    bowtie_threads = 1
    if resources and resources.get("bowtie_threads"):
        bowtie_threads = resources.get("bowtie_threads")
    logger.info(f"Using {bowtie_threads} bowtie threads per bismark instance")
    kit = kits.KITS.get(dd.get_kit(data), None)
    directional = "--non_directional" if kit and not kit.is_directional else ""

    other_opts = resources.get("options", [])
    other_opts = " ".join([str(x) for x in other_opts]).strip()

    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    safe_makedir(align_dir)
    cmd = "{bismark} {other_opts} {directional} --bowtie2 --temp_dir {tx_out_dir} --gzip --parallel {instances} -p {bowtie_threads} -o {tx_out_dir} --unmapped {ref_file} {fastq_file} "
    if pair_file:
        fastq_file = "-1 %s -2 %s" % (fastq_file, pair_file)
    raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    if not raw_bam:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (fastq_file, ref_file)
            do.run(cmd.format(**locals()), run_message, None)
            shutil.move(tx_out_dir, out_dir)
        raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    # don't process bam in the bismark pipeline!
    utils.symlink_plus(raw_bam[0], final_out)
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
    data = dd.update_summary_qc(data, "bismark", base=data["bam_report"])
    return data
Пример #2
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    assert data["analysis"].lower().startswith(
        "wgbs-seq"), "No comparible alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))

    if not ref_file:
        logger.error(
            "bismark index not found. We don't provide the STAR indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        data = dd.set_work_bam(data, final_out)
        data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
        return data

    bismark = config_utils.get_program("bismark", config)

    # bismark uses 5 threads/sample and ~12GB RAM/sample (hg38)
    resources = config_utils.get_resources("bismark", data["config"])
    max_cores = resources.get("cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G"))
    n = min(max(int(max_cores / 5), 1),
            max(int(max_mem / config_utils.convert_to_bytes("12G")), 1))

    kit = kits.KITS.get(dd.get_kit(data), None)
    directional = "--non_directional" if kit and not kit.is_directional else ""

    other_opts = resources.get("options", [])
    other_opts = " ".join([str(x) for x in other_opts]).strip()

    fastq_files = " ".join([fastq_file, pair_file
                            ]) if pair_file else fastq_file
    safe_makedir(align_dir)
    cmd = "{bismark} {other_opts} {directional} --bowtie2 --temp_dir {tx_out_dir} --gzip --multicore {n} -o {tx_out_dir} --unmapped {ref_file} {fastq_file}"
    if pair_file:
        fastq_file = "-1 %s -2 %s" % (fastq_file, pair_file)
    raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    if not raw_bam:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (fastq_file,
                                                                    ref_file)
            do.run(cmd.format(**locals()), run_message, None)
            shutil.move(tx_out_dir, out_dir)
        raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    process_bam = _process_bam(raw_bam[0], fastq_files, sample,
                               dd.get_sam_ref(data), config)
    utils.symlink_plus(process_bam, final_out)
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
    return data
Пример #3
0
def trim(data):
    """Remove adapter for bisulphite conversion sequencing data"""
    in_files = data["files"]
    names = dd.get_sample_name(data)
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed", names)
    out_dir = utils.safe_makedir(work_dir)
    out_files = [
        os.path.join(out_dir,
                     utils.splitext_plus(os.path.basename(in_files[0]))[0]
                     + '_val_1.fq.gz'),
        os.path.join(out_dir,
                     utils.splitext_plus(os.path.basename(in_files[1]))[0]
                     + '_val_2.fq.gz')
    ]

    if utils.file_exists(out_files[0]):
        data["files"] = out_files
        return [[data]]

    kit = kits.KITS.get(dd.get_kit(data), None)
    if kit:
        logger.info(f"{kit.name} specified, using clip settings: R1 5'-{kit.clip_r1_5}nt/--/{kit.clip_r1_3}nt-3', R2 5'-{kit.clip_r2_5}nt/--/{kit.clip_r2_3}nt-3'")
        clipsettings = _get_clip_settings(kit)
    else:
        logger.info(f"No kit specified, using default clip settings")
        clipsettings = ""

    trim_galore = config_utils.get_program("trim_galore", data["config"])
    # trim_galore actual cores used = 3x + 3 where x = value of the parameter (according to manual)
    tg_cores = max(int((dd.get_num_cores(data) - 3) / 3), 1)
    other_opts = config_utils.get_resources("trim_galore", data["config"]).get("options", [])
    other_opts = " ".join([str(x) for x in other_opts]).strip()

    cmd = "{trim_galore} {other_opts} {clipsettings} --cores {tg_cores} --length 30 --quality 30 --fastqc --paired -o {tx_out_dir} {files}"
    log_file = os.path.join(out_dir, names + "_cutadapt_log.txt")

    if not utils.file_exists(out_files[0]):
        with file_transaction(out_dir) as tx_out_dir:
            files = "%s %s" % (in_files[0], in_files[1])
            do.run(cmd.format(**locals()), "remove adapters with trimgalore")

    data["files"] = out_files
    return [[data]]