Exemplo n.º 1
0
def _set_quality_flag(options, data):
    """Add the aligner quality-encoding flag matching the configured format.

    Mutates and returns ``options``: phred+64 Illumina input gets
    ``solexa1.3-quals``; old Solexa scoring gets ``solexa-quals``.
    """
    fmt = dd.get_quality_format(data).lower()
    if fmt == "illumina":
        options["solexa1.3-quals"] = True
    elif fmt == "solexa":
        options["solexa-quals"] = True
    return options
Exemplo n.º 2
0
def get_fastq_files(data):
    """Retrieve fastq files for the given lane, ready to process.

    Handles BAM inputs (converted to fastq when the pipeline needs it),
    remote object-store inputs (passed through), and quality-format
    grooming, then gzips the results unless bowtie is the aligner.
    """
    assert "files" in data, "Did not find `files` in input; nothing to process"
    ready_files = []
    should_gzip = True

    # Bowtie does not accept gzipped fastq
    if 'bowtie' in data['reference'].keys():
        should_gzip = False
    for fname in data["files"]:
        if fname.endswith(".bam"):
            if _pipeline_needs_fastq(data["config"], data):
                # NOTE: assignment (not append) -- a BAM input replaces any
                # files accumulated so far in this loop.
                ready_files = convert_bam_to_fastq(fname, data["dirs"]["work"],
                                                   data, data["dirs"], data["config"])
            else:
                ready_files = [fname]
        elif objectstore.is_remote(fname):
            # Remote inputs are passed through untouched.
            ready_files.append(fname)
        # Trimming does quality conversion, so if not doing that, do an explicit conversion
        elif not(dd.get_trim_reads(data)) and dd.get_quality_format(data) != "standard":
            out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq_convert"))
            ready_files.append(fastq.groom(fname, data, out_dir=out_dir))
        else:
            ready_files.append(fname)
    # Drop entries from conversion steps that produced nothing.
    ready_files = [x for x in ready_files if x is not None]
    if should_gzip:
        out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq"))
        ready_files = [_gzip_fastq(x, out_dir) for x in ready_files]
    # Sanity check: every local output file must exist on disk.
    for in_file in ready_files:
        if not objectstore.is_remote(in_file):
            assert os.path.exists(in_file), "%s does not exist." % in_file
    return ready_files
Exemplo n.º 3
0
def get_fastq_files(data):
    """Retrieve fastq files for the given lane, ready to process.
    """
    assert "files" in data, "Did not find `files` in input; nothing to process"
    # Bowtie does not accept gzipped fastq
    should_gzip = 'bowtie' not in data['reference'].keys()
    ready_files = []
    for input_file in data["files"]:
        if input_file.endswith(".bam"):
            # A BAM input replaces anything accumulated so far.
            if _pipeline_needs_fastq(data["config"], data):
                ready_files = convert_bam_to_fastq(input_file, data["dirs"]["work"],
                                                   data, data["dirs"], data["config"])
            else:
                ready_files = [input_file]
        elif objectstore.is_remote(input_file):
            ready_files.append(input_file)
        # Trimming does quality conversion, so if not doing that, do an explicit conversion
        elif not dd.get_trim_reads(data) and dd.get_quality_format(data) != "standard":
            convert_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq_convert"))
            ready_files.append(fastq.groom(input_file, data, out_dir=convert_dir))
        else:
            ready_files.append(input_file)
    ready_files = [f for f in ready_files if f is not None]
    if should_gzip:
        gzip_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "fastq"))
        ready_files = [_gzip_fastq(f, gzip_dir) for f in ready_files]
    for ready_file in ready_files:
        if not objectstore.is_remote(ready_file):
            assert os.path.exists(ready_file), "%s does not exist." % ready_file
    return ready_files
Exemplo n.º 4
0
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bwa mem with settings for aligning to the transcriptome for eXpress/RSEM/etc

    Records the resulting transcriptome BAM on the returned data dict via
    dd.set_transcriptome_bam; short-circuits if the output already exists.
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        # Bug fix: the original call passed no arguments for the two %s
        # placeholders, so the literal format string was logged.
        logger.info("bwa mem does not support the phred+64 quality format, "
                    "converting %s and %s to phred+33." % (fastq_file, pair_file))
        fastq_file = fastq.groom(fastq_file, data, in_qual="fastq-illumina")
        if pair_file:
            pair_file = fastq.groom(pair_file, data, in_qual="fastq-illumina")
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    samtools = config_utils.get_program("samtools", data["config"])
    # -a reports all alignments, needed for downstream transcript quantifiers.
    cmd = ("{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
           "{pair_file} | {samtools} view -bhS - > {tx_out_file}")

    with file_transaction(data, out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file,
                                                                pair_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
Exemplo n.º 5
0
def to_sdf(files, data):
    """Convert a fastq or BAM input into a SDF indexed file.
    """
    bam_input = len(files) == 1 and files[0].endswith(".bam")
    if bam_input:
        # BAM: quality encoding comes from the alignment records.
        qual = []
        format = ["-f", "sam-pe" if bam.is_paired(files[0]) else "sam-se"]
        inputs = [files[0]]
    else:
        # fastq: pass the quality encoding explicitly.
        encoding = "illumina" if dd.get_quality_format(data).lower() == "illumina" else "sanger"
        qual = ["-q", encoding]
        format = ["-f", "fastq"]
        if len(files) == 2:
            inputs = ["-l", files[0], "-r", files[1]]
        else:
            assert len(files) == 1
            inputs = [files[0]]
    work_dir = utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "align_prep"))
    base_name = utils.splitext_plus(os.path.basename(os.path.commonprefix(files)))[0]
    out_file = os.path.join(work_dir, "%s.sdf" % base_name)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            rtg_args = ["rtg", "format", "-o", tx_out_file] + format + qual + inputs
            do.run(_rtg_cmd(rtg_args), "Format inputs to indexed SDF")
    return out_file
Exemplo n.º 6
0
def gatk_splitreads(data):
    """
    use GATK to split reads with Ns in the CIGAR string, hard clipping regions
    that end up in introns

    Records the split BAM on the returned data dict via dd.set_split_bam;
    short-circuits if the output already exists.
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    deduped_bam = dd.get_deduped_bam(data)
    base, ext = os.path.splitext(deduped_bam)
    split_bam = base + ".splitN" + ext
    # Consistency fix: every other quality-format check in this codebase
    # compares case-insensitively (.lower()); the original exact match
    # would miss e.g. "Illumina".
    if dd.get_quality_format(data).lower() == "illumina":
        # phred+64 input needs its quality scores rescaled by GATK.
        quality_flag = ["--fix_misencoded_quality_scores", "-fixMisencodedQuals"]
    else:
        quality_flag = []
    if file_exists(split_bam):
        data = dd.set_split_bam(data, split_bam)
        return data
    with file_transaction(split_bam) as tx_split_bam:
        params = ["-T", "SplitNCigarReads",
                  "-R", ref_file,
                  "-I", deduped_bam,
                  "-o", tx_split_bam,
                  "-rf", "ReassignOneMappingQuality",
                  "-RMQF", "255",
                  "-RMQT", "60",
                  "-rf", "UnmappedRead",
                  "-U", "ALLOW_N_CIGAR_READS"] + quality_flag
        broad_runner.run_gatk(params)
    bam.index(split_bam, dd.get_config(data))
    data = dd.set_split_bam(data, split_bam)
    return data
Exemplo n.º 7
0
def _set_quality_flag(options, data):
    """Set the option flag matching the configured quality encoding.

    Mutates and returns ``options``.
    """
    flag_by_format = {"illumina": "solexa1.3-quals",
                      "solexa": "solexa-quals"}
    flag = flag_by_format.get(dd.get_quality_format(data).lower())
    if flag:
        options[flag] = True
    return options
Exemplo n.º 8
0
def _fastp_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with fastp (https://github.com/OpenGene/fastp)

    Returns (trimmed fastq files, JSON report file).
    """
    report_file = os.path.join(out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report = tx_out[0]
            tx_out_files = tx_out[1:]
            # Stringify the core count for consistency with the other
            # numeric arguments in this command.
            cmd = ["fastp", "--thread", str(dd.get_num_cores(data))]
            if dd.get_quality_format(data).lower() == "illumina":
                cmd += ["--phred64"]
            # First file uses -i/-o; the paired-end mate uses -I/-O.
            for i, (inf, outf) in enumerate(zip(fastq_files, tx_out_files)):
                if i == 0:
                    cmd += ["-i", inf, "-o", outf]
                else:
                    cmd += ["-I", inf, "-O", outf]
            cmd += ["--trim_poly_g", "--cut_by_quality3", "--cut_mean_quality", "5", "--disable_quality_filtering",
                    "--length_required", str(dd.get_min_read_length(data))]
            for a in adapters:
                cmd += ["--adapter_sequence", a]
            if not adapters:
                cmd += ["--disable_adapter_trimming"]
            # Bug fix: write the JSON report into the transactional path
            # (tx_report) so file_transaction can move it into place; the
            # original wrote directly to the final report_file and left
            # tx_report unused, bypassing the transaction.
            cmd += ["--json", tx_report, "--report_title", dd.get_sample_name(data)]
            do.run(cmd, "Trimming with fastp: %s" % dd.get_sample_name(data))
    return out_files, report_file
Exemplo n.º 9
0
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

        :param bam_file: alignments in bam format
        :param data: dict with all configuration information

        :returns: dict with output of different tools
    """
    from bcbio.qc import (atropos, coverage, damage, fastqc, kraken, qsignature, qualimap,
                          samtools, picard, srna, umi, variant, viral, preseq)
    # Map configured QC program names to their runner functions.
    tools = {"fastqc": fastqc.run,
             "atropos": atropos.run,
             "small-rna": srna.run,
             "samtools": samtools.run,
             "qualimap": qualimap.run,
             "qualimap_rnaseq": qualimap.run_rnaseq,
             "qsignature": qsignature.run,
             "coverage": coverage.run,
             "damage": damage.run,
             "variants": variant.run,
             "peddy": peddy.run_qc,
             "kraken": kraken.run,
             "picard": picard.run,
             "umi": umi.run,
             "viral": viral.run,
             "preseq": preseq.run,
             }
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    # Start from any QC output already recorded for this sample.
    qc_out = utils.deepish_copy(dd.get_summary_qc(data))
    for program_name in dd.get_algorithm_qc(data):
        if not bam_file and program_name != "kraken":  # kraken doesn't need bam
            continue
        # Germline samples only run the variants QC step.
        if dd.get_phenotype(data) == "germline" and program_name != "variants":
            continue
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            # Check for metrics output, two cases:
            # 1. output with {"metrics"} and files ("base")
            if "metrics" in out:
                metrics.update(out.pop("metrics"))
            # 2. a dictionary of metrics
            elif "base" not in out:
                metrics.update(out)
            # Check for files only output
            if "base" in out:
                qc_files = out
        # NOTE(review): `basestring` is Python 2 only -- confirm this module
        # still targets Python 2 or defines a compatibility alias elsewhere.
        elif out and isinstance(out, basestring) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            # Fall back to collecting whatever files the tool left behind.
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files

    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}
Exemplo n.º 10
0
def _filter_bad_reads(in_bam, ref_file, data):
    """Use GATK filter to remove problem reads which choke GATK and Picard.
    """
    bam.index(in_bam, data["config"])
    out_file = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, out_file) as tx_out_file:
                params = ["-T", "PrintReads",
                          "-R", ref_file,
                          "-I", in_bam,
                          "--out", tx_out_file,
                          "--filter_mismatching_base_and_quals",
                          "--filter_bases_not_stored",
                          "--filter_reads_with_N_cigar"]
                if dd.get_quality_format(data, "").lower() == "illumina":
                    # phred+64 input needs its quality scores rescaled.
                    params.append("--fix_misencoded_quality_scores")
                jvm_opts = broad.get_gatk_framework_opts(data["config"], tmp_dir)
                gatk_cmd = [config_utils.get_program("gatk-framework", data["config"])]
                do.run(gatk_cmd + jvm_opts + params, "Filter problem reads")
    bam.index(out_file, data["config"])
    return out_file
Exemplo n.º 11
0
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).
    """
    in_file = data["in_file"]
    # Illumina (phred+64) input always needs grooming to standard quality.
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    if in_file.endswith(".gz") and not objectstore.is_remote(in_file):
        if needs_convert:
            # Must decompress to convert qualities, then re-bgzip.
            needs_bgzip, needs_gunzip = True, True
        else:
            needs_bgzip, needs_gunzip = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        needs_bgzip, needs_gunzip = True, True
    elif objectstore.is_remote(in_file) and not tz.get_in(
        ["config", "algorithm", "align_split_size"], data):
        # Remote input with no splitting requested: leave untouched.
        needs_bgzip, needs_gunzip = False, False
    else:
        needs_bgzip, needs_gunzip = True, False
    work_dir = utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "align_prep"))
    if needs_bgzip or needs_gunzip or needs_convert or objectstore.is_remote(
            in_file):
        out_file = _bgzip_file(in_file, data["config"], work_dir, needs_bgzip,
                               needs_gunzip, needs_convert)
    else:
        # Already a usable local bgzip: symlink into the work directory.
        out_file = os.path.join(
            work_dir,
            "%s_%s" % (dd.get_sample_name(data), os.path.basename(in_file)))
        utils.symlink_plus(in_file, out_file)
    return out_file
Exemplo n.º 12
0
def _fastp_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with fastp (https://github.com/OpenGene/fastp)

    Returns (trimmed fastq files, JSON report file).
    """
    report_file = os.path.join(out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            # NOTE(review): tx_report is never used; the --json flag below
            # writes directly to the final report_file, bypassing the
            # transaction -- confirm whether it should use tx_report.
            tx_report = tx_out[0]
            tx_out_files = tx_out[1:]
            cmd = ["fastp", "--thread", dd.get_num_cores(data)]
            if dd.get_quality_format(data).lower() == "illumina":
                cmd += ["--phred64"]
            # First file uses -i/-o; the paired-end mate uses -I/-O.
            for i, (inf, outf) in enumerate(zip(fastq_files, tx_out_files)):
                if i == 0:
                    cmd += ["-i", inf, "-o", outf]
                else:
                    cmd += ["-I", inf, "-O", outf]
            cmd += ["--cut_by_quality3", "--cut_mean_quality", "5",
                    "--length_required", str(dd.get_min_read_length(data)),
                    "--disable_quality_filtering"]
            if "polyx" in dd.get_adapters(data):
                cmd += ["--trim_poly_x", "--poly_x_min_len", "8"]
            if "polyx" in dd.get_adapters(data) or "polyg" in dd.get_adapters(data):
                cmd += ["--trim_poly_g", "--poly_g_min_len", "8"]
            for a in adapters:
                cmd += ["--adapter_sequence", a]
            if not adapters:
                cmd += ["--disable_adapter_trimming"]
            cmd += ["--json", report_file, "--report_title", dd.get_sample_name(data)]
            do.run(cmd, "Trimming with fastp: %s" % dd.get_sample_name(data))
    return out_files, report_file
Exemplo n.º 13
0
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bwa mem with settings for aligning to the transcriptome for eXpress/RSEM/etc

    Records the resulting transcriptome BAM on the returned data dict via
    dd.set_transcriptome_bam; short-circuits if the output already exists.
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        # Bug fix: supply the arguments for the two %s placeholders; the
        # original logged the literal format string.
        logger.info("bwa mem does not support the phred+64 quality format, "
                    "converting %s and %s to phred+33." % (fastq_file, pair_file))
        fastq_file = fastq.groom(fastq_file, in_qual="fastq-illumina", data=data)
        if pair_file:
            pair_file = fastq.groom(pair_file, in_qual="fastq-illumina", data=data)
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    # -a reports all alignments, needed for downstream transcript quantifiers.
    cmd = (
        "{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
        "{pair_file} | samtools view -bhS - > {tx_out_file}"
    )

    with file_transaction(out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
Exemplo n.º 14
0
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).
    """
    in_file = data["in_file"]
    # Illumina (phred+64) input always needs grooming to standard quality.
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    # special case, empty files that have been cleaned
    if not objectstore.is_remote(in_file) and os.path.getsize(in_file) == 0:
        needs_bgzip, needs_gunzip = False, False
    elif in_file.endswith(".gz") and not objectstore.is_remote(in_file):
        if needs_convert or dd.get_trim_ends(data):
            # Must decompress to convert/trim, then re-bgzip.
            needs_bgzip, needs_gunzip = True, True
        else:
            needs_bgzip, needs_gunzip = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        needs_bgzip, needs_gunzip = True, True
    elif objectstore.is_remote(in_file) and not tz.get_in(["config", "algorithm", "align_split_size"], data):
        # Remote input with no splitting requested: leave untouched.
        needs_bgzip, needs_gunzip = False, False
    else:
        needs_bgzip, needs_gunzip = True, False
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    if needs_bgzip or needs_gunzip or needs_convert or dd.get_trim_ends(data) or objectstore.is_remote(in_file):
        out_file = _bgzip_file(in_file, data["config"], work_dir,
                               needs_bgzip, needs_gunzip, needs_convert, data)
    else:
        out_file = os.path.join(work_dir, "%s_%s" % (dd.get_sample_name(data), os.path.basename(in_file)))
        # We cannot symlink in CWL, but may be able to use inputs or copy
        if data.get("is_cwl"):
            # Has grabix indexes, we're okay to go
            if utils.file_exists(in_file + ".gbi"):
                return in_file
            else:
                return utils.copy_plus(in_file, out_file)
        else:
            utils.symlink_plus(in_file, out_file)
    return out_file
Exemplo n.º 15
0
def to_sdf(files, data):
    """Convert a fastq or BAM input into a SDF indexed file.
    """
    is_bam = len(files) == 1 and files[0].endswith(".bam")
    if is_bam:
        # BAM: quality encoding comes from the alignment records.
        qual = []
        format = ["-f", "sam-pe" if bam.is_paired(files[0]) else "sam-se"]
        inputs = [files[0]]
    else:
        # fastq: pass the quality encoding explicitly.
        quality_name = "illumina" if dd.get_quality_format(data).lower() == "illumina" else "sanger"
        qual = ["-q", quality_name]
        format = ["-f", "fastq"]
        if len(files) == 2:
            inputs = ["-l", files[0], "-r", files[1]]
        else:
            assert len(files) == 1
            inputs = [files[0]]
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    prefix = utils.splitext_plus(os.path.basename(os.path.commonprefix(files)))[0]
    out_file = os.path.join(work_dir, "%s.sdf" % prefix)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            rtg_args = ["rtg", "format", "-o", tx_out_file] + format + qual + inputs
            do.run(_rtg_cmd(rtg_args), "Format inputs to indexed SDF")
    return out_file
Exemplo n.º 16
0
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).
    """
    in_file = data["in_file"]
    # Inputs may arrive as a list/tuple; checks below use the first file,
    # but _bgzip_file receives the full original input.
    if isinstance(in_file, (list, tuple)):
        in_file = in_file[0]
    # Illumina (phred+64) input always needs grooming to standard quality.
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    # special case, empty files that have been cleaned
    if not objectstore.is_remote(in_file) and os.path.getsize(in_file) == 0:
        needs_bgzip, needs_gunzip = False, False
    elif in_file.endswith(".gz") and not objectstore.is_remote(in_file):
        if needs_convert or dd.get_trim_ends(data):
            # Must decompress to convert/trim, then re-bgzip.
            needs_bgzip, needs_gunzip = True, True
        else:
            needs_bgzip, needs_gunzip = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        needs_bgzip, needs_gunzip = True, True
    elif objectstore.is_remote(in_file) and not tz.get_in(["config", "algorithm", "align_split_size"], data):
        # Remote input with no splitting requested: leave untouched.
        needs_bgzip, needs_gunzip = False, False
    else:
        needs_bgzip, needs_gunzip = True, False
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    # Multiple input files are always merged through _bgzip_file.
    if (needs_bgzip or needs_gunzip or needs_convert or dd.get_trim_ends(data) or
          objectstore.is_remote(in_file) or
          (isinstance(data["in_file"], (tuple, list)) and len(data["in_file"]) > 1)):
        out_file = _bgzip_file(data["in_file"], data["config"], work_dir,
                               needs_bgzip, needs_gunzip, needs_convert, data)
    else:
        out_file = os.path.join(work_dir, "%s_%s" % (dd.get_sample_name(data), os.path.basename(in_file)))
        out_file = _symlink_or_copy_grabix(in_file, out_file, data)
    return out_file
Exemplo n.º 17
0
def _get_quality_flag(data):
    """Return the command-line flag matching the configured quality encoding.

    Defaults to "--phred33" for standard/sanger input.
    """
    flags = {"illumina": "--phred64", "solexa": "--solexa-quals"}
    return flags.get(dd.get_quality_format(data).lower(), "--phred33")
Exemplo n.º 18
0
def _get_quality_flag(data):
    """Return the command-line flag matching the configured quality encoding.

    Defaults to "--phred33" for standard/sanger input.
    """
    fmt = dd.get_quality_format(data).lower()
    if fmt == "solexa":
        return "--solexa-quals"
    return "--phred64" if fmt == "illumina" else "--phred33"
Exemplo n.º 19
0
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    Returns (trimmed fastq files, JSON report file).
    """
    report_file = os.path.join(
        out_dir, "%s-report.json" %
        utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [
        os.path.join(
            out_dir,
            "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
        for x in fastq_files
    ]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            adapters_args = " ".join(["-a %s" % a for a in adapters])
            # NOTE(review): aligner_args is assigned but never referenced in
            # the final command template -- confirm whether it was meant to
            # be included.
            aligner_args = "--aligner adapter"
            if len(fastq_files) == 1:
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                # NOTE(review): the "%s" after --threads is never
                # interpolated (only {tx_out1} is substituted), so bgzip
                # receives a literal "%s" -- likely should be the core count.
                output_args = "-o >(bgzip --threads %s -c > {tx_out1})".format(
                    **locals())
            else:
                assert len(fastq_files) == 2, fastq_files
                # Paired-end: also pass adapters for the second read (-A).
                adapters_args = adapters_args + " " + " ".join(
                    ["-A %s" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple(
                    [objectstore.cl_input(x) for x in fastq_files])
                output_args = "-o >(bgzip -c > {tx_out1}) -p >(bgzip -c > {tx_out2})".format(
                    **locals())
            adapters_args += " --no-default-adapters"  # Prevent GitHub queries
            quality_base = "64" if dd.get_quality_format(
                data).lower() == "illumina" else "33"
            # NOTE(review): sample_name is assigned but unused; the report
            # args call dd.get_sample_name directly.
            sample_name = dd.get_sample_name(data)
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, dd.get_sample_name(data))
            # User-supplied resource options for atropos.
            ropts = " ".join(
                str(x) for x in config_utils.get_resources(
                    "atropos", data["config"]).get("options", []))
            # Defaults applied only when not overridden in resource options.
            extra_opts = []
            for k, alt_ks, v in [("--quality-cutoff", ["-q "], "5"),
                                 ("--minimum-length", ["-m "],
                                  str(dd.get_min_read_length(data))),
                                 ("--nextseq-trim", [], "25")]:
                if k not in ropts and not any(alt_k in ropts
                                              for alt_k in alt_ks):
                    extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % dd.get_num_cores(data)
                           if dd.get_num_cores(data) > 1 else "")
            cmd = (
                "atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}"
            )
            do.run(cmd.format(**locals()),
                   "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file
Exemplo n.º 20
0
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

        :param bam_file: alignments in bam format
        :param data: dict with all configuration information

        :returns: dict with output of different tools
    """
    from bcbio.qc import (coverage, damage, fastqc, kraken, qsignature, qualimap,
                          samtools, picard, srna, umi, variant, viral, preseq)
    # Map configured QC program names to their runner functions.
    tools = {"fastqc": fastqc.run,
             "small-rna": srna.run,
             "samtools": samtools.run,
             "qualimap": qualimap.run,
             "qualimap_rnaseq": qualimap.run_rnaseq,
             "qsignature": qsignature.run,
             "coverage": coverage.run,
             "damage": damage.run,
             "variants": variant.run,
             "kraken": kraken.run,
             "picard": picard.run,
             "umi": umi.run,
             "viral": viral.run,
             "preseq": preseq.run,
             }
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = {}
    for program_name in dd.get_algorithm_qc(data):
        if not bam_file and program_name != "kraken":  # kraken doesn't need bam
            continue
        # Germline samples only run the variants QC step.
        if dd.get_phenotype(data) == "germline" and program_name != "variants":
            continue
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            # Check for metrics output, two cases:
            # 1. output with {"metrics"} and files ("base")
            if "metrics" in out:
                metrics.update(out.pop("metrics"))
            # 2. a dictionary of metrics
            elif "base" not in out:
                metrics.update(out)
            # Check for files only output
            if "base" in out:
                qc_files = out
        # NOTE(review): `basestring` is Python 2 only -- confirm this module
        # still targets Python 2 or defines a compatibility alias elsewhere.
        elif out and isinstance(out, basestring) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            # Fall back to collecting whatever files the tool left behind.
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files

    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}
Exemplo n.º 21
0
def _get_quality_format(config):
    """Return the configured quality format, validating it is supported.

    Exits the process with an error if the configured format is not one of
    the supported values.
    """
    SUPPORTED_FORMATS = ["illumina", "standard"]
    # Bug fix: the original referenced the undefined name `data` (NameError
    # at runtime); use the `config` parameter that is actually passed in.
    quality_format = dd.get_quality_format(config).lower()
    if quality_format not in SUPPORTED_FORMATS:
        logger.error("quality_format is set to an unsupported format. "
                     "Supported formats are %s."
                     % (", ".join(SUPPORTED_FORMATS)))
        exit(1)
    return quality_format
Exemplo n.º 22
0
def _get_quality_format(config):
    """Return the configured quality format, validating it is supported.

    Exits the process with an error if the configured format is not one of
    the supported values.
    """
    SUPPORTED_FORMATS = ["illumina", "standard"]
    # Bug fix: the original referenced the undefined name `data` (NameError
    # at runtime); use the `config` parameter that is actually passed in.
    quality_format = dd.get_quality_format(config).lower()
    if quality_format not in SUPPORTED_FORMATS:
        logger.error("quality_format is set to an unsupported format. "
                     "Supported formats are %s."
                     % (", ".join(SUPPORTED_FORMATS)))
        exit(1)
    return quality_format
Exemplo n.º 23
0
def _ready_gzip_fastq(in_files, data):
    """Check if we have gzipped fastq and don't need format conversion or splitting.
    """
    gzipped = all(not fname or fname.endswith(".gz") for fname in in_files)
    convert_needed = dd.get_quality_format(data).lower() == "illumina"
    trim_needed = dd.get_trim_ends(data)
    split_requested = tz.get_in(["config", "algorithm", "align_split_size"],
                                data) is not False
    if not gzipped or convert_needed or split_requested:
        return False
    return not objectstore.is_remote(in_files[0]) and not trim_needed
Exemplo n.º 24
0
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

        :param bam_file: alignments in bam format
        :param data: dict with all configuration information

        :returns: dict with output of different tools
    """
    from bcbio.qc import (coverage, damage, fastqc, kraken, qsignature,
                          qualimap, samtools, picard, srna, umi, variant,
                          viral)
    # Map configured QC program names to their runner functions.
    tools = {
        "fastqc": fastqc.run,
        "small-rna": srna.run,
        "samtools": samtools.run,
        "qualimap": qualimap.run,
        "qualimap_rnaseq": qualimap.run_rnaseq,
        "qsignature": qsignature.run,
        "coverage": coverage.run,
        "damage": damage.run,
        "variants": variant.run,
        "kraken": kraken.run,
        "picard": picard.run,
        "umi": umi.run,
        "viral": viral.run
    }
    qc_dir = utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = {}
    for program_name in dd.get_algorithm_qc(data):
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            # Dict with "base": file outputs (plus optional "metrics");
            # dict without "base": pure metrics.
            if "base" in out:
                if "metrics" in out:
                    metrics.update(out.pop("metrics"))
                qc_files = out
            else:
                metrics.update(out)
        # NOTE(review): `basestring` is Python 2 only -- confirm this module
        # still targets Python 2 or defines a compatibility alias elsewhere.
        elif out and isinstance(out, basestring) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            # Fall back to collecting whatever files the tool left behind.
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files

    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}
Exemplo n.º 25
0
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    :param fastq_files: one (single-end) or two (paired-end) input fastqs
    :param adapters: adapter sequences to pass to atropos
    :param out_dir: directory for trimmed outputs and the JSON report
    :param data: sample dict with all configuration information
    :returns: ([trimmed fastq files], report file)
    """
    report_file = os.path.join(
        out_dir, "%s-report.json" %
        utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [
        os.path.join(
            out_dir,
            "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
        for x in fastq_files
    ]
    if not utils.file_exists(out_files[0]):
        # Transaction covers the report plus one or two fastq outputs.
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            cores = dd.get_num_cores(data)
            adapters_args = " ".join(["-a %s" % a for a in adapters])
            aligner_args = "--aligner adapter"
            if len(fastq_files) == 1:
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                # Fix: the original mixed %-style and str.format here, leaving a
                # literal "%s" as the bgzip --threads argument in the command.
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(
                    **locals())
            else:
                assert len(fastq_files) == 2, fastq_files
                if adapters and len(adapters) <= 2:
                    # insert-match alignment for standard paired adapter pairs
                    aligner_args = "--aligner insert"
                adapters_args = adapters_args + " " + " ".join(
                    ["-A %s" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple(
                    [objectstore.cl_input(x) for x in fastq_files])
                output_args = "-o >(bgzip -c > {tx_out1}) -p >(bgzip -c > {tx_out2})".format(
                    **locals())
            # atropos wants the quality offset: 64 for old Illumina, 33 otherwise.
            quality_base = "64" if dd.get_quality_format(
                data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, dd.get_sample_name(data))
            # User-supplied resource options from the configuration.
            ropts = " ".join(
                str(x) for x in config_utils.get_resources(
                    "atropos", data["config"]).get("options", []))
            thread_args = ("--threads %s" % cores if cores > 1 else "")
            cmd = (
                "atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                "{adapters_args} {aligner_args} {input_args} {output_args} {report_args}"
            )
            cmd += " --quality-cutoff=5 --minimum-length=%s" % dd.get_min_read_length(
                data)
            do.run(cmd.format(**locals()),
                   "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file
Exemplo n.º 26
0
def _ready_bgzip_fastq(in_files, data):
    """Check if we have bgzipped fastq and don't need format conversion or splitting.
    """
    # Inputs must all be gzip-compressed before bothering with the bgzip check.
    gz_ready = all(not fq or fq.endswith(".gz") for fq in in_files)
    if gz_ready:
        bgz_ready = all(not fq or not _check_gzipped_input(fq, data)[0]
                        for fq in in_files)
    else:
        bgz_ready = False
    if not bgz_ready:
        return False
    if dd.get_quality_format(data).lower() == "illumina":
        return False  # needs quality score conversion first
    if tz.get_in(["config", "algorithm", "align_split_size"], data) is not False:
        return False  # splitting for parallel alignment requested
    # Remote inputs need downloading; end trimming needs a rewrite pass.
    return not objectstore.is_remote(in_files[0]) and not dd.get_trim_ends(data)
Exemplo n.º 27
0
def _ready_gzip_fastq(in_files, data, require_bgzip=False):
    """Check if we have gzipped fastq and don't need format conversion or splitting.

    Avoid forcing bgzip if we don't need indexed files.
    """
    ready = all(not fq or fq.endswith(".gz") for fq in in_files)
    if require_bgzip and ready:
        # Confirm the gzipped inputs are actually bgzip-compressed.
        ready = all(not fq or not _check_gzipped_input(fq, data)[0]
                    for fq in in_files)
    if not ready:
        return False
    if dd.get_quality_format(data).lower() == "illumina":
        return False  # quality conversion required
    if dd.get_align_split_size(data) is not False:
        return False  # splitting for parallel alignment requested
    if objectstore.is_remote(in_files[0]):
        return False  # remote inputs need retrieval
    if dd.get_trim_ends(data):
        return False  # end trimming requires a rewrite pass
    return not get_downsample_params(data)
Exemplo n.º 28
0
def _ready_gzip_fastq(in_files, data, require_bgzip=False):
    """Check if we have gzipped fastq and don't need format conversion or splitting.

    Avoid forcing bgzip if we don't need indexed files.
    """
    compressed_ok = all(not f or f.endswith(".gz") for f in in_files)
    if require_bgzip and compressed_ok:
        # Indexed access needs bgzip specifically, not plain gzip.
        compressed_ok = all(not f or not _check_gzipped_input(f, data)[0]
                            for f in in_files)
    convert_needed = dd.get_quality_format(data).lower() == "illumina"
    trim_needed = dd.get_trim_ends(data)
    split_needed = dd.get_align_split_size(data) is not False
    # Ready only when no conversion, splitting, trimming, downsampling or
    # remote retrieval stands between the inputs and alignment.
    return (compressed_ok and not convert_needed and not split_needed
            and not objectstore.is_remote(in_files[0])
            and not trim_needed
            and not get_downsample_params(data))
Exemplo n.º 29
0
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    :param fastq_files: one (single-end) or two (paired-end) input fastqs
    :param adapters: adapter sequences to pass to atropos (mutated in place
        when polyX adapters are appended)
    :param out_dir: directory for trimmed outputs and the JSON report
    :param data: sample dict with all configuration information
    :returns: ([trimmed fastq files], report file)
    """
    report_file = os.path.join(out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        # Transaction covers the report plus one or two fastq outputs.
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            # polyX trimming, anchored to the 3' ends of reads
            # NOTE(review): unlike the `$`-anchored variant of this function,
            # these adapters carry no explicit 3' anchor.
            if "polyx" in dd.get_adapters(data):
                adapters += ["A{200}", "C{200}", "G{200}", "T{200}"]
            adapters_args = " ".join(["-a '%s'" % a for a in adapters])
            adapters_args += " --overlap 8"  # Avoid very short internal matches (default is 3)
            adapters_args += " --no-default-adapters --no-cache-adapters"  # Prevent GitHub queries and saving pickles
            aligner_args = "--aligner adapter"  # NOTE(review): unused — not referenced by cmd below
            if len(fastq_files) == 1:
                cores = dd.get_num_cores(data)
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(**locals())
            else:
                assert len(fastq_files) == 2, fastq_files
                # Split cores between the two bgzip compression streams.
                cores = max(1, dd.get_num_cores(data) // 2)
                adapters_args = adapters_args + " " + " ".join(["-A '%s'" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = ("-o >(bgzip --threads {cores} -c > {tx_out1}) "
                               "-p >(bgzip --threads {cores} -c > {tx_out2})").format(**locals())
            # atropos wants the quality offset: 64 for old Illumina, 33 otherwise.
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            sample_name = dd.get_sample_name(data)  # NOTE(review): unused — report_args calls the getter directly
            report_args = "--report-file %s --report-formats json --sample-id %s" % (tx_report_file,
                                                                                     dd.get_sample_name(data))
            # User-supplied resource options from the configuration.
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            # Default options, skipped when the user already set them (long or
            # short form) in ropts; nextseq trimming only for polyX/polyG runs.
            extra_opts = []
            for k, alt_ks, v, want in [("--quality-cutoff", ["-q "], "5", True),
                                       ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)), True),
                                       ("--nextseq-trim", [], "25", ("polyx" in dd.get_adapters(data) or
                                                                     "polyg" in dd.get_adapters(data)))]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    if want:
                        extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % cores if cores > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(**locals()), "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file
Exemplo n.º 30
0
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    :param fastq_files: one (single-end) or two (paired-end) input fastqs
    :param adapters: adapter sequences to pass to atropos (mutated in place
        when polyX adapters are appended)
    :param out_dir: directory for trimmed outputs and the JSON report
    :param data: sample dict with all configuration information
    :returns: ([trimmed fastq files], report file)
    """
    report_file = os.path.join(out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        # Transaction covers the report plus one or two fastq outputs.
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            # polyX trimming, anchored to the 3' ends of reads
            # (`$` marks the adapter as anchored in atropos/cutadapt syntax)
            if "polyx" in dd.get_adapters(data):
                adapters += ["A{200}$", "C{200}$", "G{200}$", "T{200}$"]
            adapters_args = " ".join(["-a '%s'" % a for a in adapters])
            adapters_args += " --overlap 8"  # Avoid very short internal matches (default is 3)
            adapters_args += " --no-default-adapters --no-cache-adapters"  # Prevent GitHub queries and saving pickles
            aligner_args = "--aligner adapter"  # NOTE(review): unused — not referenced by cmd below
            if len(fastq_files) == 1:
                cores = dd.get_num_cores(data)
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(**locals())
            else:
                assert len(fastq_files) == 2, fastq_files
                # Split cores between the two bgzip compression streams.
                cores = max(1, dd.get_num_cores(data) // 2)
                adapters_args = adapters_args + " " + " ".join(["-A '%s'" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = ("-o >(bgzip --threads {cores} -c > {tx_out1}) "
                               "-p >(bgzip --threads {cores} -c > {tx_out2})").format(**locals())
            # atropos wants the quality offset: 64 for old Illumina, 33 otherwise.
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            sample_name = dd.get_sample_name(data)  # NOTE(review): unused — report_args calls the getter directly
            report_args = "--report-file %s --report-formats json --sample-id %s" % (tx_report_file,
                                                                                     dd.get_sample_name(data))
            # User-supplied resource options from the configuration.
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            # Default options, skipped when the user already set them (long or
            # short form) in ropts; nextseq trimming only for polyX/polyG runs.
            extra_opts = []
            for k, alt_ks, v, want in [("--quality-cutoff", ["-q "], "5", True),
                                       ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)), True),
                                       ("--nextseq-trim", [], "25", ("polyx" in dd.get_adapters(data) or
                                                                     "polyg" in dd.get_adapters(data)))]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    if want:
                        extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % cores if cores > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(**locals()), "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file
Exemplo n.º 31
0
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

        :param bam_file: alignments in bam format
        :param data: dict with all configuration information

        :returns: dict with output of different tools
    """
    # Deferred import of QC tool implementations.
    from bcbio.qc import fastqc, gemini, kraken, qsignature, qualimap, samtools, picard, srna, umi
    # Map the configuration name of each QC tool to its runner function.
    tools = {
        "fastqc": fastqc.run,
        "small-rna": srna.run,
        "samtools": samtools.run,
        "qualimap": qualimap.run,
        "qualimap_rnaseq": qualimap.run_rnaseq,
        "gemini": gemini.run,
        "qsignature": qsignature.run,
        "coverage": _run_coverage_qc,
        "variants": _run_variants_qc,
        "kraken": kraken.run,
        "picard": picard.run,
        "umi": umi.run
    }
    # Per-sample QC directory: <work>/qc/<sample description>.
    qc_dir = utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = {}
    # NOTE(review): tz.get_in returns None when the qc key is unset, which
    # would raise a TypeError here — verify callers always set it.
    for program_name in tz.get_in(["config", "algorithm", "qc"], data):
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            # Dict output is treated as metrics only in this variant.
            metrics.update(out)
        # NOTE: `basestring` means this code runs under Python 2 (or a compat shim).
        elif out and isinstance(out, basestring) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            # Fall back to whatever files the tool left in its output directory.
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files

    # Clean up any downsampled BAM produced for QC; presumably bam.remove
    # tolerates a missing file — TODO confirm.
    bam.remove("%s-downsample%s" % os.path.splitext(bam_file))

    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}
Exemplo n.º 32
0
def _run_qc_tools(bam_file, data):
    """Run a set of third party quality control tools, returning QC directory and metrics.

        :param bam_file: alignments in bam format
        :param data: dict with all configuration information

        :returns: dict with output of different tools
    """
    # Deferred import of QC tool implementations.
    from bcbio.qc import fastqc, kraken, qsignature, qualimap, samtools, picard, srna, umi, variant
    # Map the configuration name of each QC tool to its runner function.
    tools = {"fastqc": fastqc.run,
             "small-rna": srna.run,
             "samtools": samtools.run,
             "qualimap": qualimap.run,
             "qualimap_rnaseq": qualimap.run_rnaseq,
             "qsignature": qsignature.run,
             "coverage": _run_coverage_qc,
             "variants": variant.run,
             "kraken": kraken.run,
             "picard": picard.run,
             "umi": umi.run}
    # Per-sample QC directory: <work>/qc/<sample description>.
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    metrics = {}
    qc_out = {}
    for program_name in dd.get_algorithm_qc(data):
        qc_fn = tools[program_name]
        cur_qc_dir = os.path.join(qc_dir, program_name)
        out = qc_fn(bam_file, data, cur_qc_dir)
        qc_files = None
        if out and isinstance(out, dict):
            # A dict with a "base" key describes output files; a plain dict is
            # treated as metrics only.
            if "base" in out:
                qc_files = out
            else:
                metrics.update(out)
        # NOTE: `basestring` means this code runs under Python 2 (or a compat shim).
        elif out and isinstance(out, basestring) and os.path.exists(out):
            qc_files = {"base": out, "secondary": []}
        if not qc_files:
            # Fall back to whatever files the tool left in its output directory.
            qc_files = _organize_qc_files(program_name, cur_qc_dir)
        if qc_files:
            qc_out[program_name] = qc_files

    metrics["Name"] = dd.get_sample_name(data)
    metrics["Quality format"] = dd.get_quality_format(data).lower()
    return {"qc": qc_out, "metrics": metrics}
Exemplo n.º 33
0
def _filter_bad_reads(in_bam, ref_file, data):
    """Use GATK filter to remove problem reads which choke GATK and Picard.
    """
    config = data["config"]
    bam.index(in_bam, config)
    out_file = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, out_file) as tx_out_file:
                # Old Illumina-encoded qualities get rescaled during filtering.
                walker = "PrintReads"
                if dd.get_quality_format(data, "").lower() == "illumina":
                    walker = "FixMisencodedBaseQualityReads"
                params = [walker, "-R", ref_file, "-I", in_bam, "-O", tx_out_file]
                for read_filter in ["MatchingBasesAndQualsReadFilter",
                                    "SeqIsStoredReadFilter",
                                    "CigarContainsNoNOperator"]:
                    params += ["-RF", read_filter]
                jvm_opts = broad.get_gatk_opts(config, tmp_dir)
                do.run(broad.gatk_cmd("gatk", jvm_opts, params),
                       "Filter problem reads")
    bam.index(out_file, config)
    return out_file
Exemplo n.º 34
0
def _filter_bad_reads(in_bam, ref_file, data):
    """Use GATK filter to remove problem reads which choke GATK and Picard.
    """
    bam.index(in_bam, data["config"])
    base = os.path.splitext(in_bam)[0]
    out_file = "%s-gatkfilter.bam" % base
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, out_file) as tx_out_file:
                # Pick the walker: fix misencoded (old Illumina) qualities
                # while filtering, otherwise just copy reads through filters.
                misencoded = dd.get_quality_format(data, "").lower() == "illumina"
                walker = "FixMisencodedBaseQualityReads" if misencoded else "PrintReads"
                params = [walker,
                          "-R", ref_file, "-I", in_bam, "-O", tx_out_file,
                          "-RF", "MatchingBasesAndQualsReadFilter",
                          "-RF", "SeqIsStoredReadFilter",
                          "-RF", "CigarContainsNoNOperator"]
                do.run(broad.gatk_cmd("gatk", broad.get_gatk_opts(data["config"], tmp_dir), params),
                       "Filter problem reads")
    bam.index(out_file, data["config"])
    return out_file
Exemplo n.º 35
0
def _filter_bad_reads(in_bam, ref_file, data):
    """Use GATK filter to remove problem reads which choke GATK and Picard.
    """
    cfg = data["config"]
    bam.index(in_bam, cfg)
    out_file = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, out_file) as tx_out_file:
                params = ["-T", "PrintReads",
                          "-R", ref_file,
                          "-I", in_bam,
                          "--out", tx_out_file]
                params.extend(["--filter_mismatching_base_and_quals",
                               "--filter_bases_not_stored",
                               "--filter_reads_with_N_cigar"])
                # Also rescale old Illumina-encoded qualities when present.
                if dd.get_quality_format(data, "").lower() == "illumina":
                    params.append("--fix_misencoded_quality_scores")
                jvm_opts = broad.get_gatk_framework_opts(cfg, tmp_dir)
                do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params),
                       "Filter problem reads")
    bam.index(out_file, cfg)
    return out_file
Exemplo n.º 36
0
def _seqtk_fastq_prep_cl(data, in_file=None, read_num=0):
    """Provide a commandline for prep of fastq inputs with seqtk.

    Handles fast conversion of fastq quality scores and trimming.
    """
    seqtk = config_utils.get_program("seqtk", data["config"])
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    trim_ends = dd.get_trim_ends(data)
    in_file = objectstore.cl_input(in_file) if in_file else "/dev/stdin"
    parts = []
    if needs_convert:
        # Convert Illumina (offset 64) quality scores to standard encoding.
        parts.append("%s seq -Q64 -V %s" % (seqtk, in_file))
    if trim_ends:
        # First pair of trim values applies to read 1, second pair to read 2.
        if data.get("read_num", read_num) == 0:
            left_trim, right_trim = trim_ends[0:2]
        else:
            left_trim, right_trim = trim_ends[2:4]
        if left_trim or right_trim:
            trim_infile = "/dev/stdin" if needs_convert else in_file
            parts.append("%s trimfq -b %s -e %s %s" % (seqtk, left_trim, right_trim, trim_infile))
    return " | ".join(parts)
Exemplo n.º 37
0
def _seqtk_fastq_prep_cl(data, in_file=None, read_num=0):
    """Provide a commandline for prep of fastq inputs with seqtk.

    Handles fast conversion of fastq quality scores and trimming.
    """
    convert = dd.get_quality_format(data).lower() == "illumina"
    ends = dd.get_trim_ends(data)
    seqtk = config_utils.get_program("seqtk", data["config"])
    source = objectstore.cl_input(in_file) if in_file else "/dev/stdin"
    # Quality conversion, when needed, runs first and pipes into trimming.
    cmd = "{seqtk} seq -Q64 -V {in_file}".format(seqtk=seqtk, in_file=source) if convert else ""
    if ends:
        # Trim spec holds (left, right) pairs: indexes 0-1 for read 1, 2-3 for read 2.
        start = 0 if data.get("read_num", read_num) == 0 else 2
        left_trim, right_trim = ends[start:start + 2]
        if left_trim or right_trim:
            cmd += "{pipe}{seqtk} trimfq -b {left_trim} -e {right_trim} {trim_infile}".format(
                pipe=" | " if convert else "", seqtk=seqtk,
                left_trim=left_trim, right_trim=right_trim,
                trim_infile="/dev/stdin" if convert else source)
    return cmd
Exemplo n.º 38
0
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.

    :param fastq_files: one (single-end) or two (paired-end) input fastqs
    :param adapters: adapter sequences to pass to atropos
    :param out_dir: directory for trimmed outputs and the JSON report
    :param data: sample dict with all configuration information
    :returns: ([trimmed fastq files], report file)
    """
    report_file = os.path.join(out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        # Transaction covers the report plus one or two fastq outputs.
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            cores = dd.get_num_cores(data)
            adapters_args = " ".join(["-a %s" % a for a in adapters])
            if len(fastq_files) == 1:
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                # Fix: the original mixed %-style and str.format here, leaving a
                # literal "%s" as the bgzip --threads argument in the command.
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(**locals())
            else:
                assert len(fastq_files) == 2, fastq_files
                adapters_args = adapters_args + " " + " ".join(["-A %s" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = "-o >(bgzip -c > {tx_out1}) -p >(bgzip -c > {tx_out2})".format(**locals())
            # atropos wants the quality offset: 64 for old Illumina, 33 otherwise.
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            report_args = "--report-file %s --report-formats json --sample-id %s" % (tx_report_file,
                                                                                     dd.get_sample_name(data))
            # User-supplied resource options from the configuration.
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            # Default options, skipped when the user already set them (long or
            # short form) in ropts.
            extra_opts = []
            for k, alt_ks, v in [("--quality-cutoff", ["-q "], "5"),
                                 ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)))]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % cores if cores > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(**locals()), "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file
Exemplo n.º 39
0
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).

    Decides whether the input needs decompression and/or bgzip recompression,
    then either rewrites it via _bgzip_file or links/copies it into the
    align_prep work directory.
    """
    in_file = data["in_file"]
    # Paired inputs arrive as a list/tuple; decisions are made on the first file.
    if isinstance(in_file, (list, tuple)):
        in_file = in_file[0]
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    # special case, empty files that have been cleaned
    if not objectstore.is_remote(in_file) and os.path.getsize(in_file) == 0:
        needs_bgzip, needs_gunzip = False, False
    elif in_file.endswith(".gz") and not objectstore.is_remote(in_file):
        if needs_convert or dd.get_trim_ends(data):
            # Conversion/trimming rewrites contents, so decompress and rebgzip.
            needs_bgzip, needs_gunzip = True, True
        else:
            # Inspect the gzip header to see if it is already bgzipped.
            needs_bgzip, needs_gunzip = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        needs_bgzip, needs_gunzip = True, True
    elif objectstore.is_remote(in_file) and not tz.get_in(
        ["config", "algorithm", "align_split_size"], data):
        # Remote input without splitting can be streamed as-is downstream.
        needs_bgzip, needs_gunzip = False, False
    else:
        needs_bgzip, needs_gunzip = True, False
    work_dir = utils.safe_makedir(
        os.path.join(data["dirs"]["work"], "align_prep"))
    # Rewrite when any transformation is needed, the input is remote, or
    # multiple input files must be combined; otherwise just link/copy.
    if (needs_bgzip or needs_gunzip or needs_convert or dd.get_trim_ends(data)
            or objectstore.is_remote(in_file)
            or (isinstance(data["in_file"],
                           (tuple, list)) and len(data["in_file"]) > 1)):
        out_file = _bgzip_file(data["in_file"], data["config"], work_dir,
                               needs_bgzip, needs_gunzip, needs_convert, data)
    else:
        out_file = os.path.join(
            work_dir,
            "%s_%s" % (dd.get_sample_name(data), os.path.basename(in_file)))
        out_file = _symlink_or_copy_grabix(in_file, out_file, data)
    return out_file