Example #1
def _get_gtf(config):
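    # the annotation GTF path lives under the "annotation" section of the config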
    gtf = config["annotation"].get("file", None)
    #gtf = config.get("gtf", None)
    if not gtf or not file_exists(gtf):
        logger.error("genebody_coverage needs a GTF file passed to it.")
        exit(1)
    return gtf
Example #2
def align(fastq_file,
          pair_file,
          ref_file,
          out_base,
          align_dir,
          config,
          names=None):

    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    out_file = os.path.join(out_dir, _out_fnames[0])

    if file_exists(out_file):
        return os.path.join(out_dir, "%s.sam" % out_base)

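    # refuse to run when the reference index was built for a different bowtie major version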
    if not _bowtie_ref_match(ref_file, config):
        logger.error("Bowtie version %d was detected but the reference "
                     "file %s is built for version %d. Download version "
                     "%d or build it with bowtie-build." %
                     (_bowtie_major_version(config), ref_file,
                      _ref_version(ref_file), _bowtie_major_version(config)))
        exit(1)

    out_files = tophat_align(fastq_file,
                             pair_file,
                             ref_file,
                             out_base,
                             align_dir,
                             config,
                             names=names)  # forward the caller's names instead of hard-coding None

    return out_files
Example #3
def demultiplex_samples(data):
    """
    demultiplex a fastqtransformed FASTQ file into separate sample barcode files
    """
    work_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(work_dir, dd.get_sample_name(data))
    demulti_dir = os.path.join(sample_dir, "demultiplexed")

    files = data["files"]
    if len(files) == 2:
        logger.error(
            "Sample demultiplexing doesn't handle paired-end reads, but "
            "we can add it. Open an issue here https://github.com/bcbio/bcbio-nextgen/issues if you need this and we'll add it."
        )
        sys.exit(1)
    else:
        fq1 = files[0]
    # check if samples need to be demultiplexed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "SAMPLE_" not in read:
            return [[data]]

    bcfile = get_sample_barcodes(dd.get_sample_barcodes(data), sample_dir)
    demultiplexed = glob.glob(os.path.join(demulti_dir, "*.fq*"))
    if demultiplexed:
        return [split_demultiplexed_sampledata(data, demultiplexed)]
    umis = config_utils.get_program("umis", data, default="umis")
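    # {tx_dir} in the command template is filled in by the file_transaction context below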
    cmd = ("{umis} demultiplex_samples --nedit 1 --barcodes {bcfile} "
           "--out_dir {tx_dir} {fq1}")
    msg = "Demultiplexing {fq1}."
    with file_transaction(data, demulti_dir) as tx_dir:
        do.run(cmd.format(**locals()), msg.format(**locals()))
    demultiplexed = glob.glob(os.path.join(demulti_dir, "*.fq*"))
    return [split_demultiplexed_sampledata(data, demultiplexed)]
Example #4
def run_rnaseq_variant_calling(data):
    """
    run RNA-seq variant calling, variation file is stored in `vrn_file`
    in the datadict
    """
    variantcaller = dd.get_variantcaller(data)
    if isinstance(variantcaller, list) and len(variantcaller) > 1:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/bcbio/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)

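    # at most one caller is configured at this point; run it, then annotate the resulting VCF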
    if variantcaller:
        if "gatk-haplotype" in variantcaller:
            data = variation.rnaseq_gatk_variant_calling(data)
        if vardict.get_vardict_command(data):
            data = variation.rnaseq_vardict_variant_calling(data)
    if dd.get_vrn_file(data):
        ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"],
                                       data)
        if ann_file:
            data = dd.set_vrn_file(data, ann_file)
        ann_file = population.run_vcfanno(dd.get_vrn_file(data), data,
                                          population.do_db_build([data]))
        if ann_file:
            data = dd.set_vrn_file(data, ann_file)
    return [[data]]
Example #5
def singlecell_rnaseq(samples, run_parallel):
    quantifier = dd.get_in_samples(samples, dd.get_singlecell_quantifier)
    quantifier = quantifier.lower()
    samples = run_parallel("run_umi_transform", samples)
    demultiplexed = run_parallel("demultiplex_samples", samples)
    # break demultiplexed lanes into their own samples
    samples = []
    for lane in demultiplexed:
        for index in lane:
            samples.append([index])
    samples = run_parallel("run_filter_barcodes", samples)
    samples = run_parallel("run_barcode_histogram", samples)
    if quantifier == "rapmap":
        samples = run_parallel("run_rapmap_index", [samples])
        samples = run_parallel("run_rapmap_align", samples)
        samples = run_parallel("run_tagcount", samples)
        samples = run_parallel("run_concatenate_sparse_counts", [samples])
    elif quantifier == "kallisto":
        samples = run_parallel("run_kallisto_singlecell", samples)
    else:
        logger.error(("%s is not supported for singlecell RNA-seq "
                      "quantification." % quantifier))
        sys.exit(1)
    samples = scrnaseq_concatenate_metadata(samples)
    singlecellexperiment.make_scrnaseq_object(samples)
    return samples
Example #6
def main(config, view):

    # make the needed directories (map() is lazy on Python 3, so iterate explicitly)
    for dirname in config["dir"].values():
        safe_makedir(dirname)

    # specific for project
    human_input = find_sam_files(config["input_dir_human"])
    mouse_input = find_sam_files(config["input_dir_mouse"])
    if len(human_input) != len(mouse_input):
        logger.error("The number of human SAM files does not match the "
                     "number of mouse SAM files, aborting.")
        sys.exit(1)
    input_files = list(zip(human_input, mouse_input))

    curr_files = input_files

    logger.info("Running disambiguation pipeline on %s." % (curr_files))

    for stage in config["run"]:
        if stage == "disambiguate":
            logger.info("Disambiguating %s." % (curr_files))
            disambiguate = Disambiguate(config)
            out_files = list(flatten(view.map(disambiguate, curr_files)))
            bam_files = view.map(sam.sam2bam, out_files)
            bam_sorted = view.map(sam.bamsort, bam_files)
            view.map(sam.bamindex, bam_sorted)
Example #7
def load_summarizedexperiment(samples):
    """ create summarizedexperiment rds object
    fails with n_samples = 1 """
    # using r36 (4.0) - will eventually drop R3.5
    rcmd = Rscript_cmd("r36")
    se_script = os.path.join(os.path.dirname(__file__), os.pardir, "scripts",
                             "R", "bcbio2se.R")
    data = samples[0][0]
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "salmon")
    summarized_experiment = os.path.join(out_dir, "bcbio-se.rds")
    if not file_exists(summarized_experiment):
        with file_transaction(summarized_experiment) as tx_out_file:
            cmd = f"{rcmd} --vanilla {se_script} {work_dir} {tx_out_file}"
            message = f"Loading SummarizedExperiment."
            try:
                do.run(cmd, message)
            except Exception:
                logger.error("SE creation failed")
    if file_exists(summarized_experiment):
        try:
            se_qc_report = generate_se_qc_report(work_dir)
        except Exception:
            se_qc_report = None
            logger.error("SE QC failed")
        updated_samples = []
        for data in dd.sample_data_iterator(samples):
            data = dd.set_summarized_experiment(data, summarized_experiment)
            updated_samples.append([data])
        return updated_samples
    else:
        return samples
Example #8
def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    data["align_bam"] = dd.get_work_bam(data)
    if dd.get_mark_duplicates(data):
        if aligner:
            if aligner == "bowtie2":
                filterer = bowtie2.filter_multimappers
            elif aligner == "bwa":
                filterer = bwa.filter_multimappers
            else:
                logger.error("ChIP-seq only supported for bowtie2 and bwa.")
                sys.exit(-1)
            unique_bam = filterer(dd.get_work_bam(data), data)
            data["work_bam"] = unique_bam
        else:
            logger.info(
                "Warning: when a BAM file is given as input, bcbio skips "
                "multimapper removal. If the BAM is not cleaned for peak "
                "calling, this can result in downstream errors."
            )
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    data["work_bam"] = _keep_assembled_chrom(dd.get_work_bam(data),
                                             dd.get_ref_file(data),
                                             data["config"])
    encode_bed = tz.get_in(
        ["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = _prepare_bam(dd.get_work_bam(data), encode_bed,
                                        data['config'])
        bam.index(data["work_bam"], data['config'])
    data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                   dd.get_work_bam(data), data)
    return [[data]]
Example #9
def make_bcbiornaseq_object(data):
    """ load the initial bcb.rda object using bcbioRNASeq """
    if "bcbiornaseq" not in dd.get_tools_on(data):
        return data
    upload_dir = tz.get_in(("upload", "dir"), data)
    report_dir = os.path.join(upload_dir, "bcbioRNASeq")
    safe_makedir(report_dir)
    organism = dd.get_bcbiornaseq(data).get("organism", None)
    groups = dd.get_bcbiornaseq(data).get("interesting_groups", None)
    loadstring = create_load_string(upload_dir, groups, organism, "gene")
    r_file = os.path.join(report_dir, "load_bcbioRNAseq.R")
    with file_transaction(r_file) as tmp_file:
        memoize_write_file(loadstring, tmp_file)
    rcmd = Rscript_cmd(env="rbcbiornaseq")
    with chdir(report_dir):
        do.run([rcmd, "--vanilla", r_file], "Loading bcbioRNASeq object.")
        # bcbiornaseq 0.3.44 writes to data/bcb.rds
        write_counts(os.path.join(report_dir, "data", "bcb.rds"), "gene")
    loadstring = create_load_string(upload_dir, groups, organism, "transcript")
    r_file = os.path.join(report_dir, "load_transcript_bcbioRNAseq.R")
    with file_transaction(r_file) as tmp_file:
        memoize_write_file(loadstring, tmp_file)
    rcmd = Rscript_cmd(env="rbcbiornaseq")
    with chdir(report_dir):
        do.run([rcmd, "--vanilla", r_file],
               "Loading transcript-level bcbioRNASeq object.")
        write_counts(os.path.join(report_dir, "data-transcript", "bcb.rds"),
                     "transcript")
    try:
        make_quality_report(data)
    except Exception:
        logger.error("bcbiornaseq error at quality report")
    finally:
        return data
Example #10
def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    data["align_bam"] = dd.get_work_bam(data)
    if dd.get_mark_duplicates(data):
        if aligner:
            if aligner == "bowtie2":
                filterer = bowtie2.filter_multimappers
            elif aligner == "bwa":
                filterer = bwa.filter_multimappers
            else:
                logger.error("ChIP-seq only supported for bowtie2 and bwa.")
                sys.exit(-1)
            unique_bam = filterer(dd.get_work_bam(data), data)
            data["work_bam"] = unique_bam
        else:
            logger.info("Warning: When BAM file is given as input, bcbio skips multimappers removal."
                        "If BAM is not cleaned for peak calling, can result in downstream errors.")
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    data["work_bam"] = _keep_assembled_chrom(dd.get_work_bam(data), dd.get_ref_file(data),
                                             data["config"])
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = _prepare_bam(dd.get_work_bam(data), encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    data["bigwig"] = _bam_coverage(dd.get_sample_name(data), dd.get_work_bam(data), data)
    return [[data]]
Example #11
def demultiplex_samples(data):
    """
    demultiplex a fastqtransformed FASTQ file into separate sample barcode files
    """
    files = data["files"]
    if len(files) == 2:
        logger.error("Sample demultiplexing doesn't handle paired-end reads, but "
            "we can add it. Open an issue here https://github.com/chapmanb/bcbio-nextgen/issues if you need this and we'll add it.")
        sys.exit(1)
    else:
        fq1 = files[0]
    # check if samples need to be demultiplexed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)  # Python 3 file iterators have no .next() method
        if "SAMPLE_" not in read:
            return [[data]]
    bcfile = dd.get_sample_barcodes(data)
    if not bcfile:
        logger.error("Sample demultiplexing needs a list of known indexes provided "
                     "with via the sample_barcodes option in the algorithm section.")
        sys.exit(1)
    work_dir = os.path.join(dd.get_work_dir(data), "umis")
    sample_dir = os.path.join(work_dir, dd.get_sample_name(data))
    demulti_dir = os.path.join(sample_dir, "demultiplexed")
    demultiplexed = glob.glob(os.path.join(demulti_dir, "*.fq*"))
    if demultiplexed:
        return [split_demultiplexed_sampledata(data, demultiplexed)]
    umis = config_utils.get_program("umis", data, default="umis")
    cmd = ("{umis} demultiplex_samples --nedit 1 --barcodes {bcfile} "
           "--out_dir {tx_dir} {fq1}")
    msg = "Demultiplexing {fq1}."
    with file_transaction(data, demulti_dir) as tx_dir:
        do.run(cmd.format(**locals()), msg.format(**locals()))
    demultiplexed = glob.glob(os.path.join(demulti_dir, "*.fq*"))
    return [split_demultiplexed_sampledata(data, demultiplexed)]
Example #12
def _cutadapt_trim(fastq_files, quality_format, adapters, out_files):
    if quality_format == "illumina":
        quality_base = "64"
    else:
        quality_base = "33"

    # --times=2 tries twice to remove adapters, which will allow things like
    # realsequenceAAAAAAadapter to remove both the poly-A and the adapter;
    # this behavior might not be what we want; we could also do two or
    # more passes of cutadapt
    base_cmd = [
        "cutadapt", "--times=" + "2", "--quality-base=" + quality_base,
        "--quality-cutoff=20", "--format=fastq", "--minimum-length=0"
    ]
    adapter_cmd = map(lambda x: "--adapter=" + x, adapters)
    base_cmd.extend(adapter_cmd)
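    # one --adapter flag is added per adapter sequence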

    if all(map(file_exists, out_files)):
        return out_files

    for in_file, out_file in zip(fastq_files, out_files):
        # if you pass an output filename, cutadapt will write some stats
        # about trimmed adapters to stdout. stat_file captures that.
        stat_file = replace_suffix(out_file, ".trim_stats.txt")
        with open(stat_file, "w") as stat_handle:
            cmd = list(base_cmd)
            cmd.extend(["--output=" + out_file, in_file])
            try:
                return_value = subprocess.check_call(cmd, stdout=stat_handle)
            except subprocess.CalledProcessError:
                cmd_string = subprocess.list2cmdline(cmd)
                logger.error("Cutadapt returned an error. The command "
                             "used to run cutadapt was: %s." % (cmd_string))
                exit(1)
    return out_files
Example #13
def _cutadapt_trim(fastq_files, quality_format, adapters, out_files):
    if quality_format == "illumina":
        quality_base = "64"
    else:
        quality_base = "33"

    # --times=2 tries twice to remove adapters, which will allow things like
    # realsequenceAAAAAAadapter to remove both the poly-A and the adapter;
    # this behavior might not be what we want; we could also do two or
    # more passes of cutadapt
    base_cmd = ["cutadapt", "--times=" + "2", "--quality-base=" + quality_base,
                "--quality-cutoff=20", "--format=fastq", "--minimum-length=0"]
    adapter_cmd = map(lambda x: "--adapter=" + x, adapters)
    base_cmd.extend(adapter_cmd)

    if all(map(file_exists, out_files)):
        return out_files

    for in_file, out_file in zip(fastq_files, out_files):
        # if you pass an output filename, cutadapt will write some stats
        # about trimmed adapters to stdout. stat_file captures that.
        stat_file = replace_suffix(out_file, ".trim_stats.txt")
        with open(stat_file, "w") as stat_handle:
            cmd = list(base_cmd)
            cmd.extend(["--output=" + out_file, in_file])
            try:
                return_value = subprocess.check_call(cmd, stdout=stat_handle)
            except subprocess.CalledProcessError:
                cmd_string = subprocess.list2cmdline(cmd)
                logger.error("Cutadapt returned an error. The command "
                             "used to run cutadapt was: %s." % (cmd_string))
                exit(1)
    return out_files
Example #14
def singlecell_rnaseq(samples, run_parallel):
    quantifier = dd.get_in_samples(samples, dd.get_singlecell_quantifier)
    quantifier = quantifier.lower()
    samples = run_parallel("run_umi_transform", samples)
    demultiplexed = run_parallel("demultiplex_samples", samples)
    # break demultiplexed lanes into their own samples
    samples = []
    for lane in demultiplexed:
        for index in lane:
            samples.append([index])
    if not samples:
        logger.error(f"No samples were found matching the supplied sample barcodes. See "
            f"https://github.com/bcbio/bcbio-nextgen/issues/3428#issuecomment-772609904 "
            f"for how to debug this issue.")
        sys.exit(1)
    samples = run_parallel("run_filter_barcodes", samples)
    samples = run_parallel("run_barcode_histogram", samples)
    if quantifier == "rapmap":
        samples = run_parallel("run_rapmap_index", [samples])
        samples = run_parallel("run_rapmap_align", samples)
        samples = run_parallel("run_tagcount", samples)
        samples = run_parallel("run_concatenate_sparse_counts", [samples])
    elif quantifier == "kallisto":
        samples = run_parallel("run_kallisto_singlecell", samples)
    else:
        logger.error(("%s is not supported for singlecell RNA-seq "
                      "quantification." % quantifier))
        sys.exit(1)
    samples = scrnaseq_concatenate_metadata(samples)
    singlecellexperiment.make_scrnaseq_object(samples)
    return samples
Example #15
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    max_hits = 10
    srna = True if data["analysis"].lower().startswith("smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
        return data

    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)

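    # run STAR inside a transactional directory so a failed run leaves no partial output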
    with file_transaction(data, align_dir) as tx_align_dir:
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
            "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
            "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
            "--outStd SAM {srna_opts} "
            "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else ""
        cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
        cmd += _read_group_option(names)
        fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False)
        if fusion_mode:
            cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                    "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                    "--chimScoreSeparation 5 "
                    "--chimOutType WithinSAM ")
        strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)
        print("hello")

    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    return data
Example #16
def _run_with_possible_error_message(cmd, **kwargs):
    try:
        subprocess.check_call(cmd, **kwargs)
    except subprocess.CalledProcessError:
        cmd_string = subprocess.list2cmdline(cmd)
        logger.error("Cutadapt returned an error. The command "
                     "used to run cutadapt was: %s." % (cmd_string))
        exit(1)
Example #17
def _check_bowtie(ref_file, config):
    if not _bowtie_ref_match(ref_file, config):
        logger.error("Bowtie version %d was detected but the reference "
                     "file %s is built for version %d. Download version "
                     "%d or build it with bowtie-build." %
                     (_bowtie_major_version(config), ref_file,
                      _ref_version(ref_file), _bowtie_major_version(config)))
        exit(1)
Example #18
def combine_pairs(input_files):
    """ calls files pairs if they are completely the same except
    for one has _1 and the other has _2 returns a list of tuples
    of pairs or singles.
    From bipy.utils (https://github.com/roryk/bipy/blob/master/bipy/utils.py)
    Adjusted to allow different input paths or extensions for matching files.
    """
    PAIR_FILE_IDENTIFIERS = set(["1", "2", "3"])

    pairs = []
    used = set([])
    for in_file in input_files:
        if in_file in used:
            continue
        for comp_file in input_files:
            if comp_file in used or comp_file == in_file:
                continue
            a = rstrip_extra(utils.splitext_plus(os.path.basename(in_file))[0])
            b = rstrip_extra(
                utils.splitext_plus(os.path.basename(comp_file))[0])
            if len(a) != len(b):
                continue
            s = dif(a, b)
            # no differences, then its the same file stem
            if len(s) == 0:
                logger.error(
                    "%s and %s have the same stem, so we don't know "
                    "how to assign it to the sample data in the CSV. To "
                    "get around this you can rename one of the files. "
                    "If they are meant to be the same sample run in two "
                    "lanes, combine them first with the "
                    "bcbio_prepare_samples.py script."
                    "(http://bcbio-nextgen.readthedocs.io/en/latest/contents/configuration.html#multiple-files-per-sample)"
                    % (in_file, comp_file))
                sys.exit(1)
            if len(s) > 1:
                continue  # skip if there is more than one difference
            if (a[s[0]] in PAIR_FILE_IDENTIFIERS
                    and b[s[0]] in PAIR_FILE_IDENTIFIERS):
                # if the 1/2 isn't the last digit before a separator, skip
                # this skips stuff like 2P 2A, often denoting replicates, not
                # read pairings
                if len(b) > (s[0] + 1):
                    if (b[s[0] + 1] not in ("_", "-", ".")):
                        continue
                # if the 1/2 is not a separator or prefaced with R, skip
                if b[s[0] - 1] in ("R", "_", "-", "."):
                    used.add(in_file)
                    used.add(comp_file)
                    if b[s[0]] > a[s[0]]:
                        pairs.append([in_file, comp_file])
                    else:
                        pairs.append([comp_file, in_file])
                    break
        if in_file not in used:
            pairs.append([in_file])
            used.add(in_file)
    return pairs
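A minimal usage sketch of the pairing behavior above; the file names are hypothetical and assume the helpers used by combine_pairs are importable:

    # R1/R2 differ at exactly one digit position, so they form a pair; the
    # remaining file is returned as a single:
    # [["sampleA_R1.fq.gz", "sampleA_R2.fq.gz"], ["sampleB.fq.gz"]]
    files = ["sampleA_R1.fq.gz", "sampleA_R2.fq.gz", "sampleB.fq.gz"]
    print(combine_pairs(files))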
Example #19
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    assert data["analysis"].lower().startswith("wgbs-seq"), "No compatible alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))

    if not ref_file:
        logger.error("bismark index not found. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        data = dd.set_work_bam(data, final_out)
        data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
        data = dd.update_summary_qc(data, "bismark", base=data["bam_report"])
        return data

    bismark = config_utils.get_program("bismark", config)
    # bismark uses 5 threads/sample and ~12GB RAM/sample (hg38)
    resources = config_utils.get_resources("bismark", data["config"])
    max_cores = dd.get_num_cores(data)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) / (1024.0 * 1024.0)
    instances = calculate_bismark_instances(max_cores, max_mem * max_cores)
    # override instances if specified in the config
    if resources and resources.get("bismark_threads"):
        instances = resources.get("bismark_threads")
        logger.info(f"Using {instances} bismark instances - overriden by resources")
    bowtie_threads = 1
    if resources and resources.get("bowtie_threads"):
        bowtie_threads = resources.get("bowtie_threads")
    logger.info(f"Using {bowtie_threads} bowtie threads per bismark instance")
    kit = kits.KITS.get(dd.get_kit(data), None)
    directional = "--non_directional" if kit and not kit.is_directional else ""

    other_opts = resources.get("options", [])
    other_opts = " ".join([str(x) for x in other_opts]).strip()

    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    safe_makedir(align_dir)
    cmd = "{bismark} {other_opts} {directional} --bowtie2 --temp_dir {tx_out_dir} --gzip --parallel {instances} -p {bowtie_threads} -o {tx_out_dir} --unmapped {ref_file} {fastq_file} "
    if pair_file:
        fastq_file = "-1 %s -2 %s" % (fastq_file, pair_file)
    raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    if not raw_bam:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (fastq_file, ref_file)
            do.run(cmd.format(**locals()), run_message, None)
            shutil.move(tx_out_dir, out_dir)
        raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    # don't process bam in the bismark pipeline!
    utils.symlink_plus(raw_bam[0], final_out)
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
    data = dd.update_summary_qc(data, "bismark", base=data["bam_report"])
    return data
Example #20
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    assert data["analysis"].lower().startswith(
        "wgbs-seq"), "No compatible alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))

    if not ref_file:
        logger.error(
            "bismark index not found. We don't provide the bismark indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        data = dd.set_work_bam(data, final_out)
        data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
        return data

    bismark = config_utils.get_program("bismark", config)

    # bismark uses 5 threads/sample and ~12GB RAM/sample (hg38)
    resources = config_utils.get_resources("bismark", data["config"])
    max_cores = resources.get("cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G"))
    n = min(max(int(max_cores / 5), 1),
            max(int(max_mem / config_utils.convert_to_bytes("12G")), 1))
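    # n = number of parallel bismark instances, bounded by both the core budget
    # (5 threads each) and the memory budget (~12GB each)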

    kit = kits.KITS.get(dd.get_kit(data), None)
    directional = "--non_directional" if kit and not kit.is_directional else ""

    other_opts = resources.get("options", [])
    other_opts = " ".join([str(x) for x in other_opts]).strip()

    fastq_files = " ".join([fastq_file, pair_file
                            ]) if pair_file else fastq_file
    safe_makedir(align_dir)
    cmd = "{bismark} {other_opts} {directional} --bowtie2 --temp_dir {tx_out_dir} --gzip --multicore {n} -o {tx_out_dir} --unmapped {ref_file} {fastq_file}"
    if pair_file:
        fastq_file = "-1 %s -2 %s" % (fastq_file, pair_file)
    raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    if not raw_bam:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (fastq_file,
                                                                    ref_file)
            do.run(cmd.format(**locals()), run_message, None)
            shutil.move(tx_out_dir, out_dir)
        raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    process_bam = _process_bam(raw_bam[0], fastq_files, sample,
                               dd.get_sam_ref(data), config)
    utils.symlink_plus(process_bam, final_out)
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
    return data
Example #21
def _get_quality_format(config):
    SUPPORTED_FORMATS = ["illumina", "standard"]
    quality_format = config["algorithm"].get("quality_format", "standard").lower()
    if quality_format not in SUPPORTED_FORMATS:
        logger.error("quality_format is set to an unsupported format. "
                     "Supported formats are %s."
                     % (", ".join(SUPPORTED_FORMATS)))
        exit(1)
    return quality_format
Example #22
def _get_pipeline(item):
    from bcbio.log import logger
    analysis_type = item.get("analysis", "").lower()
    if analysis_type not in SUPPORTED_PIPELINES:
        logger.error("Cannot determine which type of analysis to run, "
                      "set in the run_info under details.")
        sys.exit(1)
    else:
        return SUPPORTED_PIPELINES[analysis_type]
Example #23
def index(data, bam_fpath):
    cmdl = make_command(data, "index", bam_fpath)
    indexed_bam = bam_fpath + ".bai"
    if not utils.file_uptodate(indexed_bam, bam_fpath):
        do.run(cmdl, "Indexing BAM file using sambamba")
        if not utils.file_exists(indexed_bam):
            logger.error("Cannot index BAM file " + bam_fpath + " using sambamba.")
            return None
    return indexed_bam
Example #24
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name
    """
    fq1 = data["files"][0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)

    if not transform:
        logger.info("No UMI transform specified, assuming pre-transformed data.")
        if is_transformed(fq1):
            logger.info("%s detected as pre-transformed, passing it on unchanged." % fq1)
            data["files"] = [fq1]
            return data
        else:
            logger.error("No UMI transform was specified, but %s does not look "
                         "pre-transformed. Assuming non-umi data." % fq1)
            return data

    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s."
                % (dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return data
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)
        if "UMI_" in read:
            data["files"] = [out_file]
            return data

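    # pipe the transformed reads through seqtk to drop reads shorter than 20 bp, then gzip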
    cmd = ("{umis} fastqtransform {transform_file} "
           "--cores {cores} "
           "{fq1}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return data
Example #25
def _check_bowtie(ref_file, config):
    if not _bowtie_ref_match(ref_file, config):
        logger.error("Bowtie version %d was detected but the reference "
                     "file %s is built for version %d. Download version "
                     "%d or build it with bowtie-build."
                     % (_bowtie_major_version(config), ref_file,
                        _ref_version(ref_file),
                        _bowtie_major_version(config)))
        exit(1)
Example #26
def _get_quality_format(config):
    SUPPORTED_FORMATS = ["illumina", "standard"]
    quality_format = config["algorithm"].get("quality_format", "standard").lower()
    if quality_format not in SUPPORTED_FORMATS:
        logger.error("quality_format is set to an unsupported format. "
                     "Supported formats are %s."
                     % (", ".join(SUPPORTED_FORMATS)))
        exit(1)
    return quality_format
Example #27
def _get_pipeline(item):
    from bcbio.log import logger
    analysis_type = item.get("analysis", "").lower()
    if analysis_type not in SUPPORTED_PIPELINES:
        logger.error("Cannot determine which type of analysis to run, "
                     "set in the run_info under details.")
        sys.exit(1)
    else:
        return SUPPORTED_PIPELINES[analysis_type]
Example #28
def combine_pairs(input_files):
    """ calls files pairs if they are completely the same except
    for one has _1 and the other has _2 returns a list of tuples
    of pairs or singles.
    From bipy.utils (https://github.com/roryk/bipy/blob/master/bipy/utils.py)
    Adjusted to allow different input paths or extensions for matching files.
    """
    PAIR_FILE_IDENTIFIERS = set(["1", "2", "3"])

    pairs = []
    used = set([])
    for in_file in input_files:
        if in_file in used:
            continue
        for comp_file in input_files:
            if comp_file in used or comp_file == in_file:
                continue
            a = rstrip_extra(utils.splitext_plus(os.path.basename(in_file))[0])
            b = rstrip_extra(utils.splitext_plus(os.path.basename(comp_file))[0])
            if len(a) != len(b):
                continue
            s = dif(a,b)
            # no differences, then its the same file stem
            if len(s) == 0:
                logger.error("%s and %s have the same stem, so we don't know "
                             "how to assign it to the sample data in the CSV. To "
                             "get around this you can rename one of the files. "
                             "If they are meant to be the same sample run in two "
                             "lanes, combine them first with the "
                             "bcbio_prepare_samples.py script."
                             "(http://bcbio-nextgen.readthedocs.io/en/latest/contents/configuration.html#multiple-files-per-sample)"
                             % (in_file, comp_file))
                sys.exit(1)
            if len(s) > 1:
                continue  # skip if there is more than one difference
            if (a[s[0]] in PAIR_FILE_IDENTIFIERS and
                  b[s[0]] in PAIR_FILE_IDENTIFIERS):
                # if the 1/2 isn't the last digit before a separator, skip
                # this skips stuff like 2P 2A, often denoting replicates, not
                # read pairings
                if len(b) > (s[0] + 1):
                    if (b[s[0]+1] not in ("_", "-", ".")):
                        continue
                # if the 1/2 is not a separator or prefaced with R, skip
                if b[s[0]- 1] in ("R", "_", "-", "."):
                    used.add(in_file)
                    used.add(comp_file)
                    if b[s[0]] > a[s[0]]:
                        pairs.append([in_file, comp_file])
                    else:
                        pairs.append([comp_file, in_file])
                    break
        if in_file not in used:
            pairs.append([in_file])
            used.add(in_file)
    return pairs
Example #29
def index(data, bam_fpath):
    cmdl = make_command(data, "index", bam_fpath)
    indexed_bam = bam_fpath + ".bai"
    if not utils.file_uptodate(indexed_bam, bam_fpath):
        do.run(cmdl, "Indexing BAM file using sambamba")
        if not utils.file_exists(indexed_bam):
            logger.error("Cannot index BAM file " + bam_fpath +
                         " using sambamba.")
            return None
    return indexed_bam
Example #30
def _get_pipeline(item):
    from bcbio.log import logger

    SUPPORTED_PIPELINES = {x.name.lower(): x for x in utils.itersubclasses(AbstractPipeline)}
    analysis_type = item.get("analysis", "").lower()
    if analysis_type not in SUPPORTED_PIPELINES:
        logger.error("Cannot determine which type of analysis to run, " "set in the run_info under details.")
        sys.exit(1)
    else:
        return SUPPORTED_PIPELINES[analysis_type]
Example #31
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))

    if not ref_file:
        logger.error(
            "STAR index not found. We don't provide the STAR indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        data = _update_data(final_out, out_dir, names, data)
        return data

    star_path = config_utils.get_program("STAR", config)
    fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)

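    # STAR streams SAM to stdout (--outStd SAM); it is converted to a sorted BAM below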
    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
           "--outStd SAM "
           "--outSAMunmapped Within --outSAMattributes %s" %
           " ".join(ALIGN_TAGS))
    cmd = cmd + " --readFilesCommand zcat " if is_gzipped(fastq_file) else cmd
    cmd += _read_group_option(names)
    fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"),
                               False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif "

    if dd.get_rsem(data) and not is_transcriptome_broken():
        cmd += " --quantMode TranscriptomeSAM "

    with tx_tmpdir(data) as tmp_dir:
        sam_to_bam = bam.sam_to_bam_stream_cmd(config)
        sort = bam.sort_cmd(config, tmp_dir)
        cmd += "| {sam_to_bam} | {sort} -o {tx_final_out} "
        run_message = "Running STAR aligner on %s and %s" % (fastq_file,
                                                             ref_file)
        with file_transaction(data, final_out) as tx_final_out:
            do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(final_out, out_dir, names, data)
    return data
Example #32
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    max_hits = 10
    srna = True if data["analysis"].lower().startswith("smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))

    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        data = _update_data(final_out, out_dir, names, data)
        return data
    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
           "--outStd SAM {srna_opts} "
           "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
    cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file)
    cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
    cmd += _read_group_option(names)
    fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded" and not srna:
        cmd += " --outSAMstrandField intronMotif "

    if dd.get_transcriptome_align(data) and not is_transcriptome_broken(data):
        cmd += " --quantMode TranscriptomeSAM "

    with file_transaction(data, final_out) as tx_final_out:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(final_out, out_dir, names, data)
    return data
Example #33
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources,
        data):
    """
    Run macs2 for chip and input samples avoiding
    errors due to samples.
    """
    # output file name need to have the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compress_and_sort_bdg_files(out_dir, data)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    antibody = dd.get_antibody(data)
    if antibody:
        antibody = antibody.lower()
        if antibody not in antibodies.SUPPORTED_ANTIBODIES:
            logger.error(
                f"{antibody} specified, but not listed as a supported antibody. "
                f"Valid antibodies are {antibodies.SUPPORTED_ANTIBODIES}. If you know your "
                f"antibody should be called with narrow or broad peaks, supply 'narrow' or "
                f"'broad' as the antibody; 'narrow' will be used when the antibody is not "
                f"supported.")
            antibody = 'narrow'
        antibody = antibodies.ANTIBODIES[antibody]
        logger.info(
            f"{antibody.name} specified, using {antibody.peaktype} peak settings."
        )
        peaksettings = select_peak_parameters(antibody)
    elif method == "atac":
        logger.info(f"ATAC-seq specified, using narrow peak settings.")
        peaksettings = " "
    else:
        peaksettings = " "
    options = " ".join(resources.get("macs2", {}).get("options", ""))
    genome_size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
    genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(data)
        cmd += peaksettings
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning(
                "macs2 terminated with an error. "
                "Please, check the message and report "
                "error if it is related to bcbio. "
                "You can add specific options for the sample "
                "setting resources as explained in docs: "
                "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources"
            )
    _compress_and_sort_bdg_files(out_dir, data)
    return _get_output_files(out_dir)
Example #34
def _ref_version(ref_file):
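    # infer the bowtie major version from the extension of the first matching index file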
    _, ext = os.path.splitext(glob.glob(ref_file + "*")[0])
    if ext == ".ebwt":
        return 1
    elif ext == ".bt2":
        return 2
    else:
        logger.error("Cannot detect which reference version %s is. "
                     "Should end in either .ebwt (bowtie) or .bt2 "
                     "(bowtie2)." % (ref_file))
        exit(1)
Example #35
def _get_pipeline(item):
    from bcbio.log import logger
    SUPPORTED_PIPELINES = {x.name.lower(): x for x in
                           utils.itersubclasses(AbstractPipeline)}
    analysis_type = item.get("analysis", "").lower()
    if analysis_type not in SUPPORTED_PIPELINES:
        logger.error("Cannot determine which type of analysis to run, "
                      "set in the run_info under details.")
        sys.exit(1)
    else:
        return SUPPORTED_PIPELINES[analysis_type]
Example #36
def get_sample_barcodes(fn, out_dir):
    if not fn:
        logger.error("Sample demultiplexing needs a list of known indexes provided "
                     "with via the sample_barcodes option in the algorithm section.")
        sys.exit(1)
    utils.safe_makedir(out_dir)
    out_fn = os.path.join(out_dir, "barcodes.csv")
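    # keep only the first CSV column (the barcode sequence), one per line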
    with open(fn) as inh:
        with open(out_fn, 'w') as outh:
            for line in inh:
                outh.write("%s\n" % (line.strip().split(",")[0]))
    return out_fn
Example #37
def htseq_reader(align_file):
    """
    returns a read-by-read sequence reader for a BAM or SAM file
    """
    if bam.is_sam(align_file):
        read_seq = HTSeq.SAM_Reader(align_file)
    elif bam.is_bam(align_file):
        read_seq = HTSeq.BAM_Reader(align_file)
    else:
        logger.error("%s is not a SAM or BAM file" % (align_file))
        sys.exit(1)
    return read_seq
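A short usage sketch; the BAM path is hypothetical and itertools is used only for illustration:

    import itertools

    reader = htseq_reader("sample.bam")
    for aln in itertools.islice(reader, 5):  # peek at the first five alignments
        print(aln.read.name)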
Example #38
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name
    """
    fqfiles = data["files"]
    fqfiles.extend(list(repeat("", 4-len(fqfiles))))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s."
                %(dd.get_umi_type(data), ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    cellular_barcodes = get_cellular_barcodes(data)
    if len(cellular_barcodes) > 1:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)  # Python 3 file iterators have no .next() method
        if "UMI_" in read:
            data["files"] = [out_file]
            return [[data]]

    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = ("Inserting UMI and barcode information into the read name of %s"
               % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
Example #39
def umi_transform(data):
    """
    transform each read by identifying the barcode and UMI for each read
    and putting the information in the read name
    """
    fqfiles = data["files"]
    fqfiles.extend(list(repeat("", 4 - len(fqfiles))))
    fq1, fq2, fq3, fq4 = fqfiles
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    safe_makedir(umi_dir)
    transform = dd.get_umi_type(data)
    if file_exists(transform):
        transform_file = transform
    else:
        transform_file = get_transform_file(transform)
        if not file_exists(transform_file):
            logger.error(
                "The UMI transform can be specified as either a file or a "
                "bcbio-supported transform. Either the file %s does not exist "
                "or the transform is not supported by bcbio. Supported "
                "transforms are %s." %
                (transform_file, ", ".join(SUPPORTED_TRANSFORMS)))
            sys.exit(1)
    out_base = dd.get_sample_name(data) + ".umitransformed.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    cellular_barcodes = get_cellular_barcodes(data)
    if len(cellular_barcodes) == 2:
        split_option = "--separate_cb"
    else:
        split_option = ""
    umis = config_utils.get_program("umis", data, default="umis")
    cores = dd.get_num_cores(data)
    # skip transformation if the file already looks transformed
    with open_fastq(fq1) as in_handle:
        read = next(in_handle)  # Python 3 file iterators have no .next() method
        if "UMI_" in read:
            data["files"] = [out_file]
            return [[data]]

    cmd = ("{umis} fastqtransform {split_option} {transform_file} "
           "--cores {cores} "
           "{fq1} {fq2} {fq3} {fq4}"
           "| seqtk seq -L 20 - | gzip > {tx_out_file}")
    message = (
        "Inserting UMI and barcode information into the read name of %s" % fq1)
    with file_transaction(out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
Example #40
def includes_missingalt(data):
    """
    As of GATK 4.1.0.0, variants with missing alts are generated
    (see https://github.com/broadinstitute/gatk/issues/5650)
    """
    MISSINGALT_VERSION = LooseVersion("4.1.0.0")
    version = LooseVersion(broad.get_gatk_version(config=dd.get_config(data)))
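    # comparing LooseVersion objects can raise TypeError for malformed version strings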
    try:
        return version >= MISSINGALT_VERSION
    except TypeError:
        logger.error(
            f"LooseVersion failing with {version} as the detected version.")
        sys.exit(1)
Example #41
def get_sample_barcodes(fn, out_dir):
    if not fn:
        logger.error(
            "Sample demultiplexing needs a list of known indexes provided "
            "with via the sample_barcodes option in the algorithm section.")
        sys.exit(1)
    utils.safe_makedir(out_dir)
    out_fn = os.path.join(out_dir, "barcodes.csv")
    with open(fn) as inh:
        with open(out_fn, 'w') as outh:
            for line in inh:
                outh.write("%s\n" % (line.strip().split(",")[0]))
    return out_fn
Example #42
File: trim.py Project: roryk/bipy
    def _cut_file(self, in_file):
        """
        run cutadapt on a single file

        """
        adapters = self._get_adapters(self.chemistry)
        out_file = self.in2trimmed(in_file)
        if file_exists(out_file):
            return out_file
        cutadapt = sh.Command(self.stage_config.get("program",
                                                    "cutadapt"))

        quality_format = self.quality_format
        if not quality_format:
            quality_format = self._detect_fastq_format(in_file)
        if quality_format == "sanger":
            logger.info("Quality format detected as sanger.")
            quality_base = 33
        elif quality_format == "illumina":
            logger.info("Quality format set to illumina 1.5/1.3")
            quality_base = 64
        else:
            logger.error("Quality format could not be detected. Quality "
                         "Detected or set as %s. It should be illumina "
                         "or sanger.")
            exit(1)

        # if we want to trim the polya tails we have to first remove
        # the adapters and then trim the tail
        if self.stage_config.get("trim_polya", True):
            temp_cut = tempfile.NamedTemporaryFile(suffix=".fastq",
                                                   dir=self.out_dir)
            # trim off adapters
            cmd = str(cutadapt.bake(in_file, self.options, adapters,
                                    quality_base=quality_base, out=temp_cut.name))
            do.run(cmd, "Cutadapt trim of adapters of %s." % (in_file), None)
            with file_transaction(out_file) as temp_out:
                polya = ADAPTERS.get("polya")
                # trim off polya
                cmd = str(cutadapt.bake(temp_cut.name, self.options, "-a",
                                        polya, "-a", self._rc_adapters(polya),
                                        quality_base=quality_base, out=temp_out))
                do.run(cmd, "Cutadapt trim of polyA tail of %s." % (temp_cut.name),
                       None)
            return out_file
        else:
            with file_transaction(out_file) as temp_out:
                cmd = str(cutadapt.bake(in_file, self.options, adapters,
                                        out=temp_out))
                do.run(cmd, "Cutadapt trim of %s." % (in_file))
            return out_file
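For reference, the two quality encodings that _cut_file distinguishes differ only in their ASCII offset: sanger (and modern Illumina 1.8+) FASTQ encodes Phred scores as Phred+33, while older Illumina 1.3/1.5 files use Phred+64. A hypothetical helper capturing that mapping:

OFFSETS = {"sanger": 33, "illumina": 64}

def quality_base(quality_format):
    """Return the ASCII offset for a detected FASTQ quality format."""
    try:
        return OFFSETS[quality_format]
    except KeyError:
        raise ValueError("quality format should be 'sanger' or 'illumina', "
                         "got %r" % quality_format)

print(quality_base("sanger"))  # 33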
Example #47
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))

    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        data = _update_data(final_out, out_dir, names, data)
        return data

    star_path = config_utils.get_program("STAR", config)
    fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
           "--outStd SAM "
           "--outSAMunmapped Within --outSAMattributes %s" % " ".join(ALIGN_TAGS))
    if is_gzipped(fastq_file):
        cmd += " --readFilesCommand zcat "
    cmd += _read_group_option(names)
    fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif "

    if dd.get_rsem(data) and not is_transcriptome_broken():
        cmd += " --quantMode TranscriptomeSAM "

    with tx_tmpdir(data) as tmp_dir:
        sam_to_bam = bam.sam_to_bam_stream_cmd(config)
        sort = bam.sort_cmd(config, tmp_dir)
        cmd += "| {sam_to_bam} | {sort} -o {tx_final_out} "
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        with file_transaction(data, final_out) as tx_final_out:
            do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(final_out, out_dir, names, data)
    return data
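The end of align streams STAR's SAM output straight through BAM conversion and coordinate sorting with no intermediate SAM on disk. A hedged sketch of the resulting shell pipeline, with illustrative paths and plain samtools standing in for the commands returned by bam.sam_to_bam_stream_cmd and bam.sort_cmd:

star_cmd = ("STAR --genomeDir star_index --readFilesIn sample.fq.gz "
            "--readFilesCommand zcat --outStd SAM")
pipeline = (star_cmd +
            " | samtools view -b -"                     # SAM stream -> BAM stream
            " | samtools sort -o sample.sorted.bam -")  # sort to the final BAM
# `pipeline` would then be handed to a shell runner such as do.run.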
Example #48
def _fetch_chrom_sizes(config):

    PROGRAM = "fetchChromSizes"

    if not program_exists(PROGRAM):
        logger.error("%s is not in the path or is not executable. Make sure "
                     "it is installed or go to "
                     "http://hgdownload.cse.ucsc.edu/admin/exe/"
                     "to download it." % (PROGRAM))
        exit(1)

    if "annotation" not in config:
        logger.error("'annotation' must be in the yaml file. See example "
                     " configuration files")
        exit(1)
    if "name" not in config["annotation"]:
        logger.error("'name' must be in the yaml file under  "
                     " 'annotation'. See example configuration files.")
        exit(1)
    genome = config["annotation"]["name"]
    chrom_size_file = os.path.join(_results_dir(config),
                                   genome + ".sizes")
    if file_exists(chrom_size_file):
        return chrom_size_file

    with file_transaction(chrom_size_file) as tmp_chrom_size_file:
        sh.fetchChromSizes(genome, _out=tmp_chrom_size_file)

    if not file_exists(chrom_size_file):
        logger.error("chromosome size file does not exist. Check "
                     "'annotation': 'name' to make sure it is valid.")
        exit(1)
    return chrom_size_file
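fetchChromSizes emits two tab-separated columns, chromosome name and length in bases. A minimal reader for that layout, assuming only the file format:

def read_chrom_sizes(path):
    """Parse a UCSC chrom.sizes file into a {chromosome: length} dict."""
    sizes = {}
    with open(path) as handle:
        for line in handle:
            name, size = line.split()[:2]
            sizes[name] = int(size)
    return sizes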
Example #49
def run_with_config(input_file, config, stage, out_file=None):

    if out_file is None:
        out_dir = os.path.join(config["dir"].get("results", None), stage)
        out_file = os.path.join(out_dir, _get_outfilename(input_file))
    else:
        out_dir = os.path.dirname(out_file)

    safe_makedir(out_dir)
    if "annotation" not in config:
        logger.error("annotation must appear in the config file, see example "
                     "configuration files.")
        exit(1)
    ref = prepare_ref_file(config["annotation"], config)
    out_file = run(input_file, ref, out_file)
    return out_file
Example #50
def run_rnaseq_variant_calling(data):
    variantcaller = dd.get_variantcaller(data)
    if isinstance(variantcaller, list) and len(variantcaller) > 1:
        logger.error("Only one variantcaller can be run for RNA-seq at "
                     "this time. Post an issue here "
                     "(https://github.com/chapmanb/bcbio-nextgen/issues) "
                     "if this is something you need to do.")
        sys.exit(1)

    if variantcaller and "gatk" in variantcaller:
        data = variation.rnaseq_gatk_variant_calling(data)
    if vardict.get_vardict_command(data):
        data = variation.rnaseq_vardict_variant_calling(data)
    return [[data]]
Example #53
def singlecell_rnaseq(samples, run_parallel):
    quantifier = dd.get_in_samples(samples, dd.get_singlecell_quantifier)
    quantifier = quantifier.lower()
    samples = run_parallel("run_umi_transform", samples)
    samples = run_parallel("run_barcode_histogram", samples)
    samples = run_parallel("run_filter_barcodes", samples)
    if quantifier == "rapmap":
        samples = run_parallel("run_rapmap_align", samples)
        samples = run_parallel("run_tagcount", samples)
    elif quantifier == "kallisto":
        samples = run_parallel("run_kallisto_singlecell", samples)
    else:
        logger.error(("%s is not supported for singlecell RNA-seq "
                      "quantification." % quantifier))
        sys.exit(1)
    return samples
Example #54
def _check_stems(files):
    """check if any file stems collide; if they do, full paths must be used"""
    used = set()
    for fn in files:
        if os.path.basename(fn) in used:
            logger.error("%s appears more than once in your file list, so "
                         "we don't know how to assign it to the sample data "
                         "in the CSV. We will fall back to full paths to "
                         "tell the files apart, which means paired files "
                         "must be in the same folder. If this is a problem, "
                         "rename the files you want to merge." %
                         os.path.basename(fn))
            return True
        used.add(os.path.basename(fn))
    return False
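A quick demonstration of the collision check with hypothetical paths: the basenames match even though the files live in different folders, so full paths are needed to tell them apart.

print(_check_stems(["/run1/sample_R1.fq", "/run2/sample_R1.fq"]))  # True
print(_check_stems(["/run1/a_R1.fq", "/run1/b_R1.fq"]))            # False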
Example #55
def bamindex(in_file, samtools="samtools"):
    """
    index a bam file
    avoids use of pysam.index which is not working for indexing as of 0.7.4
    with ipython
    """
    assert(is_bam(in_file)), "bamindex requires a BAM file, got %s" % in_file
    # samtools index writes the index alongside the BAM as <in_file>.bai
    out_file = in_file + ".bai"
    if file_exists(out_file):
        return out_file
    cmd = [samtools, "index", in_file]
    try:
        subprocess.check_call(cmd)
    except subprocess.CalledProcessError:
        cmd_string = subprocess.list2cmdline(cmd)
        logger.error("bamindex returned an error. The command "
                     "used to run bamindex was: %s." % (cmd_string))
    return out_file
Example #56
def bam2sam(in_file, samtools="samtools"):
    """
    converts a bam file to a sam file
    bam2sam("file.bam") -> "file.sam"
    """
    assert(is_bam(in_file)), "bam2sam requires a BAM file, got %s" % in_file
    out_file = replace_suffix(in_file, ".sam")
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tmp_out_file:
        #pysam.view("-h", "-o" + tmp_out_file, in_file)
        cmd = "{samtools} view -h -o {tmp_out_file} {in_file}".format(**locals())
        try:
            subprocess.check_call(cmd, shell=True)
        except subprocess.CalledProcessError:
            cmd_string = subprocess.list2cmdline(cmd)
            logger.error("bam2sam returned an error. The command "
                         "used to run bam2sam was: %s." % (cmd_string))
    return out_file
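Why the shell=True fix above matters: subprocess.check_call treats a plain string as the name of a single executable unless a shell parses it, while the list form bypasses the shell entirely. Both calls below assume samtools is on the PATH:

import subprocess

subprocess.check_call(["samtools", "--version"])         # list form, no shell
subprocess.check_call("samtools --version", shell=True)  # string form needs shell=True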
Example #57
def wig2bigwig(wiggle_file, chrom_size_file, out_file):
    """
    convert wiggle file to bigwig file using the UCSC tool
    """
    PROGRAM = "wigToBigWig"
    if not program_exists(PROGRAM):
        logger.error("%s is not in the path or is not executable. Make sure "
                     "it is installed or go to "
                     "http://hgdownload.cse.ucsc.edu/admin/exe/"
                     "to download it." % (PROGRAM))
        exit(1)

    if file_exists(out_file):
        return out_file

    wigToBigWig = sh.Command(which(PROGRAM))
    with file_transaction(out_file) as tx_out_file:
        cmd = str(wigToBigWig.bake(wiggle_file, chrom_size_file, tx_out_file))
        do.run(cmd, "Converting %s from wig to bigwig." % (wiggle_file), None)
    return out_file