Exemplo n.º 1
0
def kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data):
    """Quantify transcripts with ``kallisto quant``.

    Output is written to ``<kallisto_dir>/quant``. If ``abundance.h5`` is
    already present in that directory the run is skipped. When ``fq2`` is
    falsy the input is treated as single-end and kallisto is given a fixed
    fragment length of 200 with a standard deviation of 25.

    Args:
        fq1: first FASTQ file handed to kallisto.
        fq2: second FASTQ file, or a falsy value for single-end input.
        kallisto_dir: working directory, passed to ``safe_makedir``.
        gtf_file: forwarded to ``kallisto_index``.
        fasta_file: forwarded to ``kallisto_index``.
        data: sample dictionary read through the ``dd`` accessors.

    Returns:
        Path of the ``quant`` output directory.
    """
    quant_dir = os.path.join(kallisto_dir, "quant")
    safe_makedir(kallisto_dir)
    # abundance.h5 is used as the marker of a finished quantification.
    if os.path.exists(os.path.join(quant_dir, "abundance.h5")):
        return quant_dir
    num_cores = dd.get_num_cores(data)
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    index = kallisto_index(gtf_file, fasta_file, data,
                           os.path.dirname(kallisto_dir))
    fusion_requested = dd.get_fusion_mode(data) or dd.get_fusion_caller(data)
    single_end = not fq2
    if single_end:
        logger.warning(
            "kallisto was run on single-end data and we set the "
            "estimated fragment length to 200 and the standard "
            "deviation to 25, if these don't reflect your data then "
            "the results may be inaccurate. Use with caution. See "
            "https://groups.google.com/forum/#!topic/kallisto-sleuth-users/h5LeAlWS33w "
            "for details.")
    cmd = ("{kallisto} quant {fusion_flag} -t {num_cores} {single_flag} "
           "{fraglength_flag} {sd_flag} {bootstrap_flag} "
           "-o {tx_out_dir} -i {index} {fq1} {fq2}")
    with file_transaction(data, quant_dir) as tx_out_dir:
        # Every placeholder is filled explicitly rather than via locals(),
        # so the command no longer depends on local variable names.
        command = cmd.format(
            kallisto=kallisto,
            fusion_flag="--fusion" if fusion_requested else "",
            num_cores=num_cores,
            single_flag="--single" if single_end else "",
            fraglength_flag="--fragment-length=200" if single_end else "",
            sd_flag="--sd=25" if single_end else "",
            bootstrap_flag="--bootstrap-samples=30",
            tx_out_dir=tx_out_dir,
            index=index,
            fq1=fq1,
            fq2="" if single_end else fq2)
        do.run(command, "Quantifying transcripts with kallisto.", None)
    return quant_dir
Exemplo n.º 2
0
def should_run_fusion(with_caller, config):
    """Tell whether fusion calling with ``with_caller`` is enabled.

    ``fusion_mode`` and ``fusion_caller`` are each read through ``dd`` first
    and, when that yields a falsy value, from the ``algorithm`` section of
    ``config``. The result is truthy when fusion mode is on and the configured
    caller is either ``None`` or equal to ``with_caller``; when fusion mode is
    off the falsy ``fusion_mode`` value itself is returned.
    """
    mode = (dd.get_fusion_mode(config)
            or utils.get_in(config, ("algorithm", "fusion_mode"), False))
    caller = (dd.get_fusion_caller(config)
              or utils.get_in(config, ("algorithm", "fusion_caller"), None))

    if not mode:
        return mode
    return caller in (None, with_caller)
Exemplo n.º 3
0
def kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data):
    """Quantify transcripts with ``kallisto quant``.

    Output is written to ``<kallisto_dir>/quant``. When ``fq2`` is falsy the
    input is treated as single-end and kallisto is given a fixed fragment
    length of 200 with a standard deviation of 25.

    Args:
        fq1: first FASTQ file handed to kallisto.
        fq2: second FASTQ file, or a falsy value for single-end input.
        kallisto_dir: working directory, passed to ``safe_makedir``.
        gtf_file: forwarded to ``kallisto_index``.
        fasta_file: forwarded to ``kallisto_index``.
        data: sample dictionary read through the ``dd`` accessors.

    Returns:
        Path of the ``quant`` output directory.
    """
    quant_dir = os.path.join(kallisto_dir, "quant")
    safe_makedir(kallisto_dir)
    num_cores = dd.get_num_cores(data)
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    index = kallisto_index(gtf_file, fasta_file, data, os.path.dirname(kallisto_dir))
    fusion_requested = dd.get_fusion_mode(data) or dd.get_fusion_caller(data)
    single_end = not fq2
    if single_end:
        logger.warning("kallisto was run on single-end data and we set the "
                       "estimated fragment length to 200 and the standard "
                       "deviation to 25, if these don't reflect your data then "
                       "the results may be inaccurate. Use with caution. See "
                       "https://groups.google.com/forum/#!topic/kallisto-sleuth-users/h5LeAlWS33w "
                       "for details.")
    cmd = ("{kallisto} quant {fusion_flag} -t {num_cores} {single_flag} "
           "{fraglength_flag} {sd_flag} {bootstrap_flag} "
           "-o {tx_out_dir} -i {index} {fq1} {fq2}")
    with file_transaction(data, quant_dir) as tx_out_dir:
        # Every placeholder is filled explicitly rather than via locals(),
        # so the command no longer depends on local variable names.
        command = cmd.format(
            kallisto=kallisto,
            fusion_flag="--fusion" if fusion_requested else "",
            num_cores=num_cores,
            single_flag="--single" if single_end else "",
            fraglength_flag="--fragment-length=200" if single_end else "",
            sd_flag="--sd=25" if single_end else "",
            bootstrap_flag="--bootstrap-samples=30",
            tx_out_dir=tx_out_dir,
            index=index,
            fq1=fq1,
            fq2="" if single_end else fq2)
        do.run(command, "Quantifying transcripts with kallisto.", None)
    return quant_dir
Exemplo n.º 4
0
def should_run_fusion(with_caller, config):
    """Decide whether the fusion caller ``with_caller`` should run.

    Both settings are looked up via ``dd`` and fall back to the
    ``algorithm`` section of ``config`` when the first lookup is falsy.
    A falsy ``fusion_mode`` is returned unchanged; otherwise the answer is
    whether the configured caller is ``None`` or equals ``with_caller``.
    """
    mode = dd.get_fusion_mode(config)
    if not mode:
        mode = utils.get_in(config, ("algorithm", "fusion_mode"), False)
    caller = dd.get_fusion_caller(config)
    if not caller:
        caller = utils.get_in(config, ("algorithm", "fusion_caller"), None)

    accepted_callers = (None, with_caller)
    return mode and caller in accepted_callers
Exemplo n.º 5
0
def generate_transcript_counts(data):
    """Count features for one sample and align to the transcriptome if needed.

    The featureCounts result is stored under ``data["count_file"]``. With
    fusion mode on, a non-empty oncofuse result is recorded as well. When
    transcriptome alignment is requested but no transcriptome BAM is set,
    the reads are aligned to the transcriptome with bowtie2.

    Returns the sample wrapped as ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)

    if dd.get_fusion_mode(data, False):
        fusion_out = oncofuse.run(data)
        if fusion_out:
            data = dd.set_oncofuse_file(data, fusion_out)

    needs_alignment = (dd.get_transcriptome_align(data)
                       and not dd.get_transcriptome_bam(data))
    if not needs_alignment:
        return [[data]]

    if dd.get_disambiguate(data):
        # Disambiguated samples take their reads from the work BAM
        # (presumably converted back to FASTQ -- confirm in alignprep).
        fastq_paths = alignprep._bgzip_from_bam(
            data["work_bam"], data["dirs"], data["config"],
            is_retry=False, output_infix='-transcriptome')
        if len(fastq_paths) == 2:
            file1, file2 = fastq_paths
        else:
            # Anything other than exactly two files is handled as single-end.
            file1, file2 = fastq_paths[0], None
    else:
        file1, file2 = dd.get_input_sequence_files(data)

    ref_file = dd.get_ref_file(data)
    logger.info("Transcriptome alignment was flagged to run, but the "
                "transcriptome BAM file was not found. Aligning to the "
                "transcriptome with bowtie2.")
    data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
    return [[data]]
Exemplo n.º 6
0
def quantitate_expression_parallel(samples, run_parallel):
    """
    Run the expression quantification steps for all samples.

    Each step is dispatched through ``run_parallel``, so the programs run
    here should be multithreaded to benefit from that threaded environment.
    Which optional tools run is decided from the first sample's settings.
    """
    first = samples[0][0]

    def _requested(tool):
        # Looked up at call time, exactly where each check happens.
        return tool in dd.get_expression_caller(first)

    samples = run_parallel("generate_transcript_counts", samples)
    if _requested("cufflinks"):
        samples = run_parallel("run_cufflinks", samples)
    if _requested("stringtie"):
        samples = run_parallel("run_stringtie_expression", samples)

    # kallisto is also run when fusion calling is requested.
    kallisto_needed = (_requested("kallisto")
                       or dd.get_fusion_mode(first)
                       or "pizzly" in dd.get_fusion_caller(first, []))
    if kallisto_needed:
        samples = run_parallel("run_kallisto_index", [samples])
        samples = run_parallel("run_kallisto_rnaseq", samples)
    if _requested("sailfish"):
        samples = run_parallel("run_sailfish_index", [samples])
        samples = run_parallel("run_sailfish", samples)

    # salmon runs unconditionally
    samples = run_parallel("run_salmon_index", [samples])
    samples = run_parallel("run_salmon_reads", samples)

    return run_parallel("detect_fusions", samples)
Exemplo n.º 7
0
def quantitate_expression_parallel(samples, run_parallel):
    """
    Quantify expression across all samples.

    Steps are dispatched through ``run_parallel``; programs run here should
    be multithreaded to take advantage of that threaded environment. The
    first sample's configuration decides which optional tools are used.
    """
    config_sample = samples[0][0]

    def _wants(tool):
        # Evaluated lazily so the lookup happens at the point of each check.
        return tool in dd.get_expression_caller(config_sample)

    samples = run_parallel("generate_transcript_counts", samples)
    if _wants("cufflinks"):
        samples = run_parallel("run_cufflinks", samples)
    if _wants("stringtie"):
        samples = run_parallel("run_stringtie_expression", samples)

    # Fusion calling (legacy fusion_mode or the pizzly caller) also
    # triggers the kallisto steps.
    run_kallisto = (_wants("kallisto")
                    or dd.get_fusion_mode(config_sample)
                    or "pizzly" in dd.get_fusion_caller(config_sample, []))
    if run_kallisto:
        samples = run_parallel("run_kallisto_index", [samples])
        samples = run_parallel("run_kallisto_rnaseq", samples)
    if _wants("sailfish"):
        samples = run_parallel("run_sailfish_index", [samples])
        samples = run_parallel("run_sailfish", samples)

    # salmon is always run
    samples = run_parallel("run_salmon_index", [samples])
    samples = run_parallel("run_salmon_reads", samples)

    samples = run_parallel("detect_fusions", samples)
    return samples
Exemplo n.º 8
0
def generate_transcript_counts(data):
    """Produce feature counts and, when required, a transcriptome alignment.

    ``data["count_file"]`` receives the featureCounts result. In fusion mode
    a non-empty oncofuse result is stored too. If transcriptome alignment is
    requested and no transcriptome BAM is set, bowtie2 aligns the reads to
    the transcriptome.

    Returns the sample wrapped as ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)

    if dd.get_fusion_mode(data, False):
        fusion_out = oncofuse.run(data)
        if fusion_out:
            data = dd.set_oncofuse_file(data, fusion_out)

    if not (dd.get_transcriptome_align(data) and not dd.get_transcriptome_bam(data)):
        return [[data]]

    if not dd.get_disambiguate(data):
        file1, file2 = dd.get_input_sequence_files(data)
    else:
        # Disambiguated samples take their reads from the work BAM
        # (presumably converted back to FASTQ -- confirm in alignprep).
        work_bam = data["work_bam"]
        fastq_paths = alignprep._bgzip_from_bam(
            work_bam, data["dirs"], data["config"],
            is_retry=False, output_infix='-transcriptome')
        # Exactly two files means paired reads; otherwise only the first is used.
        paired = len(fastq_paths) == 2
        file1 = fastq_paths[0]
        file2 = fastq_paths[1] if paired else None

    ref_file = dd.get_ref_file(data)
    logger.info("Transcriptome alignment was flagged to run, but the "
                "transcriptome BAM file was not found. Aligning to the "
                "transcriptome with bowtie2.")
    data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
    return [[data]]
Exemplo n.º 9
0
def detect_fusions(data):
    """Run every fusion caller configured for a single sample.

    The deprecated ``fusion_mode`` switch is translated into the caller list
    ``["oncofuse", "pizzly"]`` with a warning. Results of oncofuse and pizzly
    are stored on the sample when they are non-empty; pizzly additionally
    fills ``data["fusion"]`` with the paths of its FASTA and JSON outputs.

    Returns the sample wrapped as ``[[data]]``.
    """
    data = to_single_data(data)
    if dd.get_fusion_mode(data, False):
        # Legacy configuration: map the old flag onto explicit callers.
        data = dd.set_fusion_caller(data, ["oncofuse", "pizzly"])
        logger.warning(
            "``fusion_mode`` is deprecated in favor of turning on "
            "callers with ``fusion_caller``. It will run pizzly and "
            "oncofuse for now, but will eventually have support "
            "dropped.")
    callers = dd.get_fusion_caller(data, [])

    if "oncofuse" in callers:
        fusion_out = oncofuse.run(data)
        if fusion_out:
            data = dd.set_oncofuse_file(data, fusion_out)

    if "pizzly" in callers:
        out_dir = pizzly.run_pizzly(data)
        if out_dir:
            data = dd.set_pizzly_dir(data, out_dir)
            sample = dd.get_sample_name(data)
            fasta_path = os.path.join(out_dir, "%s.fusions.fasta" % sample)
            json_path = os.path.join(out_dir, "%s.json" % sample)
            data["fusion"] = {"fasta": fasta_path, "json": json_path}

    if "ericscript" in callers:
        # The return value is not recorded on the sample.
        ericscript.run(data)
    return [[data]]
Exemplo n.º 10
0
def detect_fusions(data):
    """Run oncofuse and pizzly for a sample when fusion mode is enabled.

    Non-empty results are stored on the sample through the ``dd`` setters.
    Returns the sample wrapped as ``[[data]]``.
    """
    if not dd.get_fusion_mode(data, False):
        return [[data]]

    fusion_out = oncofuse.run(data)
    if fusion_out:
        data = dd.set_oncofuse_file(data, fusion_out)

    out_dir = pizzly.run_pizzly(data)
    if out_dir:
        data = dd.set_pizzly_dir(data, out_dir)
    return [[data]]
Exemplo n.º 11
0
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment.

    Stores the featureCounts result under ``data["count_file"]``. In fusion
    mode a non-empty oncofuse result is stored under
    ``data["oncofuse_file"]``. When a DEXSeq GFF is configured the DEXSeq
    counts are attached as well.

    Returns the sample wrapped as ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)
    if dd.get_fusion_mode(data, False):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            # Store the result already computed; oncofuse is not run again.
            data["oncofuse_file"] = oncofuse_file
    if dd.get_dexseq_gff(data, None):
        data = dd.set_dexseq_counts(data, dexseq.bcbio_run(data))
    return [[data]]
Exemplo n.º 12
0
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment.

    Stores the featureCounts result under ``data["count_file"]``. In fusion
    mode a non-empty oncofuse result is stored under
    ``data["oncofuse_file"]``. When a DEXSeq GFF is configured the DEXSeq
    counts are attached. For STAR alignments with RSEM enabled the
    transcriptome BAM path is derived from the work BAM path.

    Returns the sample wrapped as ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)
    if dd.get_fusion_mode(data, False):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            # Store the result already computed; oncofuse is not run again.
            data["oncofuse_file"] = oncofuse_file
    if dd.get_dexseq_gff(data, None):
        data = dd.set_dexseq_counts(data, dexseq.bcbio_run(data))
    # if RSEM was run, stick the transcriptome BAM file into the datadict
    if dd.get_aligner(data).lower() == "star" and dd.get_rsem(data):
        # <work_bam base>.transcriptome<ext>
        base, ext = os.path.splitext(dd.get_work_bam(data))
        data = dd.set_transcriptome_bam(data, base + ".transcriptome" + ext)
    return [[data]]
Exemplo n.º 13
0
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment.

    The featureCounts result goes to ``data["count_file"]``. With fusion
    mode on, a non-empty oncofuse result goes to ``data["oncofuse_file"]``.
    DEXSeq counts are added when a DEXSeq GFF is configured, and for STAR
    alignments with RSEM enabled the transcriptome BAM path is derived from
    the work BAM path.

    Returns the sample wrapped as ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)
    if dd.get_fusion_mode(data, False):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            # Keep the first result instead of invoking oncofuse a second time.
            data["oncofuse_file"] = oncofuse_file
    if dd.get_dexseq_gff(data, None):
        data = dd.set_dexseq_counts(data, dexseq.bcbio_run(data))
    # if RSEM was run, stick the transcriptome BAM file into the datadict
    if dd.get_aligner(data).lower() == "star" and dd.get_rsem(data):
        # <work_bam base>.transcriptome<ext>
        base, ext = os.path.splitext(dd.get_work_bam(data))
        data = dd.set_transcriptome_bam(data, base + ".transcriptome" + ext)
    return [[data]]
Exemplo n.º 14
0
def generate_transcript_counts(data):
    """Count features for a sample and prepare a transcriptome BAM for RSEM.

    The featureCounts result is stored under ``data["count_file"]``; in
    fusion mode a non-empty oncofuse result is recorded too. If RSEM is
    flagged to run and no transcriptome BAM is set, the input reads are
    aligned to the transcriptome with bowtie2.

    Returns the sample wrapped as ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)
    if dd.get_fusion_mode(data, False):
        fusion_out = oncofuse.run(data)
        if fusion_out:
            data = dd.set_oncofuse_file(data, fusion_out)

    # RSEM requested but the aligner left no transcriptome BAM:
    # build one with bowtie2.
    rsem_needs_bam = dd.get_rsem(data) and not dd.get_transcriptome_bam(data)
    if not rsem_needs_bam:
        return [[data]]

    file1, file2 = dd.get_input_sequence_files(data)
    ref_file = dd.get_ref_file(data)
    logger.info("RSEM was flagged to run, but the transcriptome BAM file "
                "was not found. Aligning to the transcriptome with bowtie2.")
    data = bowtie2.align_transcriptome(file1, file2, ref_file, data)
    return [[data]]
Exemplo n.º 15
0
def detect_fusions(data):
    """Run the configured fusion callers (oncofuse, pizzly) for a sample.

    The deprecated ``fusion_mode`` switch is mapped onto the caller list
    ``["oncofuse", "pizzly"]`` and a warning is logged. Non-empty results
    are stored on the sample through the ``dd`` setters.

    Returns the sample wrapped as ``[[data]]``.
    """
    legacy_mode = dd.get_fusion_mode(data, False)
    if legacy_mode:
        # Old-style configuration: translate it into explicit callers.
        data = dd.set_fusion_caller(data, ["oncofuse", "pizzly"])
        logger.warning("``fusion_mode`` is deprecated in favor of turning on "
                       "callers with ``fusion_caller``. It will run pizzly and "
                       "oncofuse for now, but will eventually have support "
                       "dropped.")

    if "oncofuse" in dd.get_fusion_caller(data, []):
        fusion_out = oncofuse.run(data)
        if fusion_out:
            data = dd.set_oncofuse_file(data, fusion_out)

    # The caller list is read again from the possibly updated sample.
    if "pizzly" in dd.get_fusion_caller(data, []):
        out_dir = pizzly.run_pizzly(data)
        if out_dir:
            data = dd.set_pizzly_dir(data, out_dir)
    return [[data]]
Exemplo n.º 16
0
def generate_transcript_counts(data):
    """Generate feature counts and, for RSEM, a transcriptome alignment.

    ``data["count_file"]`` is set from featureCounts. In fusion mode a
    non-empty oncofuse result is stored on the sample. When RSEM is flagged
    to run without a transcriptome BAM being set, bowtie2 aligns the input
    reads to the transcriptome.

    Returns the sample wrapped as ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)
    if dd.get_fusion_mode(data, False):
        fusion_out = oncofuse.run(data)
        if fusion_out:
            data = dd.set_oncofuse_file(data, fusion_out)

    # RSEM is requested but no transcriptome BAM exists yet,
    # so create one with bowtie2.
    if dd.get_rsem(data) and not dd.get_transcriptome_bam(data):
        reads1, reads2 = dd.get_input_sequence_files(data)
        reference = dd.get_ref_file(data)
        message = ("RSEM was flagged to run, but the transcriptome BAM file "
                   "was not found. Aligning to the transcriptome with bowtie2.")
        logger.info(message)
        data = bowtie2.align_transcriptome(reads1, reads2, reference, data)
    return [[data]]
Exemplo n.º 17
0
def quantitate_expression_parallel(samples, run_parallel):
    """
    Run the expression quantification steps for all samples.

    Each step is dispatched through ``run_parallel``, so the programs run
    here should be multithreaded to benefit from that threaded environment.
    The first sample's settings decide which optional tools run and which
    salmon mode is used.
    """
    first = samples[0][0]
    to_index = determine_indexes_to_make(samples)

    def _requested(tool):
        # Looked up at call time, exactly where each check happens.
        return tool in dd.get_expression_caller(first)

    samples = run_parallel("generate_transcript_counts", samples)
    if _requested("cufflinks"):
        samples = run_parallel("run_cufflinks", samples)
    if _requested("stringtie"):
        samples = run_parallel("run_stringtie_expression", samples)

    # kallisto is also run when fusion calling is requested.
    kallisto_needed = (_requested("kallisto")
                       or dd.get_fusion_mode(first)
                       or "pizzly" in dd.get_fusion_caller(first, []))
    if kallisto_needed:
        run_parallel("run_kallisto_index", [to_index])
        samples = run_parallel("run_kallisto_rnaseq", samples)
    if _requested("sailfish"):
        run_parallel("run_sailfish_index", [to_index])
        samples = run_parallel("run_sailfish", samples)

    # salmon runs unconditionally; only the quantification mode varies
    run_parallel("run_salmon_index", [to_index])
    if not dd.get_quantify_genome_alignments(first):
        samples = run_parallel("run_salmon_reads", samples)
    elif dd.get_aligner(first).lower() == "star":
        samples = run_parallel("run_salmon_bam", samples)
    elif dd.get_genome_build(first) == "hg38":
        logger.warning(
            "Whole genome alignment-based Salmon quantification is "
            "only supported for the STAR aligner. Since this is hg38 we will fall "
            "back to the decoy method")
        samples = run_parallel("run_salmon_decoy", samples)
    else:
        logger.warning(
            "Whole genome alignment-based Salmon quantification is "
            "only supported for the STAR aligner. Falling back to the "
            "transcriptome-only method.")
        samples = run_parallel("run_salmon_reads", samples)

    return run_parallel("detect_fusions", samples)
Exemplo n.º 18
0
def generate_transcript_counts(data):
    """Generate feature counts and, for RSEM, a transcriptome alignment.

    ``data["count_file"]`` is set from featureCounts and, in fusion mode, a
    non-empty oncofuse result is stored on the sample. Samples using
    disambiguation only get an informational log entry. Otherwise, when RSEM
    is flagged to run without a transcriptome BAM being set, bowtie2 aligns
    the input reads to the transcriptome.

    Returns the sample wrapped as ``[[data]]``.
    """
    data["count_file"] = featureCounts.count(data)
    if dd.get_fusion_mode(data, False):
        fusion_out = oncofuse.run(data)
        if fusion_out:
            data = dd.set_oncofuse_file(data, fusion_out)

    if not dd.get_disambiguate(data):
        # RSEM is requested but no transcriptome BAM exists yet,
        # so create one with bowtie2.
        if dd.get_rsem(data) and not dd.get_transcriptome_bam(data):
            reads1, reads2 = dd.get_input_sequence_files(data)
            reference = dd.get_ref_file(data)
            logger.info("RSEM was flagged to run, but the transcriptome BAM file "
                        "was not found. Aligning to the transcriptome with bowtie2.")
            data = bowtie2.align_transcriptome(reads1, reads2, reference, data)
    else:
        logger.info("RSEM is not supported yet for disambiguation protocols. "
                    "See https://github.com/chapmanb/bcbio-nextgen/issues/859")
    return [[data]]
Exemplo n.º 19
0
def detect_fusions(data):
    """Run each fusion caller that is configured for a single sample.

    A set ``fusion_mode`` (deprecated) is converted into the caller list
    ``["oncofuse", "pizzly"]`` and a warning is logged. Non-empty oncofuse
    and pizzly results are stored on the sample; for pizzly the FASTA and
    JSON output paths are also placed under ``data["fusion"]``.

    Returns the sample wrapped as ``[[data]]``.
    """
    data = to_single_data(data)
    if dd.get_fusion_mode(data, False):
        # Legacy configuration: map the old flag onto explicit callers.
        data = dd.set_fusion_caller(data, ["oncofuse", "pizzly"])
        logger.warning("``fusion_mode`` is deprecated in favor of turning on "
                       "callers with ``fusion_caller``. It will run pizzly and "
                       "oncofuse for now, but will eventually have support "
                       "dropped.")
    callers = dd.get_fusion_caller(data, [])

    if "oncofuse" in callers:
        fusion_out = oncofuse.run(data)
        if fusion_out:
            data = dd.set_oncofuse_file(data, fusion_out)

    if "pizzly" in callers:
        out_dir = pizzly.run_pizzly(data)
        if out_dir:
            data = dd.set_pizzly_dir(data, out_dir)
            sample = dd.get_sample_name(data)
            data["fusion"] = {
                "fasta": os.path.join(out_dir, "%s.fusions.fasta" % sample),
                "json": os.path.join(out_dir, "%s.json" % sample),
            }

    if "ericscript" in callers:
        # The return value is not recorded on the sample.
        ericscript.run(data)
    return [[data]]