Пример #1
0
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bwa mem with settings for aligning to the transcriptome for eXpress/RSEM/etc

    The output BAM is named after the work BAM with ".transcriptome" inserted
    before the extension. When that file already exists the alignment is
    skipped and only the data dictionary is updated.

    fastq_file -- first (or only) FASTQ file to align
    pair_file -- second FASTQ file for paired input; falsy for single-end
    ref_file -- handed to index_transcriptome together with the GTF file
    data -- sample dictionary; data["config"] supplies the program
        locations, extra bwa arguments and the core count

    Returns data with the transcriptome BAM set to the output file.
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        return dd.set_transcriptome_bam(data, out_file)
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        # Fill in the file names; previously the %s placeholders were logged
        # verbatim because no arguments were supplied.
        logger.info("bwa mem does not support the phred+64 quality format, "
                    "converting %s and %s to phred+33."
                    % (fastq_file, pair_file))
        fastq_file = fastq.groom(fastq_file, data, in_qual="fastq-illumina")
        if pair_file:
            pair_file = fastq.groom(pair_file, data, in_qual="fastq-illumina")
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    samtools = config_utils.get_program("samtools", data["config"])
    cmd = ("{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
           "{pair_file} | {samtools} view -bhS - > {tx_out_file}")

    with file_transaction(data, out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file,
                                                                pair_file)
        # Explicit fields instead of **locals(); a missing mate must become
        # an empty argument rather than the literal text "None".
        do.run(cmd.format(bwa=bwa, args=args, num_cores=num_cores,
                          gtf_fasta=gtf_fasta, fastq_file=fastq_file,
                          pair_file=pair_file or "", samtools=samtools,
                          tx_out_file=tx_out_file),
               message)
    return dd.set_transcriptome_bam(data, out_file)
Пример #2
0
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bwa mem with settings for aligning to the transcriptome for eXpress/RSEM/etc

    The output BAM is named after the work BAM with ".transcriptome" inserted
    before the extension. When that file already exists the alignment is
    skipped and only the data dictionary is updated.

    fastq_file -- first (or only) FASTQ file to align
    pair_file -- second FASTQ file for paired input; falsy for single-end
    ref_file -- handed to index_transcriptome together with the GTF file
    data -- sample dictionary; data["config"] supplies the bwa location,
        extra bwa arguments and the core count

    Returns data with the transcriptome BAM set to the output file.
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        return dd.set_transcriptome_bam(data, out_file)
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        # Fill in the file names; previously the %s placeholders were logged
        # verbatim because no arguments were supplied.
        logger.info(
            "bwa mem does not support the phred+64 quality format, "
            "converting %s and %s to phred+33." % (fastq_file, pair_file)
        )
        fastq_file = fastq.groom(fastq_file, in_qual="fastq-illumina", data=data)
        if pair_file:
            pair_file = fastq.groom(pair_file, in_qual="fastq-illumina", data=data)
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    cmd = (
        "{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
        "{pair_file} | samtools view -bhS - > {tx_out_file}"
    )

    with file_transaction(out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        # Explicit fields instead of **locals(); a missing mate must become
        # an empty argument rather than the literal text "None".
        do.run(
            cmd.format(
                bwa=bwa,
                args=args,
                num_cores=num_cores,
                gtf_fasta=gtf_fasta,
                fastq_file=fastq_file,
                pair_file=pair_file or "",
                tx_out_file=tx_out_file,
            ),
            message,
        )
    return dd.set_transcriptome_bam(data, out_file)
Пример #3
0
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    Run bowtie2 against the transcriptome index for eXpress/RSEM/etc

    The result is written next to the work BAM, with ".transcriptome" placed
    before the extension. An existing output file short-circuits the run.
    Paired input is passed as -1/-2, single-end input as -U, and the bowtie2
    output is piped into the command built by postalign.sam_to_sortbam_cl
    with name_sort=True.

    Returns data with the transcriptome BAM set to the output file.
    """
    work_bam = dd.get_work_bam(data)
    stem, suffix = os.path.splitext(work_bam)
    out_file = stem + ".transcriptome" + suffix
    if not utils.file_exists(out_file):
        # The command template is filled from locals(), so these names have
        # to match the placeholders used in it.
        bowtie2 = config_utils.get_program("bowtie2", data["config"])
        gtf_file = dd.get_gtf_file(data)
        gtf_index = index_transcriptome(gtf_file, ref_file, data)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        if pair_file:
            fastq_cmd = "-1 %s" % fastq_file
            pair_cmd = "-2 %s " % pair_file
        else:
            fastq_cmd = "-U %s" % fastq_file
            pair_cmd = ""
        bowtie2_cl = ("{bowtie2} -p {num_cores} -a -X 600 --rdg 6,5 --rfg 6,5 "
                      "--score-min L,-.6,-.4 --no-discordant --no-mixed "
                      "-x {gtf_index} {fastq_cmd} {pair_cmd} ")
        with file_transaction(data, out_file) as tx_out_file:
            message = ("Aligning %s and %s to the transcriptome."
                       % (fastq_file, pair_file))
            sort_cl = postalign.sam_to_sortbam_cl(data, tx_out_file,
                                                  name_sort=True)
            do.run((bowtie2_cl + "| " + sort_cl).format(**locals()), message)
    return dd.set_transcriptome_bam(data, out_file)
Пример #4
0
def _update_data(align_file, out_dir, names, data):
    """Record the alignment outputs in the data dictionary.

    align_file is stored as both the work BAM and the align BAM. The
    transcriptome BAM is only set when transcriptome alignment is requested
    for this sample and is_transcriptome_broken() reports false.
    """
    data = dd.set_align_bam(dd.set_work_bam(data, align_file), align_file)
    if not dd.get_transcriptome_align(data) or is_transcriptome_broken():
        return data
    moved_bam = _move_transcriptome_file(out_dir, names)
    return dd.set_transcriptome_bam(data, moved_bam)
Пример #5
0
def _update_data(align_file, out_dir, names, data):
    """Record the alignment outputs in the data dictionary.

    align_file is stored as both the work BAM and the align BAM. The
    transcriptome BAM is only set when transcriptome alignment is requested
    for this sample and is_transcriptome_broken(data) reports false.
    """
    data = dd.set_align_bam(dd.set_work_bam(data, align_file), align_file)
    if not dd.get_transcriptome_align(data) or is_transcriptome_broken(data):
        return data
    moved_bam = _move_transcriptome_file(out_dir, names)
    return dd.set_transcriptome_bam(data, moved_bam)
Пример #6
0
def _update_data(align_file, out_dir, names, data):
    """Record the alignment outputs in the data dictionary.

    align_file is stored as both the work BAM and the align BAM, and the file
    returned by _move_transcriptome_file as the transcriptome BAM. When a
    splice junction file is found for out_dir it is converted with
    junction2bed and stored as the junction BED.

    Returns the updated data dictionary.
    """
    data = dd.set_work_bam(data, align_file)
    data = dd.set_align_bam(data, align_file)
    transcriptome_file = _move_transcriptome_file(out_dir, names)
    data = dd.set_transcriptome_bam(data, transcriptome_file)
    sjfile = get_splicejunction_file(out_dir, data)
    # Skip the conversion when no junction file was found, rather than
    # handing a missing path to junction2bed.
    if sjfile:
        sjbed = junction2bed(sjfile)
        data = dd.set_junction_bed(data, sjbed)
    return data
Пример #7
0
def _update_data(align_file, out_dir, names, data):
    """Record the alignment outputs in the data dictionary.

    Stores align_file as the work and align BAM, the file returned by
    _move_transcriptome_file as the transcriptome BAM and, if a splice
    junction file is found, its junction2bed conversion as the junction BED.
    """
    data = dd.set_work_bam(data, align_file)
    data = dd.set_align_bam(data, align_file)
    data = dd.set_transcriptome_bam(
        data, _move_transcriptome_file(out_dir, names))
    junctions = get_splicejunction_file(out_dir, data)
    if not junctions:
        # Nothing to convert; leave the junction BED unset.
        return data
    return dd.set_junction_bed(data, junction2bed(junctions))
Пример #8
0
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    Run bowtie2 against the transcriptome index for eXpress/RSEM/etc

    The result is written next to the work BAM, with ".transcriptome" placed
    before the extension. An existing output file short-circuits the run.
    Paired input is passed as -1/-2, single-end input as -U, and the bowtie2
    output is piped through "samtools view" into the output BAM.

    Returns data with the transcriptome BAM set to the output file.
    """
    stem, suffix = os.path.splitext(dd.get_work_bam(data))
    out_file = stem + ".transcriptome" + suffix
    if not file_exists(out_file):
        bowtie2 = config_utils.get_program("bowtie2", data["config"])
        gtf_index = index_transcriptome(dd.get_gtf_file(data), ref_file, data)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        if pair_file:
            fastq_cmd = "-1 %s" % fastq_file
            pair_cmd = "-2 %s " % pair_file
        else:
            fastq_cmd = "-U %s" % fastq_file
            pair_cmd = ""
        template = (
            "{bowtie2} -p {num_cores} -a -X 600 --rdg 6,5 --rfg 6,5 "
            "--score-min L,-.6,-.4 --no-discordant --no-mixed "
            "-x {gtf_index} {fastq_cmd} {pair_cmd} "
            "| samtools view -hbS - > {tx_out_file}"
        )
        with file_transaction(out_file) as tx_out_file:
            message = "Aligning %s and %s to the transcriptome." % (
                fastq_file, pair_file)
            command = template.format(
                bowtie2=bowtie2,
                num_cores=num_cores,
                gtf_index=gtf_index,
                fastq_cmd=fastq_cmd,
                pair_cmd=pair_cmd,
                tx_out_file=tx_out_file,
            )
            do.run(command, message)
    return dd.set_transcriptome_bam(data, out_file)
Пример #9
0
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment

    Stores the featureCounts result under data["count_file"]. In fusion mode
    oncofuse is run and a truthy result is stored under
    data["oncofuse_file"]. When a DEXSeq GFF is configured the DEXSeq counts
    are added. For the STAR aligner with RSEM enabled, the transcriptome BAM
    path is derived from the work BAM name.

    Returns the updated data wrapped as [[data]].
    """
    data["count_file"] = featureCounts.count(data)
    if dd.get_fusion_mode(data, False):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            # Reuse the result instead of running oncofuse a second time.
            data["oncofuse_file"] = oncofuse_file
    if dd.get_dexseq_gff(data, None):
        data = dd.set_dexseq_counts(data, dexseq.bcbio_run(data))
    # if RSEM was run, stick the transcriptome BAM file into the datadict
    if dd.get_aligner(data).lower() == "star" and dd.get_rsem(data):
        base, ext = os.path.splitext(dd.get_work_bam(data))
        data = dd.set_transcriptome_bam(data, base + ".transcriptome" + ext)
    return [[data]]
Пример #10
0
def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment

    Stores the featureCounts result under data["count_file"]. In fusion mode
    oncofuse is run and a truthy result is stored under
    data["oncofuse_file"]. When a DEXSeq GFF is configured the DEXSeq counts
    are added. For the STAR aligner with RSEM enabled, the transcriptome BAM
    path is derived from the work BAM name.

    Returns the updated data wrapped as [[data]].
    """
    data["count_file"] = featureCounts.count(data)
    if dd.get_fusion_mode(data, False):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            # Reuse the result instead of running oncofuse a second time.
            data["oncofuse_file"] = oncofuse_file
    if dd.get_dexseq_gff(data, None):
        data = dd.set_dexseq_counts(data, dexseq.bcbio_run(data))
    # if RSEM was run, stick the transcriptome BAM file into the datadict
    if dd.get_aligner(data).lower() == "star" and dd.get_rsem(data):
        base, ext = os.path.splitext(dd.get_work_bam(data))
        data = dd.set_transcriptome_bam(data, base + ".transcriptome" + ext)
    return [[data]]
Пример #11
0
def run_rapmap_align(data):
    """Run rapmap_align in "quasi" mode for one sample.

    Exactly two input files are treated as a read pair; otherwise only the
    first file is used and the mate is None. Output goes under
    <work_dir>/rapmap/<sample name>, and the file returned by rapmap_align is
    stored as the transcriptome BAM.

    Returns the updated data wrapped as [[data]].
    """
    sample = dd.get_sample_name(data)
    reads = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    fq1, fq2 = reads if len(reads) == 2 else (reads[0], None)
    rapmap_dir = os.path.join(work_dir, "rapmap", sample)
    gtf_file = dd.get_gtf_file(data)
    fasta_file = dd.get_ref_file(data)
    aligned = rapmap_align(
        fq1, fq2, rapmap_dir, gtf_file, fasta_file, "quasi", data)
    return [[dd.set_transcriptome_bam(data, aligned)]]
Пример #12
0
def run_rapmap_align(data):
    """Run rapmap_align in "quasi" mode for one sample.

    Exactly two input files are treated as a read pair; otherwise only the
    first file is used and the mate is None. Output goes under
    <work_dir>/rapmap/<sample name>, and the file returned by rapmap_align is
    stored as the transcriptome BAM.

    Returns the updated data wrapped as [[data]].
    """
    sample = dd.get_sample_name(data)
    reads = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    fq1, fq2 = reads if len(reads) == 2 else (reads[0], None)
    rapmap_dir = os.path.join(work_dir, "rapmap", sample)
    gtf_file = dd.get_gtf_file(data)
    fasta_file = dd.get_ref_file(data)
    aligned = rapmap_align(
        fq1, fq2, rapmap_dir, gtf_file, fasta_file, "quasi", data)
    return [[dd.set_transcriptome_bam(data, aligned)]]
Пример #13
0
def run_rapmap_pseudoalign(data):
    """Run rapmap_pseudoalign for one sample.

    Exactly two input files are treated as a read pair; otherwise only the
    first file is used and the mate is None. The GTF and reference files are
    asserted to exist before the run. Output goes under
    <work_dir>/rapmap/<sample name>, and the file returned by
    rapmap_pseudoalign is stored as the transcriptome BAM.

    Returns the updated data wrapped as [[data]].
    """
    sample = dd.get_sample_name(data)
    reads = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    fq1, fq2 = reads if len(reads) == 2 else (reads[0], None)
    rapmap_dir = os.path.join(work_dir, "rapmap", sample)

    gtf_file = dd.get_gtf_file(data)
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file

    aligned = rapmap_pseudoalign(
        fq1, fq2, rapmap_dir, gtf_file, fasta_file, data)
    return [[dd.set_transcriptome_bam(data, aligned)]]
Пример #14
0
def _update_data(align_file, out_dir, names, data):
    """Record the alignment outputs in the data dictionary.

    align_file becomes both the work BAM and the align BAM; the file returned
    by _move_transcriptome_file(out_dir, names) becomes the transcriptome BAM.
    """
    data = dd.set_align_bam(dd.set_work_bam(data, align_file), align_file)
    moved_bam = _move_transcriptome_file(out_dir, names)
    return dd.set_transcriptome_bam(data, moved_bam)
Пример #15
0
def _update_data(align_file, out_dir, names, data):
    """Record the alignment outputs in the data dictionary.

    align_file becomes both the work BAM and the align BAM; the file returned
    by _move_transcriptome_file(out_dir, names) becomes the transcriptome BAM.
    """
    data = dd.set_align_bam(dd.set_work_bam(data, align_file), align_file)
    moved_bam = _move_transcriptome_file(out_dir, names)
    return dd.set_transcriptome_bam(data, moved_bam)