Example #1
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    config = data["config"]
    out_prefix = os.path.join(align_dir, names["lane"])
    out_file = out_prefix + "Aligned.out.sam"
    if file_exists(out_file):
        return out_file
    star_path = config_utils.get_program("STAR", config)
    fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
           "--outSAMunmapped Within")
    cmd += _read_group_option(names)
    fusion_mode = get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        # emit chimeric alignments for downstream fusion detection
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = get_in(data, ("config", "algorithm", "strandedness"),
                          "unstranded").lower()
    if strandedness == "unstranded":
        # add the XS strand attribute, which downstream transcript tools expect
        cmd += " --outSAMstrandField intronMotif"
    run_message = "Running STAR aligner on %s and %s." % (pair_file, ref_file)
    do.run(cmd.format(**locals()), run_message, None)
    picard = broad.runner_from_config(config)  # built here but not used in the rest of this function
    out_file = bam.sam_to_bam(out_file, config)
    out_file = _fix_sam_header(out_file, config)
    return out_file
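These snippets are shown without their imports. A sketch of the bcbio-style imports they appear to rely on follows; the exact module paths are an assumption and have shifted between bcbio releases.

import os

# Assumed bcbio module layout -- verify against the bcbio version in use.
from bcbio import bam, broad                     # bam.sam_to_bam, broad.runner_from_config
from bcbio.utils import (file_exists, get_in,    # filesystem and nested-dict helpers
                         safe_makedir, symlink_plus)
from bcbio.pipeline import config_utils          # config_utils.get_program
from bcbio.provenance import do                  # do.run executes shell commands with logging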
Example #2
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    config = data["config"]
    out_prefix = os.path.join(align_dir, names["lane"])
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % names["lane"])

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out
    star_path = config_utils.get_program("STAR", config)
    fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
           "--outSAMunmapped Within")
    cmd += _read_group_option(names)
    fusion_mode = get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = get_in(data, ("config", "algorithm", "strandedness"),
                          "unstranded").lower()
    if strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif"
    run_message = "Running STAR aligner on %s and %s." % (pair_file, ref_file)
    do.run(cmd.format(**locals()), run_message, None)
    out_file = bam.sam_to_bam(out_file, config)
    out_file = _fix_sam_header(out_file, config)
    if not file_exists(final_out):
        symlink_plus(out_file, final_out)
    return final_out
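Example #2 differs from Example #1 mainly in linking the converted BAM to a stable per-sample path via symlink_plus. That helper is not shown; below is a rough stand-in, under the assumption that it symlinks the file plus any neighbouring .bai index.

import os

def symlink_plus_sketch(orig, new):
    # Assumed behaviour of bcbio's symlink_plus: link the file and its index, if present.
    for ext in ["", ".bai"]:
        if os.path.exists(orig + ext) and not os.path.lexists(new + ext):
            os.symlink(os.path.abspath(orig + ext), new + ext)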
Example #3
File: star.py  Project: zeneofa/bcbio
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    config = data["config"]
    out_prefix = os.path.join(align_dir, names["lane"])
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % names["lane"])

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out
    star_path = config_utils.get_program("STAR", config)
    fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
           "--outSAMunmapped Within --outSAMattributes %s" %
           " ".join(ALIGN_TAGS))
    cmd += _read_group_option(names)
    fusion_mode = get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = get_in(data, ("config", "algorithm", "strandedness"),
                          "unstranded").lower()
    if strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif"
    run_message = "Running STAR aligner on %s and %s." % (pair_file, ref_file)
    do.run(cmd.format(**locals()), run_message, None)
    out_file = bam.sam_to_bam(out_file, config)
    if not file_exists(final_out):
        symlink_plus(out_file, final_out)
    return final_out
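Example #3 additionally requests explicit SAM attributes via --outSAMattributes using a module-level ALIGN_TAGS constant that the excerpt does not define. It is a list of STAR-supported SAM tags, presumably along these lines:

# Assumed definition; check star.py in the corresponding bcbio version for the exact tags.
ALIGN_TAGS = ["NH", "HI", "NM", "MD", "AS"]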
Example #4
def _align_from_fastq(fastq1, fastq2, aligner, align_ref, sam_ref, names, align_dir, data):
    """Align from fastq inputs, producing sorted BAM output.
    """
    config = data["config"]
    align_fn = TOOLS[aligner].align_fn
    out = align_fn(fastq1, fastq2, align_ref, names, align_dir, data)
    # handle align functions that update the main data dictionary in place
    if isinstance(out, dict):
        assert "work_bam" in out
        return out
    # handle output of raw SAM files that need to be converted to BAM
    else:
        work_bam = bam.sam_to_bam(out, config)
        data["work_bam"] = bam.sort(work_bam, config)
        return data
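Example #4 dispatches through TOOLS[aligner].align_fn, a registry that is not part of the excerpt. A minimal illustrative sketch of such a registry is below; the record type and field names are assumptions, not bcbio's actual definition.

from collections import namedtuple

# Illustrative only: map aligner names to records exposing an align_fn attribute.
ToolInfo = namedtuple("ToolInfo", ["align_fn"])

TOOLS = {
    "star": ToolInfo(align_fn=align),  # e.g. the STAR align() from Examples #1-#3
}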
Example #5
def merge_unmapped(mapped_sam, unmapped_bam, config):
    merged_bam = os.path.join(os.path.dirname(mapped_sam), "merged.bam")
    bam_file = bam.sam_to_bam(mapped_sam, config)
    if not file_exists(merged_bam):
        merged_bam = bam.merge([bam_file, unmapped_bam], merged_bam, config)
    return merged_bam
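The merge_unmapped example relies on bam.sam_to_bam and bam.merge, which conceptually amount to samtools view and samtools merge calls. For orientation, a standalone sketch of those underlying operations (this is not bcbio's actual implementation):

import os
import subprocess

def sam_to_bam_sketch(sam_file, cores=1):
    # Equivalent of the SAM -> BAM conversion: samtools view -b.
    bam_file = os.path.splitext(sam_file)[0] + ".bam"
    subprocess.check_call(["samtools", "view", "-@", str(cores), "-b",
                           "-o", bam_file, sam_file])
    return bam_file

def merge_sketch(bam_files, out_file):
    # Equivalent of merging mapped and unmapped reads: samtools merge.
    subprocess.check_call(["samtools", "merge", "-f", out_file] + list(bam_files))
    return out_file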
Example #6
        cmd += f"--fastqfile2 {fq2} --pairedend TRUE"
    check_call(cmd, shell=True)

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("bcbio_config", help="final bcbio configuration file.")
    parser.add_argument("bowtie_index", help="location of bowtie2 index")
    parser.add_argument("annotation", help="repeat annotation")
    parser.add_argument("rep2_setup", help="repenrich2 setup directory")
    parser.add_argument("--threads", default=1, help="Number of threads to use.")
    parser.add_argument("--outdir", default="align", help="output directory")
    parser.add_argument("--rep2_path", default="RepEnrich2", help="path to RepEnrich2 code")
    args = parser.parse_args()
    filedict = get_files(args.bcbio_config)
    safe_makedir(args.outdir)
    config = config["algorithm"] = {"num_cores": args.threads}
    repenrich = "RepEnrich2/RepEnrich.py"
    annotation = "metadata/hg38_repeatmasker_clean.txt"
    repenrich_setup = "metadata/RepEnrich2_setup_hg38"
    for samplename, files in filedict.items():
        out_dir = os.path.join(args.outdir, "repenrich", samplename)
        logging.info(f"Aligning {samplename} to {args.bowtie_index}.")
        out_sam = run_bowtie2(files, samplename, args.bowtie_index, args.threads, args.outdir)
        logging.info(f"Converting {out_sam} to BAM format.")
        out_bam = sam_to_bam(out_sam, config)
        logging.info(f"Subsetting {out_bam} into unique and multimapped reads.")
        subset_files = subset_reads(out_bam, samplename, args.rep2_path)
        logging.info(f"Running RepEnrich2 on {samplename}.")
        run_repenrich2(subset_files, samplename, args.annotation, args.rep2_path, args.rep2_setup, args.threads)
        logging.info(f"Finished {samplename}.")