Пример #1
0
def main():
    """Run the TE coverage workflow using the module-level ``snakemake`` object.

    All inputs (config, input files, params, threads, output path) are read
    from ``snakemake``. The steps, in order: prepare and empty the
    ``results/coverage/`` directory, mask the reference with the consensus TE
    sequences, augment and index the genome, map the reads, convert SAM to
    BAM, compute the genome depth over the non-TE bed, then build the depth
    table and plots. The SAM file is removed once everything else is done.
    """
    out_root = snakemake.config["args"]['out']
    mccutils.mkdir(out_root + "/results/")
    coverage_out = out_root + "/results/coverage/"
    mccutils.mkdir(coverage_out)

    # start from a clean directory: drop whatever a previous run left behind
    for leftover in os.listdir(coverage_out):
        mccutils.remove(coverage_out + "/" + leftover)

    run_id = snakemake.config['args']['run_id']
    te_seqs = snakemake.input.consensus
    log = snakemake.params.log

    # masking is always done with the consensus fasta
    mccutils.mkdir(coverage_out + "/input")
    mccutils.mkdir(coverage_out + "/te-depth-files")
    masked_reference, masked_gff = repeatmask_genome(
        snakemake.input.ref, te_seqs, snakemake.threads, run_id, coverage_out,
        log)

    # a dedicated coverage fasta, when configured, replaces the consensus
    # for the augmentation and coverage analysis steps
    if snakemake.config['in']['coverage_fasta'] != "None":
        te_seqs = snakemake.input.coverage_fa

    augmented_reference = augment_genome(masked_reference, te_seqs,
                                         coverage_out)
    index_genome(snakemake.input.ref, log)
    index_genome(augmented_reference, log)

    # the second fastq is only forwarded when one was configured
    mate_kwargs = {}
    if snakemake.config['in']['fq2'] != "None":
        mate_kwargs["fq2"] = snakemake.input.fq2
    sam = map_reads(augmented_reference, snakemake.input.fq1,
                    snakemake.threads, snakemake.params.sample, run_id,
                    coverage_out, log, **mate_kwargs)

    bam = sam_to_bam(sam, augmented_reference, snakemake.params.sample,
                     snakemake.threads, run_id, coverage_out, log)
    nonte_bed = make_nonte_bed(snakemake.input.ref, masked_gff, run_id,
                               coverage_out, log)
    genome_depth = get_genome_depth(nonte_bed, bam, run_id, coverage_out, log)

    # number of bases to trim from TE edges: 0 unless edge omission is
    # enabled, then either the estimated read length or the configured length
    if not config.OMIT_EDGES:
        edge_trim = 0
    elif config.OMIT_EDGES_READ_LENGTH:
        edge_trim = mccutils.estimate_read_length(snakemake.input.fq1)
    else:
        edge_trim = config.OMIT_EDGES_LENGTH

    (te_names, all_coverage_files, uniq_coverage_files,
     avg_norm_te_depths) = make_depth_table(
         te_seqs, bam, genome_depth, run_id, coverage_out,
         snakemake.output[0], log, trim_edges=edge_trim)
    make_plots(te_names, all_coverage_files, uniq_coverage_files,
               avg_norm_te_depths, genome_depth, snakemake.params.sample,
               coverage_out, trim_edges=edge_trim)

    mccutils.remove(sam)
Пример #2
0
def make_run_summary(out_file_map,
                     commit,
                     methods,
                     failed_methods,
                     fq1,
                     fq2,
                     ref,
                     bam,
                     flagstat,
                     median_insert_size,
                     command,
                     execution_dir,
                     start_time,
                     log_dir,
                     out_dir,
                     out_file,
                     paired=False):
    """Build the run summary report, print it, and write it to ``out_file``.

    The report has a header (version, command, directory, start/end time), an
    optional "mapped read information" section, a per-method table of TE
    counts, and a failure banner when ``failed_methods`` is non-empty.

    Args:
        out_file_map: mapping indexed by method name; only methods whose
            entry contains ``"nonredundant.bed"`` get a table row, and that
            entry is handed to ``get_te_counts``.
        commit: version string printed after "McClintock Version:".
        methods: collection of method names; table rows for methods not in
            it are filled with ``NA``.
        failed_methods: sequence of method-name strings listed in the
            failure banner; the banner is omitted when it is empty.
        fq1: first fastq, passed to ``mccutils.estimate_read_length``.
        fq2: second fastq, only read when ``paired`` is true.
        ref: reference path; also passed to ``get_avg_coverage``.
        bam: BAM path; also passed to ``get_avg_coverage``.
        flagstat: path of the flagstat text file that is parsed for read
            counts.
        median_insert_size: path of a text file whose lines have the form
            ``<name>=<number>``.
        command: command line as a single space-separated string.
        execution_dir: directory string printed in the header.
        start_time: start time string printed in the header.
        log_dir: log directory string printed in the failure banner.
        out_dir: passed to ``get_avg_coverage``.
        out_file: path the report is written to (overwritten).
        paired: whether read2 information should be reported.

    Returns:
        tuple: ``(mapping_info, completed)``. ``mapping_info`` is a dict of
        the mapped-read values as strings, or ``None`` when any of ``bam``,
        ``flagstat``, ``median_insert_size`` or ``ref`` does not exist.
        ``completed`` is the timestamp string substituted for the end time.
    """
    section_rule = ("-" * 34) + "\n"
    out_lines = ["\n"]
    out_lines.append(section_rule)
    out_lines.append("MCCLINTOCK SUMMARY REPORT\n")
    out_lines.append(section_rule)

    # Put every option on its own backslash-continued line.
    # startswith() is used instead of indexing [0] because splitting on a
    # single space yields empty tokens for repeated/leading/trailing spaces,
    # and "".__getitem__(0) would raise IndexError.
    split_command = command.split(" ")
    for x, split in enumerate(split_command):
        if split.startswith("-"):
            split_command[x] = split.replace("-", " \\\n\t-", 1)
    command = " ".join(split_command)

    out_lines.append("McClintock Version: " + commit + "\n\n")
    out_lines.append("Command:\n" + command + "\n")
    out_lines.append("\nrun from directory: " + execution_dir + "\n")
    out_lines.append(pad("Started:", 12) + start_time + "\n")
    # placeholder: the real end time is filled in when the report is written
    out_lines.append(pad("Completed:", 12) + "{{END_TIME}}" + "\n")

    mapping_info = None
    # the mapped-read section needs all four files to be present
    if os.path.exists(bam) and os.path.exists(flagstat) and os.path.exists(
            median_insert_size) and os.path.exists(ref):
        mapping_info = {}
        out_lines.append(section_rule)
        out_lines.append("MAPPED READ INFORMATION\n")
        out_lines.append(section_rule)

        fq1_read_len = str(mccutils.estimate_read_length(fq1))
        mapping_info["read1_length"] = fq1_read_len
        out_lines.append(
            pad("read1 sequence length:", 24) + fq1_read_len + "\n")
        if paired:
            fq2_read_len = str(mccutils.estimate_read_length(fq2))
            mapping_info["read2_length"] = fq2_read_len
            out_lines.append(
                pad("read2 sequence length:", 24) + fq2_read_len + "\n")

        with open(flagstat, "r") as stat:
            for line in stat:
                if paired:
                    # the count is the text before the first "+"
                    if "read1" in line:
                        reads = line.split("+")[0].replace(" ", "")
                        mapping_info['read1_reads'] = str(reads)
                        out_lines.append(
                            pad("read1 reads:", 24) + str(reads) + "\n")

                    if "read2" in line:
                        reads = line.split("+")[0].replace(" ", "")
                        mapping_info['read2_reads'] = str(reads)
                        out_lines.append(
                            pad("read2 reads:", 24) + str(reads) + "\n")

                # NOTE(review): any line containing "mapped (" matches here;
                # newer samtools flagstat output may also include a
                # "primary mapped (" line -- confirm against the samtools
                # version in use.
                elif "mapped (" in line:
                    reads = line.split("+")[0].replace(" ", "")
                    mapping_info['read1_reads'] = str(reads)
                    out_lines.append(
                        pad("read1 reads:", 24) + str(reads) + "\n")

        with open(median_insert_size, "r") as median_insert:
            for line in median_insert:
                # tolerate blank lines (e.g. a trailing newline); a non-blank
                # line without "=" still raises so malformed input is noticed
                if not line.strip():
                    continue
                line = line.split("=")[1].replace("\n", "")
                insert_size = str(int(float(line)))
                mapping_info["median_insert_size"] = insert_size
                out_lines.append(
                    pad("median insert size:", 24) + insert_size + "\n")

        avg_genome_cov = str(get_avg_coverage(ref, bam, out_dir))
        mapping_info['avg_genome_cov'] = avg_genome_cov
        out_lines.append(
            pad("avg genome coverage:", 24) + avg_genome_cov + "\n")
        out_lines.append(section_rule)

    # first column is sized to the longest known method name
    len_longest_name = max((len(method) for method in config.ALL_METHODS),
                           default=0)

    width1 = len_longest_name + 2
    width2 = 10
    width3 = 13
    width4 = 14
    table_rule = "-" * (width1 + width2 + width3 + width4) + "\n"

    out_lines.append("\n")
    out_lines.append(table_rule)
    out_lines.append(
        pad("METHOD", width1) + pad("ALL", width2) + pad("REFERENCE", width3) +
        pad("NON-REFERENCE", width4) + "\n")
    out_lines.append(table_rule)
    for method in config.ALL_METHODS:
        # only methods whose mapped output mentions "nonredundant.bed" get a row
        if "nonredundant.bed" in out_file_map[method]:
            if method in methods:
                all_te, ref_te, nonref_te = get_te_counts(out_file_map[method])
                out_lines.append(
                    pad(method, width1) + pad(str(all_te), width2) +
                    pad(str(ref_te), width3) + pad(str(nonref_te), width4) +
                    "\n")
            else:
                out_lines.append(
                    pad(method, width1) + pad("NA", width2) +
                    pad("NA", width3) + pad("NA", width4) + "\n")

    out_lines.append(table_rule)

    if len(failed_methods) > 0:
        out_lines.append(
            "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n")
        out_lines.append(
            "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n")
        out_lines.append("!! SOME METHODS FAILED TO COMPLETE\n")
        out_lines.append("!! FAILED METHODS: " + " ".join(failed_methods) +
                         "\n")
        out_lines.append("!!\n")
        out_lines.append("!! Please check the associated logs:\n")
        out_lines.append("!!     " + log_dir + "\n")
        out_lines.append("!!\n")
        out_lines.append("!! Errors can be reported to:\n")
        out_lines.append(
            "!!     https://github.com/bergmanlab/mcclintock/issues \n")
        out_lines.append("!!\n")
        out_lines.append(
            "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n")
        out_lines.append(
            "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n")

    # bound up front so the return below never hits an unbound local
    completed = None
    with open(out_file, "w") as out:
        for line in out_lines:
            if "{{END_TIME}}" in line:
                # the end time is taken at write time, not at build time
                now = datetime.now()
                completed = now.strftime("%Y-%m-%d %H:%M:%S")
                line = line.replace("{{END_TIME}}", completed)
            # echo the report to stdout as well as writing it to the file
            print(line, end="")
            out.write(line)

    return mapping_info, completed
Пример #3
0
def make_run_summary(out_file_map,
                     methods,
                     fq1,
                     fq2,
                     ref,
                     bam,
                     flagstat,
                     median_insert_size,
                     command,
                     execution_dir,
                     start_time,
                     out_dir,
                     out_file,
                     paired=False):
    """Build the run summary report, print it, and write it to ``out_file``.

    The report has a header (command, directory, start/end time), an optional
    "mapped read information" section, and a per-method table of TE counts.

    Args:
        out_file_map: mapping indexed by method name; only methods whose
            entry contains ``"nonredundant.bed"`` get a table row, and that
            entry is handed to ``get_te_counts``.
        methods: collection of method names; table rows for methods not in
            it are filled with ``NA``.
        fq1: first fastq, passed to ``mccutils.estimate_read_length``.
        fq2: second fastq, only read when ``paired`` is true.
        ref: reference path; also passed to ``get_avg_coverage``.
        bam: BAM path; also passed to ``get_avg_coverage``.
        flagstat: path of the flagstat text file that is parsed for read
            counts.
        median_insert_size: path of a text file whose lines have the form
            ``<name>=<number>``.
        command: command line as a single space-separated string.
        execution_dir: directory string printed in the header.
        start_time: start time string printed in the header.
        out_dir: passed to ``get_avg_coverage``.
        out_file: path the report is written to (overwritten).
        paired: whether read2 information should be reported.

    Returns:
        None. The report is printed to stdout and written to ``out_file``.
    """
    section_rule = ("-" * 34) + "\n"
    out_lines = ["\n"]
    out_lines.append(section_rule)
    out_lines.append("MCCLINTOCK SUMMARY REPORT\n")
    out_lines.append(section_rule)

    # Put every option on its own backslash-continued line.
    # startswith() is used instead of indexing [0] because splitting on a
    # single space yields empty tokens for repeated/leading/trailing spaces,
    # and indexing an empty string would raise IndexError.
    split_command = command.split(" ")
    for x, split in enumerate(split_command):
        if split.startswith("-"):
            split_command[x] = split.replace("-", " \\\n\t-", 1)
    command = " ".join(split_command)

    out_lines.append("Command:\n" + command + "\n")
    out_lines.append("\nrun from directory: " + execution_dir + "\n")
    out_lines.append(pad("Started:", 12) + start_time + "\n")
    # placeholder: the real end time is filled in when the report is written
    out_lines.append(pad("Completed:", 12) + "{{END_TIME}}" + "\n\n")

    # the mapped-read section needs all four files to be present
    if os.path.exists(bam) and os.path.exists(flagstat) and os.path.exists(
            median_insert_size) and os.path.exists(ref):
        out_lines.append(section_rule)
        out_lines.append("MAPPED READ INFORMATION\n")
        out_lines.append(section_rule)

        out_lines.append(
            pad("read1 sequence length:", 24) +
            str(mccutils.estimate_read_length(fq1)) + "\n")
        if paired:
            out_lines.append(
                pad("read2 sequence length:", 24) +
                str(mccutils.estimate_read_length(fq2)) + "\n")

        with open(flagstat, "r") as stat:
            for line in stat:
                # the count is the text before the first "+"
                # NOTE(review): the "read1" line is reported even when
                # paired is False; for single-end data flagstat presumably
                # reports 0 there -- confirm whether that is intended.
                if "read1" in line:
                    reads = line.split("+")[0].replace(" ", "")
                    out_lines.append(
                        pad("read1 reads:", 24) + str(reads) + "\n")

                if paired and "read2" in line:
                    reads = line.split("+")[0].replace(" ", "")
                    out_lines.append(
                        pad("read2 reads:", 24) + str(reads) + "\n")

        with open(median_insert_size, "r") as median_insert:
            for line in median_insert:
                # tolerate blank lines (e.g. a trailing newline); a non-blank
                # line without "=" still raises so malformed input is noticed
                if not line.strip():
                    continue
                line = line.split("=")[1].replace("\n", "")
                out_lines.append(
                    pad("median insert size:", 24) + str(int(float(line))) +
                    "\n")

        out_lines.append(
            pad("avg genome coverage:", 24) +
            str(get_avg_coverage(ref, bam, out_dir)) + "\n")
        out_lines.append(section_rule)

    # first column is sized to the longest known method name
    len_longest_name = max((len(method) for method in config.ALL_METHODS),
                           default=0)

    width1 = len_longest_name + 2
    width2 = 10
    width3 = 13
    width4 = 14
    table_rule = "-" * (width1 + width2 + width3 + width4) + "\n"

    out_lines.append("\n")
    out_lines.append(table_rule)
    out_lines.append(
        pad("METHOD", width1) + pad("ALL", width2) + pad("REFERENCE", width3) +
        pad("NON-REFERENCE", width4) + "\n")
    out_lines.append(table_rule)
    for method in config.ALL_METHODS:
        # only methods whose mapped output mentions "nonredundant.bed" get a row
        if "nonredundant.bed" in out_file_map[method]:
            if method in methods:
                all_te, ref_te, nonref_te = get_te_counts(out_file_map[method])
                out_lines.append(
                    pad(method, width1) + pad(str(all_te), width2) +
                    pad(str(ref_te), width3) + pad(str(nonref_te), width4) +
                    "\n")
            else:
                out_lines.append(
                    pad(method, width1) + pad("NA", width2) +
                    pad("NA", width3) + pad("NA", width4) + "\n")

    out_lines.append(table_rule)

    with open(out_file, "w") as out:
        for line in out_lines:
            if "{{END_TIME}}" in line:
                # the end time is taken at write time, not at build time
                now = datetime.now()
                completed = now.strftime("%Y-%m-%d %H:%M:%S")
                line = line.replace("{{END_TIME}}", completed)
            # echo the report to stdout as well as writing it to the file
            print(line, end="")
            out.write(line)