def main():
    """Entry point for the coverage-analysis Snakemake rule.

    Masks TE sequences in the reference, augments it with TE consensus (or a
    user-supplied coverage fasta), maps reads to the augmented reference, and
    produces per-TE depth tables and plots under <out>/results/coverage/.
    Reads all inputs/params from the module-level ``snakemake`` object.
    """
    mcc_out = snakemake.config["args"]['out']
    mccutils.mkdir(mcc_out+"/results/")
    coverage_out = mcc_out+"/results/coverage/"
    mccutils.mkdir(coverage_out)

    # ensures intermediate files from previous runs are removed
    for f in os.listdir(coverage_out):
        mccutils.remove(coverage_out+"/"+f)

    run_id = snakemake.config['args']['run_id']
    te_seqs = snakemake.input.consensus
    log = snakemake.params.log

    # always use consensus fasta for masking the genome
    mccutils.mkdir(coverage_out+"/input")
    mccutils.mkdir(coverage_out+"/te-depth-files")
    masked_reference, masked_gff = repeatmask_genome(snakemake.input.ref, te_seqs, snakemake.threads, run_id, coverage_out, log)

    # uses coverage fasta (if exists) for augmenting and coverage analysis
    # NOTE: "None" is compared as a string because Snakemake config values are strings here
    if snakemake.config['in']['coverage_fasta'] != "None":
        te_seqs = snakemake.input.coverage_fa

    augmented_reference = augment_genome(masked_reference, te_seqs, coverage_out)
    index_genome(snakemake.input.ref, log)
    index_genome(augmented_reference, log)

    # single-end vs paired-end mapping, selected by presence of fq2
    if snakemake.config['in']['fq2'] == "None":
        sam = map_reads(augmented_reference, snakemake.input.fq1, snakemake.threads, snakemake.params.sample, run_id, coverage_out, log)
    else:
        sam = map_reads(augmented_reference, snakemake.input.fq1, snakemake.threads, snakemake.params.sample, run_id, coverage_out, log, fq2=snakemake.input.fq2)

    bam = sam_to_bam(sam, augmented_reference, snakemake.params.sample, snakemake.threads, run_id, coverage_out, log)
    nonte_bed = make_nonte_bed(snakemake.input.ref, masked_gff, run_id, coverage_out, log)
    genome_depth = get_genome_depth(nonte_bed, bam, run_id, coverage_out, log)

    # optionally trim TE edges from depth calculations, either by estimated
    # read length or by a fixed configured length (see config.OMIT_EDGES_*)
    edge_trim = 0
    if config.OMIT_EDGES:
        if config.OMIT_EDGES_READ_LENGTH:
            edge_trim = mccutils.estimate_read_length(snakemake.input.fq1)
        else:
            edge_trim = config.OMIT_EDGES_LENGTH

    te_names, all_coverage_files, uniq_coverage_files, avg_norm_te_depths = make_depth_table(te_seqs, bam, genome_depth, run_id, coverage_out, snakemake.output[0], log, trim_edges=edge_trim)
    make_plots(te_names, all_coverage_files, uniq_coverage_files, avg_norm_te_depths, genome_depth, snakemake.params.sample, coverage_out, trim_edges=edge_trim)
    # the intermediate SAM is large; remove it once the BAM exists
    mccutils.remove(sam)
def make_run_summary(out_file_map, commit, methods, failed_methods, fq1, fq2, ref, bam, flagstat, median_insert_size, command, execution_dir, start_time, log_dir, out_dir, out_file, paired=False):
    """Write the McClintock summary report to *out_file* and echo it to stdout.

    Builds a human-readable report: version/command header, mapped-read stats
    (parsed from samtools flagstat output and a median-insert-size file, when
    those files exist), a per-method TE-count table, and a failure banner if
    any methods failed.

    Returns:
        tuple: (mapping_info, completed) where mapping_info is a dict of
        read/coverage stats (None if the mapping files were absent) and
        completed is the completion timestamp string written into the report.
    """
    out_lines = ["\n"]
    out_lines.append(("-" * 34) + "\n")
    out_lines.append("MCCLINTOCK SUMMARY REPORT\n")
    out_lines.append(("-" * 34) + "\n")

    # Reformat the command so each option starts on its own escaped line.
    # FIX: use startswith instead of split[0] — consecutive/leading spaces in
    # the command produce empty tokens, and indexing "" raised IndexError.
    split_command = command.split(" ")
    for x, split in enumerate(split_command):
        if split.startswith("-"):
            split_command[x] = split_command[x].replace("-", " \\\n\t-", 1)
    command = " ".join(split_command)

    out_lines.append("McClintock Version: " + commit + "\n\n")
    out_lines.append("Command:\n" + command + "\n")
    out_lines.append("\nrun from directory: " + execution_dir + "\n")
    out_lines.append(pad("Started:", 12) + start_time + "\n")
    # placeholder substituted with the real timestamp when lines are written
    out_lines.append(pad("Completed:", 12) + "{{END_TIME}}" + "\n")

    mapping_info = None
    if os.path.exists(bam) and os.path.exists(flagstat) and os.path.exists(median_insert_size) and os.path.exists(ref):
        mapping_info = {}
        out_lines.append(("-" * 34) + "\n")
        out_lines.append("MAPPED READ INFORMATION\n")
        out_lines.append(("-" * 34) + "\n")
        fq1_read_len = str(mccutils.estimate_read_length(fq1))
        mapping_info["read1_length"] = fq1_read_len
        out_lines.append(pad("read1 sequence length:", 24) + fq1_read_len + "\n")
        if paired:
            fq2_read_len = str(mccutils.estimate_read_length(fq2))
            mapping_info["read2_length"] = fq2_read_len
            out_lines.append(pad("read2 sequence length:", 24) + fq2_read_len + "\n")

        # parse read counts out of samtools flagstat output ("N + M <label>")
        with open(flagstat, "r") as stat:
            for line in stat:
                if paired:
                    if "read1" in line:
                        reads = line.split("+")[0].replace(" ", "")
                        mapping_info['read1_reads'] = str(reads)
                        out_lines.append(pad("read1 reads:", 24) + str(reads) + "\n")
                    if "read2" in line:
                        reads = line.split("+")[0].replace(" ", "")
                        mapping_info['read2_reads'] = str(reads)
                        out_lines.append(pad("read2 reads:", 24) + str(reads) + "\n")
                # single-end runs have no read1/read2 lines; use "mapped ("
                elif "mapped (" in line:
                    reads = line.split("+")[0].replace(" ", "")
                    mapping_info['read1_reads'] = str(reads)
                    out_lines.append(pad("read1 reads:", 24) + str(reads) + "\n")

        # median insert size file is expected to hold "key=value" lines
        with open(median_insert_size, "r") as median_insert:
            for line in median_insert:
                line = line.split("=")[1].replace("\n", "")
                insert_size = str(int(float(line)))
                mapping_info["median_insert_size"] = insert_size
                out_lines.append(pad("median insert size:", 24) + insert_size + "\n")

        avg_genome_cov = str(get_avg_coverage(ref, bam, out_dir))
        mapping_info['avg_genome_cov'] = avg_genome_cov
        out_lines.append(pad("avg genome coverage:", 24) + avg_genome_cov + "\n")
        out_lines.append(("-" * 34) + "\n")

    # per-method TE prediction counts table; column 1 is sized to the longest
    # method name so the table stays aligned
    len_longest_name = 0
    for method in config.ALL_METHODS:
        if len(method) > len_longest_name:
            len_longest_name = len(method)
    width1 = len_longest_name + 2
    width2 = 10
    width3 = 13
    width4 = 14
    out_lines.append("\n")
    out_lines.append("-" * (width1) + "-" * width2 + "-" * width3 + "-" * width4 + "\n")
    out_lines.append(pad("METHOD", width1) + pad("ALL", width2) + pad("REFERENCE", width3) + pad("NON-REFERENCE", width4) + "\n")
    out_lines.append("-" * (width1) + "-" * width2 + "-" * width3 + "-" * width4 + "\n")
    for method in config.ALL_METHODS:
        if "nonredundant.bed" in out_file_map[method]:
            if method in methods:
                all_te, ref_te, nonref_te = get_te_counts(out_file_map[method])
                out_lines.append(pad(method, width1) + pad(str(all_te), width2) + pad(str(ref_te), width3) + pad(str(nonref_te), width4) + "\n")
            else:
                # method produces a bed but was not run in this invocation
                out_lines.append(pad(method, width1) + pad("NA", width2) + pad("NA", width3) + pad("NA", width4) + "\n")
    out_lines.append("-" * (width1) + "-" * width2 + "-" * width3 + "-" * width4 + "\n")

    if len(failed_methods) > 0:
        out_lines.append("\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n")
        out_lines.append("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n")
        out_lines.append("!! SOME METHODS FAILED TO COMPLETE\n")
        out_lines.append("!! FAILED METHODS: " + " ".join(failed_methods) + "\n")
        out_lines.append("!!\n")
        out_lines.append("!! Please check the associated logs:\n")
        out_lines.append("!! " + log_dir + "\n")
        out_lines.append("!!\n")
        out_lines.append("!! Errors can be reported to:\n")
        out_lines.append("!! https://github.com/bergmanlab/mcclintock/issues \n")
        out_lines.append("!!\n")
        out_lines.append("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n")
        out_lines.append("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n")

    # defensive init: the END_TIME placeholder line is always present, but
    # this guarantees the return below can never hit an unbound local
    completed = ""
    with open(out_file, "w") as out:
        for line in out_lines:
            if "{{END_TIME}}" in line:
                now = datetime.now()
                completed = now.strftime("%Y-%m-%d %H:%M:%S")
                line = line.replace("{{END_TIME}}", completed)
            print(line, end="")
            out.write(line)

    return mapping_info, completed
def make_run_summary(out_file_map, methods, fq1, fq2, ref, bam, flagstat, median_insert_size, command, execution_dir, start_time, out_dir, out_file, paired=False):
    """Write the McClintock summary report to *out_file* and echo it to stdout.

    Shorter variant (no version/failed-methods sections).
    NOTE(review): this file contains two ``make_run_summary`` definitions; if
    they live in the same module the later one shadows the earlier — confirm
    they belong to different modules.

    Builds the report: command header, mapped-read stats (parsed from samtools
    flagstat output and a median-insert-size file, when those files exist),
    and a per-method TE-count table. Returns None.
    """
    out_lines = ["\n"]
    out_lines.append(("-" * 34) + "\n")
    out_lines.append("MCCLINTOCK SUMMARY REPORT\n")
    out_lines.append(("-" * 34) + "\n")

    # Reformat the command so each option starts on its own escaped line.
    # FIX: use startswith instead of split[0] — consecutive/leading spaces in
    # the command produce empty tokens, and indexing "" raised IndexError.
    split_command = command.split(" ")
    for x, split in enumerate(split_command):
        if split.startswith("-"):
            split_command[x] = split_command[x].replace("-", " \\\n\t-", 1)
    command = " ".join(split_command)

    out_lines.append("Command:\n" + command + "\n")
    out_lines.append("\nrun from directory: " + execution_dir + "\n")
    out_lines.append(pad("Started:", 12) + start_time + "\n")
    # placeholder substituted with the real timestamp when lines are written
    out_lines.append(pad("Completed:", 12) + "{{END_TIME}}" + "\n\n")

    if os.path.exists(bam) and os.path.exists(flagstat) and os.path.exists(median_insert_size) and os.path.exists(ref):
        out_lines.append(("-" * 34) + "\n")
        out_lines.append("MAPPED READ INFORMATION\n")
        out_lines.append(("-" * 34) + "\n")
        out_lines.append(pad("read1 sequence length:", 24) + str(mccutils.estimate_read_length(fq1)) + "\n")
        if paired:
            out_lines.append(pad("read2 sequence length:", 24) + str(mccutils.estimate_read_length(fq2)) + "\n")

        # parse read counts out of samtools flagstat output ("N + M <label>")
        with open(flagstat, "r") as stat:
            for line in stat:
                if "read1" in line:
                    reads = line.split("+")[0].replace(" ", "")
                    out_lines.append(pad("read1 reads:", 24) + str(reads) + "\n")
                if paired and "read2" in line:
                    reads = line.split("+")[0].replace(" ", "")
                    out_lines.append(pad("read2 reads:", 24) + str(reads) + "\n")

        # median insert size file is expected to hold "key=value" lines
        with open(median_insert_size, "r") as median_insert:
            for line in median_insert:
                line = line.split("=")[1].replace("\n", "")
                out_lines.append(pad("median insert size:", 24) + str(int(float(line))) + "\n")

        out_lines.append(pad("avg genome coverage:", 24) + str(get_avg_coverage(ref, bam, out_dir)) + "\n")
        out_lines.append(("-" * 34) + "\n")

    # per-method TE prediction counts table; column 1 is sized to the longest
    # method name so the table stays aligned
    len_longest_name = 0
    for method in config.ALL_METHODS:
        if len(method) > len_longest_name:
            len_longest_name = len(method)
    width1 = len_longest_name + 2
    width2 = 10
    width3 = 13
    width4 = 14
    out_lines.append("\n")
    out_lines.append("-" * (width1) + "-" * width2 + "-" * width3 + "-" * width4 + "\n")
    out_lines.append(pad("METHOD", width1) + pad("ALL", width2) + pad("REFERENCE", width3) + pad("NON-REFERENCE", width4) + "\n")
    out_lines.append("-" * (width1) + "-" * width2 + "-" * width3 + "-" * width4 + "\n")
    for method in config.ALL_METHODS:
        if "nonredundant.bed" in out_file_map[method]:
            if method in methods:
                all_te, ref_te, nonref_te = get_te_counts(out_file_map[method])
                out_lines.append(pad(method, width1) + pad(str(all_te), width2) + pad(str(ref_te), width3) + pad(str(nonref_te), width4) + "\n")
            else:
                # method produces a bed but was not run in this invocation
                out_lines.append(pad(method, width1) + pad("NA", width2) + pad("NA", width3) + pad("NA", width4) + "\n")
    out_lines.append("-" * (width1) + "-" * width2 + "-" * width3 + "-" * width4 + "\n")

    with open(out_file, "w") as out:
        for line in out_lines:
            if "{{END_TIME}}" in line:
                now = datetime.now()
                completed = now.strftime("%Y-%m-%d %H:%M:%S")
                line = line.replace("{{END_TIME}}", completed)
            print(line, end="")
            out.write(line)