def main():
    """Run the RetroSeq module for one sample.

    Reads inputs and parameters from the ``snakemake`` object, appends the
    input paths to the log file, splits the consensus fasta into elements,
    builds the BED location file and finally runs RetroSeq with
    ``config.PARAMETERS``.
    """
    smk_in = snakemake.input
    consensus_fasta = smk_in.consensus_fasta
    bam = smk_in.bam
    ref_fasta = smk_in.ref_fasta
    ref_te_bed = smk_in.ref_te_bed
    taxonomy = smk_in.taxonomy
    log = snakemake.params.log

    # Record the inputs in the run log before any work starts.
    logged_inputs = (
        ("consensus fasta: ", consensus_fasta),
        ("BAM: ", bam),
        ("reference fasta: ", ref_fasta),
        ("taxonomy TSV: ", taxonomy),
    )
    with open(log, "a") as log_handle:
        for label, path in logged_inputs:
            log_handle.write(label + path + "\n")

    smk_params = snakemake.params
    script_dir = smk_params.script_dir
    out_dir = smk_params.out_dir
    ref_name = smk_params.ref_name
    sample_name = smk_params.sample_name

    mccutils.log("retroseq", "running RetroSeq", log=log)

    elements = split_consensus_fasta(consensus_fasta, ref_name, out_dir)
    bed_location_file = make_consensus_beds(
        elements, ref_name, ref_te_bed, taxonomy, out_dir
    )
    run_retroseq(
        bam,
        bed_location_file,
        ref_fasta,
        script_dir,
        sample_name,
        out_dir,
        config.PARAMETERS,
        log=log,
    )

    mccutils.log("retroseq", "RetroSeq complete")
def main():
    """Prepare the inputs TEFLoN needs and map the reads.

    Converts the TE GFF to a reference BED, writes the TEFLoN taxonomy file,
    prepares the annotations and maps both fastq files. All paths come from
    the ``snakemake`` object.
    """
    mccutils.log("teflon", "setting up for TEFLoN")

    smk_in = snakemake.input
    te_gff = smk_in.te_gff
    taxonomy = smk_in.taxonomy
    consensus = smk_in.consensus
    reference_genome = smk_in.reference_genome
    fq1 = smk_in.fq1
    fq2 = smk_in.fq2

    threads = snakemake.threads

    smk_params = snakemake.params
    out_dir = smk_params.out_dir
    script_dir = smk_params.script_dir
    log = smk_params.log

    ref_bed = snakemake.output.ref_bed
    teflon_taxonomy = snakemake.output.teflon_taxonomy

    # Build the two derived annotation files first; prep_annotations reads them.
    make_reference_bed(te_gff, ref_bed)
    make_taxonomy_file(taxonomy, teflon_taxonomy)

    prep_annotations(
        script_dir,
        out_dir,
        ref_bed,
        teflon_taxonomy,
        consensus,
        reference_genome,
        log=log,
    )
    map_reads(out_dir, fq1, fq2, threads=threads, log=log)

    mccutils.log("teflon", "setup for TEFLoN complete")
def main():
    """Post-process raw TE-Locate output into redundant/non-redundant BEDs.

    Insertions are read with the configured read-pair support threshold and
    filtered against the reference TE GFF. When nothing remains, empty BED
    files are created with ``touch`` instead.
    """
    mccutils.log("te-locate", "processing TE-Locate results")

    telocate_raw = snakemake.input.telocate_raw
    te_gff = snakemake.input.te_gff
    out_dir = snakemake.params.out_dir
    sample_name = snakemake.params.sample_name
    chromosomes = snakemake.params.chromosomes.split(",")

    insertions = read_insertions(
        telocate_raw,
        sample_name,
        chromosomes,
        rp_threshold=config.READ_PAIR_SUPPORT_THRESHOLD,
    )
    insertions = filter_by_reference(insertions, te_gff)

    if len(insertions) > 0:
        insertions = make_redundant_bed(insertions, sample_name, out_dir)
        make_nonredundant_bed(insertions, sample_name, out_dir)
    else:
        # No calls: still create both (empty) BED files.
        for bed_kind in ("redundant", "nonredundant"):
            empty_bed = out_dir + "/" + sample_name + "_telocate_" + bed_kind + ".bed"
            mccutils.run_command(["touch", empty_bed])

    mccutils.log("te-locate", "TE-Locate post processing complete")
def main():
    """Create the TE-locate taxonomy (``_HL``) GFF via the supplied perl script.

    The reference GFF and taxonomy TSV are copied to fixed file names
    (relative to the current working directory), the perl script is run on
    them, and the resulting ``telocate_locations_HL.gff`` is copied to the
    first snakemake output. Any exception prints the traceback plus an error
    message to stderr and exits with status 1.
    """
    log = snakemake.params.log
    tmp_dir = snakemake.params.tmp_dir
    mccutils.mkdir(tmp_dir + "/telocate")

    mccutils.log("processing", "making TE-locate taxonomy file", log=log)

    # Fixed working file names used by the perl script.
    locations_copy = "telocate_locations.gff"
    taxonomy_copy = "telocate_taxonomy.tsv"
    script_result = "telocate_locations_HL.gff"

    try:
        mccutils.run_command(["cp", snakemake.input.ref_gff, locations_copy])
        mccutils.run_command(["cp", snakemake.input.taxonomy, taxonomy_copy])

        mccutils.run_command(
            ["perl", snakemake.input.script, locations_copy, taxonomy_copy, "Alias"],
            log=log,
        )

        mccutils.run_command(["cp", script_result, snakemake.output[0]])
        mccutils.check_file_exists(snakemake.output[0])

    except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        print(
            "ERROR...unable to produce TE-locate taxonomy file using",
            snakemake.input.script,
            file=sys.stderr,
        )
        sys.exit(1)

    mccutils.log("processing", "TE-locate taxonomy file created")
def format_gff(ingff):
    """Validate the feature IDs of a locations GFF and return them.

    Every non-comment line must have at least nine tab-separated columns and
    an ``ID=`` attribute in column nine. The ID must be unchanged by
    ``mccutils.replace_special_chars`` and must be unique within the file.

    Args:
        ingff: path to the GFF file to check.

    Returns:
        List of feature IDs in file order.

    Exits (via ``sys.exit``) when a line is malformed, lacks an ID, contains a
    problematic symbol, or repeats an ID already seen.
    """
    mccutils.log("setup","checking locations gff: "+ingff)

    gff_ids = []
    # Set mirror of gff_ids so the uniqueness test stays O(1) per feature
    # instead of rescanning the list for every line.
    seen_ids = set()

    with open(ingff,"r") as gff:
        for line in gff:
            if line.startswith("#"):
                continue

            split_line = line.split("\t")
            if len(split_line) < 9:
                sys.exit(ingff+" appears to be a malformed GFF file..exiting...\n")

            # If several ID= attributes are present the last one wins.
            gff_id = ""
            for feat in split_line[8].split(";"):
                if feat[:3] == "ID=":
                    gff_id = feat.split("=")[1].replace("\n","")

            # Check for a missing ID first so the user always gets the
            # specific "missing ID" message for such lines.
            if gff_id == "":
                sys.exit("GFF line: "+line+" is missing an ID attribute (ex. ID=chr1_TY1s1)\n")

            masked_gff_id = mccutils.replace_special_chars(gff_id)
            if gff_id != masked_gff_id:
                mccutils.log("setup", ingff+": ERROR problematic symbol in feature name: "+gff_id+" ... reformat this feature name for compatibility with McClintock")
                print("Problematic symbols:"," ".join(mccutils.INVALID_SYMBOLS))
                sys.exit(1)

            if masked_gff_id in seen_ids:
                sys.exit("ID: "+masked_gff_id+" is not unique. please ensure each feature has a unique ID\n")

            seen_ids.add(masked_gff_id)
            gff_ids.append(masked_gff_id)

    return gff_ids
def main():
    """Post-process PopoolationTE output into redundant/non-redundant BEDs.

    Insertions are read using the configured end-support and percent read
    support settings. When no insertions are found, empty BED files are
    created with ``touch``.
    """
    mccutils.log("popoolationte", "processing PopoolationTE results")

    popoolationte_out = snakemake.input.popoolationte_out
    smk_params = snakemake.params
    out_dir = smk_params.out_dir
    sample_name = smk_params.sample_name
    log = smk_params.log  # read but not used further in this function
    chromosomes = smk_params.chromosomes.split(",")

    insertions = read_insertions(
        popoolationte_out,
        sample_name,
        chromosomes,
        require_both_end_support=config.REQUIRE_BOTH_END_SUPPORT,
        percent_read_support_threshold=config.PERCENT_READ_SUPPORT_THRESHOLD,
    )

    if len(insertions) >= 1:
        insertions = mccutils.make_redundant_bed(
            insertions, sample_name, out_dir, method="popoolationte"
        )
        mccutils.make_nonredundant_bed(
            insertions, sample_name, out_dir, method="popoolationte"
        )
    else:
        # No calls: still create both (empty) BED files.
        for bed_kind in ("redundant", "nonredundant"):
            empty_bed = (
                out_dir + "/" + sample_name + "_popoolationte_" + bed_kind + ".bed"
            )
            mccutils.run_command(["touch", empty_bed])

    mccutils.log("popoolationte", "PopoolationTE postprocessing complete")
def main():
    """Create the coverage fasta output.

    When the ``coverage_fasta`` parameter is the string ``"None"`` an empty
    output file is touched; otherwise the supplied fasta is re-wrapped to
    80-character lines and written to the output. On any exception the
    traceback and an error message go to stderr, snakemake outputs 0-2 are
    removed and the process exits with status 1.
    """
    mccutils.log("processing", "making coverage fasta")

    line_width = 80
    try:
        coverage_fasta = snakemake.params.coverage_fasta
        if coverage_fasta != "None":
            wrapped_lines = fix_fasta.fix_fasta_lines(coverage_fasta, line_width)
            write_fasta(wrapped_lines, snakemake.output.coverage_fasta)
        else:
            mccutils.run_command(["touch", snakemake.output.coverage_fasta])

    except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        print(
            "ERROR...failed to create coverage fasta, check the formatting of :",
            snakemake.params.coverage_fasta,
            file=sys.stderr,
        )
        # NOTE(review): cleanup indexes outputs 0..2 -- confirm the rule
        # really declares three outputs.
        for output_index in range(3):
            mccutils.remove(snakemake.output[output_index])
        sys.exit(1)

    mccutils.log("processing", "coverage fasta created")
def main():
    """Produce the reference fasta and the (optionally augmented) reference.

    The reference is passed through ``fix_fasta_lines`` and
    ``mccutils.replace_special_chars_fasta``. If the ``augment`` parameter is
    not the string ``"None"``, the augment fasta gets the same treatment and is
    combined with the reference by ``augment_reference``. Results are copied to
    the two snakemake outputs; intermediates live under ``<mcc_out>/tmp``.
    """
    reference = snakemake.input.ref
    smk_params = snakemake.params
    augment = smk_params.augment
    mcc_out = smk_params.mcc_out
    run_id = smk_params.run_id
    log = smk_params.log  # read but not used further in this function
    out_ref = snakemake.output.ref
    out_aug_ref = snakemake.output.aug_ref

    tmp_dir = mcc_out + "/tmp"
    if not os.path.exists(tmp_dir):
        mccutils.mkdir(tmp_dir)

    mccutils.log("processing", "making reference fasta")

    # Intermediate files share this prefix and differ by a numeric suffix.
    tmp_prefix = tmp_dir + "/" + str(run_id) + "reference.tmp"

    reference = fix_fasta_lines(reference, tmp_prefix)
    reference = mccutils.replace_special_chars_fasta(reference, tmp_prefix + "1")

    if augment != "None":
        augment = fix_fasta_lines(augment, tmp_prefix + "2")
        augment = mccutils.replace_special_chars_fasta(augment, tmp_prefix + "3")
        augmented_reference = augment_reference(reference, augment, tmp_prefix + "4")
    else:
        # Without an augment fasta both outputs are the same file.
        augmented_reference = reference

    mccutils.run_command(["cp", reference, out_ref])
    mccutils.run_command(["cp", augmented_reference, out_aug_ref])

    mccutils.log("processing", "reference fasta created")
def main():
    """Post-process PopoolationTE2 predictions into redundant/non-redundant BEDs.

    Reference TEs are loaded from the TE GFF and taxonomy, predictions are
    read with the configured end-support and frequency settings, and the BED
    files are written. When no insertions are found, empty BED files are
    created with ``touch``.
    """
    mccutils.log("popoolationte2", "processing PopoolationTE2 results")

    smk_in = snakemake.input
    te_predictions = smk_in.popoolationte2_out
    te_gff = smk_in.te_gff
    taxonomy = smk_in.taxonomy

    smk_params = snakemake.params
    out_dir = smk_params.out_dir
    sample_name = smk_params.sample_name
    chromosomes = smk_params.chromosomes.split(",")
    log = smk_params.log  # read but not used further in this function

    ref_tes = get_ref_tes(te_gff, taxonomy, chromosomes)
    insertions = read_insertions(
        te_predictions,
        ref_tes,
        chromosomes,
        sample_name,
        both_end_support_needed=config.REQUIRE_BOTH_END_SUPPORT,
        support_threshold=config.FREQUENCY_THRESHOLD,
    )

    if len(insertions) >= 1:
        insertions = make_redundant_bed(insertions, sample_name, out_dir)
        make_nonredundant_bed(insertions, sample_name, out_dir)
    else:
        # No calls: still create both (empty) BED files.
        for bed_kind in ("redundant", "nonredundant"):
            empty_bed = (
                out_dir + "/" + sample_name + "_popoolationte2_" + bed_kind + ".bed"
            )
            mccutils.run_command(["touch", empty_bed])

    mccutils.log("popoolationte2", "PopoolationTE2 postprocessing complete")
def main():
    """Post-process tebreak output into BED files and a VCF.

    If the status file reports that earlier steps succeeded and at least one
    insertion is read, the redundant BED, non-redundant BED and VCF are
    written. In every other case both BED files are created empty with
    ``touch``.
    """
    mccutils.log("tebreak","running tebreak post processing")

    tebreak_out = snakemake.input.tebreak_out
    ref_fasta = snakemake.input.ref_fasta

    smk_params = snakemake.params
    out_dir = smk_params.out_dir
    ref_name = smk_params.ref_name  # read but not used further in this function
    sample_name = smk_params.sample_name
    chromosomes = smk_params.chromosomes.split(",")
    status_log = smk_params.status_log

    prev_steps_succeeded = mccutils.check_status_file(status_log)

    # Only read the results when the upstream steps finished successfully.
    if prev_steps_succeeded:
        insertions = read_insertions(tebreak_out, sample_name, chromosomes, config)
        have_insertions = len(insertions) > 0
    else:
        have_insertions = False

    if have_insertions:
        insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="tebreak")
        insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="tebreak")
        output.write_vcf(insertions, ref_fasta, sample_name, "tebreak", out_dir)
    else:
        # Upstream failure or no calls: still create both (empty) BED files.
        for bed_kind in ("redundant", "nonredundant"):
            mccutils.run_command(["touch", out_dir+"/"+sample_name+"_tebreak_"+bed_kind+".bed"])

    mccutils.log("tebreak","tebreak postprocessing complete")
def main():
    """Post-process RelocaTE output into redundant/non-redundant BEDs.

    Insertions are read from the RelocaTE GFF using the configured left/right
    thresholds for reference and non-reference calls, reference orientations
    are set from the TE GFF, and the BED files are written. When no
    insertions are found, empty BED files are created with ``touch``.
    """
    relocate_gff = snakemake.input.relocate_gff
    te_gff = snakemake.input.te_gff

    smk_params = snakemake.params
    out_dir = smk_params.out_dir
    log = smk_params.log  # read but not used further in this function
    sample_name = smk_params.sample_name
    chromosomes = smk_params.chromosomes.split(",")

    mccutils.log("relocate", "processing RelocaTE results")

    insertions = get_insertions(
        relocate_gff,
        sample_name,
        chromosomes,
        ref_l_threshold=config.REF_LEFT_THRESHOLD,
        ref_r_threshold=config.REF_RIGHT_THRESHOLD,
        nonref_l_threshold=config.NONREF_LEFT_THRESHOLD,
        nonref_r_threshold=config.NONREF_RIGHT_THRESHOLD,
    )
    insertions = set_ref_orientations(insertions, te_gff)

    if len(insertions) >= 1:
        insertions = make_redundant_bed(insertions, sample_name, out_dir)
        make_nonredundant_bed(insertions, sample_name, out_dir)
    else:
        # No calls: still create both (empty) BED files.
        for bed_kind in ("redundant", "nonredundant"):
            empty_bed = out_dir + "/" + sample_name + "_relocate_" + bed_kind + ".bed"
            mccutils.run_command(["touch", empty_bed])

    mccutils.log("relocate", "RelocaTE postprocessing complete")
def main():
    """Post-process RetroSeq output into BED files and a VCF.

    Insertions are read with the configured read-support and
    breakpoint-confidence thresholds. With at least one insertion the
    redundant BED, non-redundant BED and VCF are written; otherwise both BED
    files are created empty with ``touch``.
    """
    mccutils.log("retroseq", "processing RetroSeq results")

    retroseq_out = snakemake.input.retroseq_out
    reference_fasta = snakemake.input.reference_fasta

    smk_params = snakemake.params
    out_dir = smk_params.out_dir
    ref_name = smk_params.ref_name  # read but not used further in this function
    sample_name = smk_params.sample_name
    chromosomes = smk_params.chromosomes.split(",")

    insertions = read_insertions(
        retroseq_out,
        sample_name,
        chromosomes,
        support_threshold=config.READ_SUPPORT_THRESHOLD,
        breakpoint_threshold=config.BREAKPOINT_CONFIDENCE_THRESHOLD,
    )

    if len(insertions) >= 1:
        insertions = output.make_redundant_bed(
            insertions, sample_name, out_dir, method="retroseq"
        )
        insertions = output.make_nonredundant_bed(
            insertions, sample_name, out_dir, method="retroseq"
        )
        output.write_vcf(insertions, reference_fasta, sample_name, "retroseq", out_dir)
    else:
        # No calls: still create both (empty) BED files.
        for bed_kind in ("redundant", "nonredundant"):
            empty_bed = out_dir + "/" + sample_name + "_retroseq_" + bed_kind + ".bed"
            mccutils.run_command(["touch", empty_bed])

    mccutils.log("retroseq", "RetroSeq post processing complete")
def sort_bam(bam, sorted_bam, threads=1, log=None):
    """Sort ``bam`` with ``samtools sort`` and return the sorted BAM path.

    Args:
        bam: path of the BAM file to sort.
        sorted_bam: path the sorted BAM is written to (``-o``).
        threads: value passed to samtools ``-@``.
        log: log file forwarded to the mccutils helpers.

    Returns:
        ``sorted_bam``, unchanged.
    """
    mccutils.log("popoolationte2", "sorting BAM", log=log)

    sort_command = ["samtools", "sort", "-@", str(threads), bam, "-o", sorted_bam]
    mccutils.run_command(sort_command, log=log)

    return sorted_bam
def main():
    """Post-process ngs_te_mapper2 output into BED files and a VCF.

    If the status file reports success, insertions are read from the
    reference and non-reference BEDs. With at least one insertion the
    redundant BED, non-redundant BED and VCF are written; otherwise (or when
    earlier steps failed) both BED files are created empty with ``touch``.
    """
    ref_bed = snakemake.input.ref_bed
    nonref_bed = snakemake.input.nonref_bed
    reference_fasta = snakemake.input.reference_fasta

    log = snakemake.params.log
    sample_name = snakemake.params.sample_name
    out_dir = snakemake.params.out_dir
    chromosomes = snakemake.params.chromosomes.split(",")
    status_log = snakemake.params.status_log

    succeeded = mccutils.check_status_file(status_log)
    if succeeded:
        mccutils.log("ngs_te_mapper2","processing ngs_te_mapper2 results", log=log)
        insertions = read_insertions(ref_bed, nonref_bed, chromosomes, sample_name, out_dir)

        if len(insertions) > 0:
            insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="ngs_te_mapper2")
            # The result was previously bound to a misspelled name
            # ("intertions"), so the VCF was built from the redundant set.
            # Keep the non-redundant result so the VCF matches the
            # non-redundant BED, as the other post-processing scripts do.
            insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="ngs_te_mapper2")
            output.write_vcf(insertions, reference_fasta, sample_name, "ngs_te_mapper2", out_dir)
        else:
            mccutils.run_command(["touch", out_dir+"/"+sample_name+"_ngs_te_mapper2_redundant.bed"])
            mccutils.run_command(["touch", out_dir+"/"+sample_name+"_ngs_te_mapper2_nonredundant.bed"])

        mccutils.log("ngs_te_mapper2","ngs_te_mapper2 postprocessing complete")
    else:
        # Earlier steps failed: still create both (empty) BED files.
        mccutils.run_command(["touch", out_dir+"/"+sample_name+"_ngs_te_mapper2_redundant.bed"])
        mccutils.run_command(["touch", out_dir+"/"+sample_name+"_ngs_te_mapper2_nonredundant.bed"])
def make_plots(te_names, all_coverage_files, uniq_coverage_files, avg_norm_te_depths, genome_depth, sample_name, out, trim_edges=0):
    """Write one coverage plot (PNG) per TE into ``<out>/plots``.

    Args:
        te_names: TE names; used as the PNG file names.
        all_coverage_files: samtools depth files, parallel to ``te_names``.
        uniq_coverage_files: samtools depth files, parallel to ``te_names``.
        avg_norm_te_depths: per-TE value passed to ``plot_coverage`` as the
            horizontal line, parallel to ``te_names``.
        genome_depth: passed through to ``plot_coverage``.
        sample_name: passed through to ``plot_coverage``.
        out: output directory, with or without a trailing slash.
        trim_edges: passed through to ``plot_coverage``.
    """
    import os

    mccutils.log("coverage", "creating TE coverage plots")

    # Use one path for both creating the directory and writing into it.
    # Previously the directory was made at out + "/plots" but files were
    # written to out + "plots/...", which only agree when ``out`` ends in "/".
    plot_dir = os.path.join(out, "plots")
    mccutils.mkdir(plot_dir)

    plot_height = 3
    plot_width = 10

    for x, te_name in enumerate(te_names):
        chrom, all_pos, all_cov = read_samtools_depth_file(all_coverage_files[x])
        # The chromosome from the second depth file is not needed.
        _, uniq_pos, uniq_cov = read_samtools_depth_file(uniq_coverage_files[x])

        hline = avg_norm_te_depths[x]
        output = os.path.join(plot_dir, te_name + ".png")

        plot = plot_coverage(
            chrom,
            all_pos,
            all_cov,
            uniq_pos,
            uniq_cov,
            sample_name,
            plot_height,
            plot_width,
            genome_depth,
            hline,
            trim_edges=trim_edges,
        )
        plot.savefig(output, bbox_inches="tight")
        plot.close()

        mccutils.log("coverage", "plot created: " + output)
def map_reads(ref, fq1, fq2, out, threads=1, log=None):
    """Map reads with ``bwa bwasw`` and return the SAM path.

    The command's stdout is written to ``<out>/mapped.sam`` through
    ``mccutils.run_command_stdout``.

    Args:
        ref: reference passed to bwa.
        fq1: first fastq file.
        fq2: second fastq file.
        out: directory that receives ``mapped.sam``.
        threads: value passed to bwa ``-t``.
        log: log file forwarded to the mccutils helpers.

    Returns:
        Path of the SAM file.
    """
    mccutils.log("popoolationte2", "mapping reads", log=log)

    sam = out + "/" + "mapped.sam"
    bwa_command = ["bwa", "bwasw", "-t", str(threads), ref, fq1, fq2]
    mccutils.run_command_stdout(bwa_command, sam, log=log)

    return sam
def main():
    """Run TEFLoN on the mapped BAM.

    Builds the sample table from the output directory and BAM, then calls
    ``run_teflon`` with the thresholds taken from ``config.PARAMETERS``.
    """
    mccutils.log("teflon", "Running TEFLoN")

    # Several inputs are read here but not used further in this function.
    smk_in = snakemake.input
    consensus = smk_in.consensus
    reference_genome = smk_in.reference_genome
    ref_bed = smk_in.ref_bed
    teflon_taxonomy = smk_in.teflon_taxonomy
    bam = smk_in.bam

    threads = snakemake.threads

    smk_params = snakemake.params
    out_dir = smk_params.out_dir
    script_dir = smk_params.script_dir
    log = smk_params.log

    sample_table = make_sample_table(out_dir, bam)

    # run_teflon keyword -> key in config.PARAMETERS
    option_keys = {
        "quality_threshold": 'q',
        "stdev": 'sd',
        "cov": 'cov',
        "te_support1": 'n1',
        "te_support2": 'n2',
        "read_count_lower_threshold": 'lt',
        "read_count_higher_threshold": 'ht',
    }
    teflon_options = {}
    for option_name, config_key in option_keys.items():
        teflon_options[option_name] = config.PARAMETERS[config_key]

    run_teflon(
        script_dir,
        out_dir,
        sample_table,
        threads=threads,
        log=log,
        **teflon_options
    )
def make_nonte_bed(reference, masked_gff, run_id, out, log):
    """Create a BED file of the reference regions not covered by TEs.

    The RepeatMasker GFF is converted to BED and sorted, a bedtools genome
    file is derived from ``<reference>.fai``, and ``bedtools complement``
    writes the non-TE intervals. Intermediate files are removed afterwards.

    Args:
        reference: reference fasta path; its ``.fai`` index must exist.
        masked_gff: RepeatMasker GFF of reference TEs.
        run_id: string used in the intermediate/output file names.
        out: output directory; files are written under ``<out>/input``.
        log: log file forwarded to the mccutils helpers.

    Returns:
        Path of the non-TE BED file.
    """
    mccutils.log("coverage", "creating BED file of non-TE regions", log=log)

    masked_bed = out + "/input/" + run_id + "_ref_tes.bed"
    repeatmasker_gff_to_bed(masked_gff, masked_bed)

    sorted_bed = out + "/input/" + run_id + "_ref_tes_sorted.bed"
    mccutils.run_command_stdout(["bedtools", "sort", "-i", masked_bed], sorted_bed, log=log)

    # The previous full scan of the reference fasta for chromosome names was
    # dropped: its result was never used, and the .fai index below already
    # supplies the sequence names and lengths.
    chrom_idx = out + "/input/" + run_id + "_ref.genome"
    with open(reference + ".fai", "r") as faidx, open(chrom_idx, "w") as genome:
        for line in faidx:
            split_line = line.split("\t")
            # Genome file = first two .fai columns.
            genome.write("\t".join([split_line[0], split_line[1]]) + "\n")

    non_te_bed = out + "/input/" + run_id + "_ref_nonte.bed"
    command = ["bedtools", "complement", "-i", sorted_bed, "-g", chrom_idx]
    mccutils.run_command_stdout(command, non_te_bed, log=log)

    for f in [masked_bed, sorted_bed, chrom_idx]:
        mccutils.remove(f)

    return non_te_bed
def main():
    """Run the TEMP module (insertion and absence detection) for one sample.

    Appends the input paths to the log file, reads the median insert size,
    runs the TEMP insertion and absence steps, and finally removes
    intermediate ``.sorted.bam`` / ``.fastq`` files from the output directory.
    """
    bam = snakemake.input.bam
    twobit = snakemake.input.twobit
    consensus = snakemake.input.consensus
    ref_te_bed = snakemake.input.ref_te_bed
    taxonomy = snakemake.input.taxonomy
    median_insert_size_file = snakemake.input.median_insert_size
    log = snakemake.params.log

    with open(log, "a") as l:
        l.write("BAM: " + bam + "\n")
        l.write("2bit: " + twobit + "\n")
        l.write("consensus fasta: " + consensus + "\n")
        l.write("reference TE BED: " + ref_te_bed + "\n")
        l.write("Taxonomy TSV: " + taxonomy + "\n")

    threads = snakemake.threads
    out_dir = snakemake.params.out_dir
    scripts_dir = snakemake.params.scripts_dir
    sample_name = snakemake.params.sample_name

    mccutils.log("temp", "running TEMP Module")

    median_insert_size = get_median_insert_size(median_insert_size_file)

    run_temp_insertion(bam, scripts_dir, consensus, ref_te_bed, taxonomy, median_insert_size, threads, out_dir, log)
    run_temp_absence(bam, scripts_dir, consensus, ref_te_bed, twobit, taxonomy, median_insert_size, threads, out_dir, log)

    for f in os.listdir(out_dir):
        if ".sorted.bam" in f or ".fastq" in f:
            # os.listdir yields bare file names, so the path must be rebuilt
            # relative to out_dir; removing ``f`` alone would target the
            # current working directory instead.
            mccutils.remove(os.path.join(out_dir, f))
def main():
    """Concatenate the first three snakemake inputs into the first output.

    The files are joined with ``cat`` and the result is the PopoolationTE
    reference fasta.
    """
    mccutils.log("processing", "making PopoolationTE reference fasta")

    fasta_parts = [snakemake.input[index] for index in range(3)]
    mccutils.run_command_stdout(["cat"] + fasta_parts, snakemake.output[0])

    mccutils.log("processing", "PopoolationTE reference fasta created")
def main():
    """Post-process RetroSeq output into BED files and a VCF.

    If the status file reports that earlier steps succeeded and at least one
    insertion passes the configured thresholds, the redundant BED,
    non-redundant BED and VCF are written. In every other case both BED files
    are created empty with ``touch``.
    """
    mccutils.log("retroseq","processing RetroSeq results")

    retroseq_out = snakemake.input.retroseq_out
    reference_fasta = snakemake.input.reference_fasta

    smk_params = snakemake.params
    out_dir = smk_params.out_dir
    ref_name = smk_params.ref_name  # read but not used further in this function
    sample_name = smk_params.sample_name
    chromosomes = smk_params.chromosomes.split(",")
    status_log = smk_params.status_log

    prev_steps_succeeded = mccutils.check_status_file(status_log)

    # Only read the results when the upstream steps finished successfully.
    if prev_steps_succeeded:
        insertions = read_insertions(
            retroseq_out,
            sample_name,
            chromosomes,
            support_threshold=config.PARAMS["read_support_threshold"],
            breakpoint_threshold=config.PARAMS["breakpoint_confidence_threshold"],
        )
        have_insertions = len(insertions) >= 1
    else:
        have_insertions = False

    if have_insertions:
        insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="retroseq")
        insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="retroseq")
        output.write_vcf(insertions, reference_fasta, sample_name, "retroseq", out_dir)
    else:
        # Upstream failure or no calls: still create both (empty) BED files.
        for bed_kind in ("redundant", "nonredundant"):
            mccutils.run_command(["touch", out_dir+"/"+sample_name+"_retroseq_"+bed_kind+".bed"])

    mccutils.log("retroseq","RetroSeq post processing complete")
def main():
    """Post-process raw TE-Locate output into BED files and a VCF.

    If the status file reports success, insertions are read with the
    configured read-pair support threshold and filtered against the reference
    TE GFF. With at least one insertion the redundant BED, non-redundant BED
    and VCF are written; otherwise (or when earlier steps failed) both BED
    files are created empty with ``touch``.
    """
    mccutils.log("te-locate","processing TE-Locate results")

    telocate_raw = snakemake.input.telocate_raw
    te_gff = snakemake.input.te_gff
    reference_fasta = snakemake.input.reference_fasta

    out_dir = snakemake.params.out_dir
    sample_name = snakemake.params.sample_name
    chromosomes = snakemake.params.chromosomes.split(",")
    status_log = snakemake.params.status_log

    prev_steps_succeeded = mccutils.check_status_file(status_log)

    if prev_steps_succeeded:
        insertions = read_insertions(telocate_raw, sample_name, chromosomes, rp_threshold=config.PARAMS['read_pair_support_threshold'])
        insertions = filter_by_reference(insertions, te_gff)

        if len(insertions) > 0:
            insertions = output.make_redundant_bed(insertions, sample_name, out_dir, method="telocate")
            # The result was previously bound to a misspelled name
            # ("intertions"), so the VCF was built from the redundant set.
            # Keep the non-redundant result so the VCF matches the
            # non-redundant BED, as the other post-processing scripts do.
            insertions = output.make_nonredundant_bed(insertions, sample_name, out_dir, method="telocate")
            output.write_vcf(insertions, reference_fasta, sample_name, "telocate", out_dir)
        else:
            mccutils.run_command(["touch", out_dir+"/"+sample_name+"_telocate_redundant.bed"])
            mccutils.run_command(["touch", out_dir+"/"+sample_name+"_telocate_nonredundant.bed"])
    else:
        # Earlier steps failed: still create both (empty) BED files.
        mccutils.run_command(["touch", out_dir+"/"+sample_name+"_telocate_redundant.bed"])
        mccutils.run_command(["touch", out_dir+"/"+sample_name+"_telocate_nonredundant.bed"])

    mccutils.log("te-locate", "TE-Locate post processing complete")
def main():
    """Convert the reference fasta (first input) to 2bit (first output).

    Runs ``faToTwoBit`` through ``mccutils.run_command``.
    """
    log = snakemake.params.log

    mccutils.log(
        "processing", "creating 2bit file from reference genome fasta", log=log
    )

    fasta_path = snakemake.input[0]
    twobit_path = snakemake.output[0]
    mccutils.run_command(["faToTwoBit", fasta_path, twobit_path], log=log)

    mccutils.log("processing", "reference 2bit file created")
def main():
    """Post-process jitterbug output into BED files and a VCF.

    If the status file reports that earlier steps succeeded and at least one
    insertion passes the ``config.FILTER`` thresholds, the redundant BED,
    non-redundant BED and VCF are written. In every other case both BED files
    are created empty with ``touch``.
    """
    mccutils.log("jitterbug", "jitterbug postprocessing")

    smk_in = snakemake.input
    jitterbug_out = smk_in.jitterbug_out
    te_taxonomy = smk_in.taxonomy
    reference_fasta = smk_in.reference_fasta

    smk_params = snakemake.params
    out_dir = smk_params.out_dir
    log = smk_params.log  # read but not used further in this function
    sample_name = smk_params.sample_name
    chromosomes = smk_params.chromosomes.split(",")
    status_log = smk_params.status_log

    out = snakemake.output.out  # read but not used further in this function

    prev_steps_succeeded = mccutils.check_status_file(status_log)

    # Only read the results when the upstream steps finished successfully.
    if prev_steps_succeeded:
        insertions = read_insertions(
            jitterbug_out,
            te_taxonomy,
            chromosomes,
            sample_name,
            min_fwd_read_support=config.FILTER['MIN_FWD_READ_SUPPORT'],
            min_rev_read_support=config.FILTER['MIN_REV_READ_SUPPORT'],
            min_sr_support=config.FILTER['MIN_SPLIT_READ_SUPPORT'],
            min_zygosity=config.FILTER['MIN_ZYGOSITY'],
        )
        have_insertions = len(insertions) >= 1
    else:
        have_insertions = False

    if have_insertions:
        insertions = output.make_redundant_bed(
            insertions, sample_name, out_dir, method="jitterbug"
        )
        insertions = output.make_nonredundant_bed(
            insertions, sample_name, out_dir, method="jitterbug"
        )
        output.write_vcf(insertions, reference_fasta, sample_name, "jitterbug", out_dir)
    else:
        # Upstream failure or no calls: still create both (empty) BED files.
        for bed_kind in ("redundant", "nonredundant"):
            empty_bed = out_dir + "/" + sample_name + "_jitterbug_" + bed_kind + ".bed"
            mccutils.run_command(["touch", empty_bed])
def sam_to_bam(jar, fq1, fq2, sam1, sam2, bam, out_dir, threads=1, log=None):
    """Run the jar's ``se2pe`` command to produce a sorted BAM.

    Java's temp directory is pointed at ``<out_dir>/tmp``.

    Args:
        jar: path of the jar passed to ``java -jar``.
        fq1: value for ``--fastq1``.
        fq2: value for ``--fastq2``.
        sam1: value for ``--bam1``.
        sam2: value for ``--bam2``.
        bam: value for ``--output``.
        out_dir: directory whose ``tmp`` subdirectory is used by Java.
        threads: accepted but not used by this function.
        log: log file forwarded to the mccutils helpers.

    Returns:
        ``bam``, unchanged.
    """
    mccutils.log("popoolationte2", "converting SAM to BAM", log=log)

    java_tmp_option = "-Djava.io.tmpdir=" + out_dir + "/tmp"

    command = ["java", java_tmp_option, "-jar", jar, "se2pe"]
    command += ["--fastq1", fq1, "--fastq2", fq2]
    command += ["--bam1", sam1, "--bam2", sam2]
    command += ["--sort", "--output", bam]

    mccutils.run_command(command, log=log)

    return bam
def main():
    """Post-process PopoolationTE2 predictions into BED files and a VCF.

    If the status file reports that the earlier step succeeded and at least
    one insertion passes the configured settings, the redundant BED,
    non-redundant BED and VCF are written. In every other case both BED files
    are created empty with ``touch``.
    """
    mccutils.log("popoolationte2", "processing PopoolationTE2 results")

    smk_in = snakemake.input
    te_predictions = smk_in.popoolationte2_out
    te_gff = smk_in.te_gff
    taxonomy = smk_in.taxonomy
    reference_fasta = smk_in.reference_fasta

    smk_params = snakemake.params
    out_dir = smk_params.out_dir
    sample_name = smk_params.sample_name
    chromosomes = smk_params.chromosomes.split(",")
    log = smk_params.log  # read but not used further in this function
    status_log = smk_params.status_log

    prev_step_succeeded = mccutils.check_status_file(status_log)

    # Only read the results when the upstream step finished successfully.
    if prev_step_succeeded:
        ref_tes = get_ref_tes(te_gff, taxonomy, chromosomes)
        insertions = read_insertions(
            te_predictions,
            ref_tes,
            chromosomes,
            sample_name,
            both_end_support_needed=config.PARAMS["require_both_end_support"],
            support_threshold=config.PARAMS["frequency_threshold"],
        )
        have_insertions = len(insertions) >= 1
    else:
        have_insertions = False

    if have_insertions:
        insertions = output.make_redundant_bed(
            insertions, sample_name, out_dir, method="popoolationte2"
        )
        insertions = output.make_nonredundant_bed(
            insertions, sample_name, out_dir, method="popoolationte2"
        )
        output.write_vcf(
            insertions, reference_fasta, sample_name, "popoolationte2", out_dir
        )
    else:
        # Upstream failure or no calls: still create both (empty) BED files.
        for bed_kind in ("redundant", "nonredundant"):
            empty_bed = (
                out_dir + "/" + sample_name + "_popoolationte2_" + bed_kind + ".bed"
            )
            mccutils.run_command(["touch", empty_bed])

    mccutils.log("popoolationte2", "PopoolationTE2 postprocessing complete")
def make_depth_table(te_fasta, bam, genome_depth, run_id, out, depth_csv, log, trim_edges=0):
    """Compute per-TE normalized depth and write the depth CSV.

    For every header line in ``te_fasta`` two ``samtools depth`` files are
    produced under ``<out>/te-depth-files`` (``-Q 1`` as ``.highQ.cov`` and
    ``-Q 0`` as ``.allQ.cov``), each is also written in a genome-depth
    normalized form, and one row is appended to ``depth_csv``.

    Args:
        te_fasta: fasta whose header lines name the TEs.
        bam: BAM file passed to samtools.
        genome_depth: divisor used to normalize depths.
        run_id: accepted but not used by this function.
        out: output directory.
        depth_csv: CSV path; overwritten with a header, then appended to.
        log: log file forwarded to the mccutils helpers.
        trim_edges: forwarded to ``get_avg_depth``.

    Returns:
        Tuple ``(te_names, all_coverage_files, uniq_coverage_files,
        avg_norm_depths)`` of parallel lists.
    """
    mccutils.log("coverage","creating TE depth coverage table", log=log)

    with open(depth_csv, "w") as table:
        table.write("TE-Family,Normalized-Depth,Normalized-Unique-Depth"+"\n")

    depth_dir = out+"/te-depth-files"

    def run_samtools_depth(region, min_qual, cov_path):
        # Per-position depth for one TE at the given -Q value.
        depth_command = ["samtools", "depth", "-aa", "-r", region, bam, "-d", "0", "-Q", min_qual]
        mccutils.run_command_stdout(depth_command, cov_path, log=log)

    def write_normalized(raw_cov, normalized_cov):
        # Rewrite a depth file with every depth divided by genome_depth.
        chrom, positions, depths = read_samtools_depth_file(raw_cov)
        with open(normalized_cov, "w") as covfile:
            for idx, pos in enumerate(positions):
                scaled = str(round(depths[idx]/genome_depth, 2))
                covfile.write("\t".join([chrom, str(pos), scaled])+"\n")

    te_names = []
    uniq_coverage_files = []
    all_coverage_files = []
    avg_norm_depths = []

    with open(te_fasta,"r") as fa:
        for line in fa:
            if ">" not in line:
                continue

            te_name = line.replace("\n","").replace(">","")
            mccutils.mkdir(depth_dir)

            highQ = depth_dir+"/"+te_name+".highQ.cov"
            run_samtools_depth(te_name, "1", highQ)

            allQ = depth_dir+"/"+te_name+".allQ.cov"
            run_samtools_depth(te_name, "0", allQ)

            # make normalized coverage files
            write_normalized(allQ, depth_dir+"/"+te_name+".allQ.normalized.cov")
            write_normalized(highQ, depth_dir+"/"+te_name+".highQ.normalized.cov")

            avg_norm_depth = get_avg_depth(allQ, trim_edges=trim_edges)/genome_depth
            avg_uniq_norm_depth = get_avg_depth(highQ, trim_edges=trim_edges)/genome_depth

            row = [te_name, str(round(avg_norm_depth,2)), str(round(avg_uniq_norm_depth,2))]
            with open(depth_csv, "a") as table:
                table.write(",".join(row)+"\n")

            te_names.append(te_name)
            uniq_coverage_files.append(highQ)
            all_coverage_files.append(allQ)
            avg_norm_depths.append(avg_norm_depth)

    return te_names, all_coverage_files, uniq_coverage_files, avg_norm_depths
def main():
    """Write the consensus fasta output by passing the input through
    ``fix_fasta_lines``.
    """
    consensus = snakemake.input.consensus
    smk_params = snakemake.params
    mcc_out = smk_params.mcc_out  # read but not used further in this function
    run_id = smk_params.run_id  # read but not used further in this function
    target_fasta = snakemake.output.consensus

    mccutils.log("processing", "making consensus fasta")

    fix_fasta_lines(consensus, target_fasta)

    mccutils.log("processing", "consensus fasta created")
def main():
    """Run the PoPoolationTE workflow for one sample.

    Appends the input paths to the log file, derives read length, median
    insert size and the maximum distance (three times the median insert size
    plus the read length), converts the TE GFF to a known-insert file and
    runs the workflow with settings from ``config``. Afterwards the first
    snakemake output is touched and the SAM and both fastq inputs are removed.
    """
    mccutils.log("popoolationte", "running PopoolationTE")

    smk_in = snakemake.input
    ref_fasta = smk_in.ref_fasta
    taxonomy = smk_in.taxonomy
    te_gff = smk_in.te_gff
    fq1 = smk_in.fq1
    fq2 = smk_in.fq2
    sam = smk_in.sam
    log = snakemake.params.log

    # Record the inputs in the run log before any work starts.
    logged_inputs = (
        ("reference fasta: ", ref_fasta),
        ("Taxonomy TSV: ", taxonomy),
        ("TE GFF: ", te_gff),
        ("fastq1: ", fq1),
        ("fastq2: ", fq2),
        ("SAM: ", sam),
    )
    with open(log, "a") as log_handle:
        for label, path in logged_inputs:
            log_handle.write(label + path + "\n")

    smk_params = snakemake.params
    out_dir = smk_params.out_dir
    sample_name = smk_params.sample_name  # read but not used further here
    script_dir = smk_params.script_dir

    mccutils.log("popoolationte", "getting read length")
    read_length = get_read_length(fq1, fq2)

    mccutils.log("popoolationte", "calculating median insert size")
    median_insert_size = get_median_insert_size(sam)
    max_dist = int(median_insert_size * 3) + read_length

    mccutils.log("popoolationte", "converting TE gff to PoPoolationTE known TE file")
    known_inserts = make_known_insert_file(te_gff, out_dir)

    mccutils.log("popoolationte", "running the PoPoolationTE workflow scripts")
    workflow_options = {
        "identify_min_count": config.IDENTIFY_TE_INSERTSITES["min-count"],
        "identify_min_qual": config.IDENTIFY_TE_INSERTSITES["min-map-qual"],
        "crosslink_site_shift": config.CROSSLINK_TE_SITES['single-site-shift'],
        "update_te_inserts_site_shift": config.UPDATE_TEINSERTS_WITH_KNOWNTES['single-site-shift'],
        "estimate_polymorphism_min_qual": config.ESTIMATE_POLYMORPHISM['min-map-qual'],
        "filter_min_count": config.FILTER['min-count'],
    }
    run_popoolationte(
        sam,
        ref_fasta,
        taxonomy,
        read_length,
        median_insert_size,
        max_dist,
        known_inserts,
        script_dir,
        out_dir,
        log=log,
        **workflow_options
    )

    mccutils.run_command(["touch", snakemake.output[0]])

    # The SAM and fastq inputs are deleted once the workflow has run.
    for consumed_input in (sam, fq1, fq2):
        mccutils.remove(consumed_input)
def main():
    """Post-process RelocaTE2 output into redundant/non-redundant BEDs.

    Reference and non-reference insertions are read from their GFFs with the
    matching ``config`` thresholds, reference TE names are fixed up from the
    RepeatMasker output, and the combined list is written to the BED files.
    When no insertions are found, empty BED files are created with ``touch``.
    """
    smk_in = snakemake.input
    nonref_gff = smk_in.nonref_gff
    ref_gff = smk_in.ref_gff
    rm_out = smk_in.rm_out

    smk_params = snakemake.params
    log = smk_params.log  # read but not used further in this function
    out_dir = smk_params.out_dir
    sample_name = smk_params.sample_name
    chromosomes = smk_params.chromosomes.split(",")

    mccutils.log("relocate2", "processing RelocaTE2 results")

    ref_thresholds = {
        "l_support_threshold": config.REF_LEFT_SUPPORT_THRESHOLD,
        "r_support_threshold": config.REF_RIGHT_SUPPORT_THRESHOLD,
        "l_junction_threshold": config.REF_LEFT_JUNCTION_THRESHOLD,
        "r_junction_threshold": config.REF_RIGHT_JUNCTION_THRESHOLD,
    }
    ref_insertions = get_insertions(
        ref_gff, sample_name, chromosomes, insert_type="ref", **ref_thresholds
    )

    nonref_thresholds = {
        "l_support_threshold": config.NONREF_LEFT_SUPPORT_THRESHOLD,
        "r_support_threshold": config.NONREF_RIGHT_SUPPORT_THRESHOLD,
        "l_junction_threshold": config.NONREF_LEFT_JUNCTION_THRESHOLD,
        "r_junction_threshold": config.NONREF_RIGHT_JUNCTION_THRESHOLD,
    }
    nonref_insertions = get_insertions(
        nonref_gff, sample_name, chromosomes, insert_type="nonref", **nonref_thresholds
    )

    ref_insertions = fix_ref_te_names(ref_insertions, rm_out, sample_name)
    all_insertions = ref_insertions + nonref_insertions

    if len(all_insertions) >= 1:
        all_insertions = mccutils.make_redundant_bed(
            all_insertions, sample_name, out_dir, method="relocate2"
        )
        mccutils.make_nonredundant_bed(
            all_insertions, sample_name, out_dir, method="relocate2"
        )
    else:
        # No calls: still create both (empty) BED files.
        for bed_kind in ("redundant", "nonredundant"):
            empty_bed = out_dir + "/" + sample_name + "_relocate2_" + bed_kind + ".bed"
            mccutils.run_command(["touch", empty_bed])

    mccutils.log("relocate2", "RelocaTE2 postprocessing complete")