def main():
    """Run the TEMP module: call non-reference TE insertions and reference TE
    absences for one sample, then clean up large intermediate files.

    All inputs/outputs come from the snakemake object; relies on the sibling
    helpers get_median_insert_size, run_temp_insertion and run_temp_absence.
    """
    bam = snakemake.input.bam
    twobit = snakemake.input.twobit
    consensus = snakemake.input.consensus
    ref_te_bed = snakemake.input.ref_te_bed
    taxonomy = snakemake.input.taxonomy
    median_insert_size_file = snakemake.input.median_insert_size
    log = snakemake.params.log

    # record the inputs in the run log for reproducibility
    with open(log, "a") as l:
        l.write("BAM: " + bam + "\n")
        l.write("2bit: " + twobit + "\n")
        l.write("consensus fasta: " + consensus + "\n")
        l.write("reference TE BED: " + ref_te_bed + "\n")
        l.write("Taxonomy TSV: " + taxonomy + "\n")

    threads = snakemake.threads
    out_dir = snakemake.params.out_dir
    scripts_dir = snakemake.params.scripts_dir
    sample_name = snakemake.params.sample_name

    mccutils.log("temp", "running TEMP Module")

    median_insert_size = get_median_insert_size(median_insert_size_file)
    run_temp_insertion(bam, scripts_dir, consensus, ref_te_bed, taxonomy, median_insert_size, threads, out_dir, log)
    run_temp_absence(bam, scripts_dir, consensus, ref_te_bed, twobit, taxonomy, median_insert_size, threads, out_dir, log)

    # Remove large intermediate alignments/reads produced by TEMP.
    # BUG FIX: the original passed the bare file name to mccutils.remove(),
    # which only resolves if the CWD happens to be out_dir; the full path
    # is required (matches the cleanup pattern used elsewhere in this file).
    for f in os.listdir(out_dir):
        if ".sorted.bam" in f or ".fastq" in f:
            mccutils.remove(out_dir + "/" + f)
def get_non_absent_ref_tes(te_gff, absence_bed, sample, out, log):
    """Return reference TE insertions that TEMP did NOT report as absent.

    Subtracts the absence BED from the reference TE GFF (bedtools subtract -A)
    and converts each surviving GFF feature into a mccutils.Insertion flagged
    as a reference insertion, with "!" placeholders for all TEMP evidence
    fields (no read support exists for non-absent reference TEs).
    """
    subtracted_gff = out + "/tmp.ref_nonabs.gff"
    mccutils.run_command_stdout(
        ["bedtools", "subtract", "-A", "-a", te_gff, "-b", absence_bed],
        subtracted_gff,
        log=log,
    )

    non_absent = []
    with open(subtracted_gff, "r") as features:
        for feature in features:
            # skip comment/header lines
            if "#" in feature:
                continue
            # attribute column uses ";" separators; flatten to tabs before splitting
            fields = feature.replace(";", "\t").split("\t")
            insertion = mccutils.Insertion()
            insertion.chromosome = fields[0]
            insertion.start = int(fields[3])
            insertion.end = int(fields[4])
            insertion.strand = fields[6]
            insertion.name = fields[9].split("=")[1] + "_reference_" + sample + "_temp_nonab_"
            insertion.type = "reference"
            # placeholder evidence values: no TEMP read support for these calls
            insertion.temp.support = "!"
            insertion.temp.classification = "!"
            insertion.temp.junction1Support = "!"
            insertion.temp.junction2Support = "!"
            insertion.temp.junction1 = "!"
            insertion.temp.junction2 = "!"
            insertion.temp.frequency = "!"
            non_absent.append(insertion)

    mccutils.remove(subtracted_gff)
    return non_absent
def get_non_absent_ref_tes(te_gff, absence_bed, sample, out, log):
    """Collect reference TEs that were not called absent by TEMP.

    Runs ``bedtools subtract -A`` to drop reference TEs overlapping the
    absence BED, then wraps each remaining GFF record in an output.Insertion
    carrying an empty output.Temp evidence object.
    """
    remaining_gff = out + "/tmp.ref_nonabs.gff"
    subtract_cmd = ["bedtools", "subtract", "-A", "-a", te_gff, "-b", absence_bed]
    mccutils.run_command_stdout(subtract_cmd, remaining_gff, log=log)

    kept = []
    with open(remaining_gff, "r") as records:
        for record in records:
            if "#" in record:  # ignore comment lines
                continue
            cols = record.replace(";", "\t").split("\t")
            te = output.Insertion(output.Temp())
            te.chromosome = cols[0]
            te.start = int(cols[3])
            te.end = int(cols[4])
            te.strand = cols[6]
            # pipe-delimited name: family|reference|NA|sample|temp|nonab|
            te.name = cols[9].split("=")[1] + "|reference|NA|" + sample + "|temp|nonab|"
            te.type = "reference"
            kept.append(te)

    mccutils.remove(remaining_gff)
    return kept
def split_consensus_fasta(fasta, ref_name, out):
    """Write each consensus TE sequence to its own 80-column FASTA file.

    Files are created under <out>/split_fasta/ named
    "<ref_name>_<sanitized id>.fasta" with shell-special characters in the id
    replaced by underscores.  Returns the list of original (unsanitized)
    sequence names.
    """
    split_dir = out + "/split_fasta/"
    mccutils.mkdir(split_dir)

    names = []
    for rec in SeqIO.parse(fasta, "fasta"):
        seq_name = str(rec.id)
        names.append(seq_name)  # the caller gets the original, unsanitized name

        # sanitize the name so it is safe to use in a file path
        for bad in [";", "&", "(", ")", "|", "*", "?", "[", "]", "~", "{", "}", "<", "!", "^", '"', "'", "\\", "$", "/"]:
            seq_name = seq_name.replace(bad, "_")

        # write a temporary single-record fasta, then reflow it to 80 columns
        scratch = split_dir + ref_name + "_" + seq_name + ".fasta.tmp"
        with open(scratch, "w") as tmp_out:
            tmp_out.write(">" + str(rec.id) + "\n")
            tmp_out.write(str(rec.seq) + "\n")
        reflowed = fix_fasta.fix_fasta_lines(scratch, 80)

        final_fasta = split_dir + ref_name + "_" + seq_name + ".fasta"
        with open(final_fasta, "w") as fa_out:
            for reflowed_line in reflowed:
                fa_out.write(reflowed_line + "\n")
        mccutils.remove(scratch)

    return names
def main():
    """Entry point: validate inputs, build the run config, and launch the workflow."""
    invocation = " ".join(["python3"] + sys.argv)
    launch_dir = os.getcwd()
    args = parse_args()

    for subdir in ("/logs", "/tmp"):
        mccutils.mkdir(args.out + subdir)

    # fail fast on malformed/missing input files before any work is scheduled
    check_input_files(
        args.reference,
        args.consensus,
        args.first,
        fq2=args.second,
        locations=args.locations,
        taxonomy=args.taxonomy,
        coverage_fasta=args.coverage_fasta,
        augment_fasta=args.augment,
        annotations_only=args.make_annotations,
    )

    ref_name = mccutils.get_base_name(args.reference)
    run_id = make_run_config(args, args.sample_name, ref_name, invocation, launch_dir, debug=args.debug)
    run_workflow(args, args.sample_name, ref_name, run_id, debug=args.debug, annotations_only=args.make_annotations)
    mccutils.remove(args.out + "/tmp")
def make_nonte_bed(reference, masked_gff, run_id, out, log):
    """Create a BED file covering the non-TE portion of the genome.

    Converts the RepeatMasker GFF to a sorted BED of TE intervals, builds a
    bedtools "genome" (chrom-size) file from the reference .fai index, and
    takes the complement.  Intermediates are removed; returns the path to the
    non-TE BED.
    """
    mccutils.log("coverage", "creating BED file of non-TE regions", log=log)

    masked_bed = out + "/input/" + run_id + "_ref_tes.bed"
    repeatmasker_gff_to_bed(masked_gff, masked_bed)

    sorted_bed = out + "/input/" + run_id + "_ref_tes_sorted.bed"
    mccutils.run_command_stdout(["bedtools", "sort", "-i", masked_bed], sorted_bed, log=log)

    # Build the chrom-size "genome" file bedtools complement needs from the
    # first two columns (name, length) of the samtools faidx index.
    # (Removed a dead loop that scanned the reference FASTA to collect
    # chromosome names which were never used.)
    chrom_idx = out + "/input/" + run_id + "_ref.genome"
    with open(reference + ".fai", "r") as faidx:
        with open(chrom_idx, "w") as genome:
            for line in faidx:
                split_line = line.split("\t")
                genome.write("\t".join([split_line[0], split_line[1]]) + "\n")

    non_te_bed = out + "/input/" + run_id + "_ref_nonte.bed"
    command = ["bedtools", "complement", "-i", sorted_bed, "-g", chrom_idx]
    mccutils.run_command_stdout(command, non_te_bed, log=log)

    for f in [masked_bed, sorted_bed, chrom_idx]:
        mccutils.remove(f)

    return non_te_bed
def make_nonredundant_bed(insertions, sample_name, out_dir):
    """Write a sorted, non-redundant BED of TE-locate insertions.

    Insertions sharing the same (chromosome, end) key are collapsed to a
    single record, keeping the one with the most read-pair support.  The
    final BED gets a UCSC track header.
    """
    uniq_inserts = {}
    for insert in insertions:
        key = "_".join([insert.chromosome, str(insert.end)])
        if key not in uniq_inserts:
            uniq_inserts[key] = insert
        # BUG FIX: the original replaced the stored insertion whenever it had
        # HIGHER support than the new one (keeping the worst call); keep the
        # best-supported insertion instead.
        elif insert.read_pair_support > uniq_inserts[key].read_pair_support:
            uniq_inserts[key] = insert

    tmp_bed = out_dir + "/tmp_telocate_nonredundant.bed"
    with open(tmp_bed, "w") as outbed:
        for insert in uniq_inserts.values():
            # BED is 0-based half-open; insertion starts are stored 1-based
            out_line = "\t".join([insert.chromosome, str(insert.start - 1), str(insert.end), insert.name, "0", insert.strand])
            outbed.write(out_line + "\n")

    sorted_bed = out_dir + "/sorted.bed"
    mccutils.run_command_stdout(["bedtools", "sort", "-i", tmp_bed], sorted_bed)

    nonredundant_bed = out_dir + "/" + sample_name + "_telocate_nonredundant.bed"
    with open(sorted_bed, "r") as inbed:
        with open(nonredundant_bed, "w") as outbed:
            # track header so the BED loads cleanly in genome browsers
            header = 'track name="' + sample_name + '_TE-locate" description="' + sample_name + '_TE-locate"\n'
            outbed.write(header)
            for line in inbed:
                outbed.write(line)

    mccutils.remove(tmp_bed)
    mccutils.remove(sorted_bed)
def process_bed(bed, chromosomes, sample_name, log, out_dir, min_read_cutoff=0):
    """Filter, rename, and sort raw ngs_te_mapper BED records.

    Keeps records on known chromosomes whose read support (column 7, after
    ";"-to-tab expansion) exceeds min_read_cutoff, rewrites the name field,
    sorts with bedtools, and prepends a UCSC track header.  When nothing
    survives filtering an empty output file is created instead.
    """
    kept = 0
    unsorted_bed = out_dir + "/unsorted.bed"
    with open(unsorted_bed, "w") as outbed:
        with open(bed, "r") as inbed:
            for idx, record in enumerate(inbed):
                cols = record.replace(";", "\t").split("\t")
                # col 7 = supporting read count; col 0 = chromosome
                if int(cols[7]) > min_read_cutoff and cols[0] in chromosomes:
                    kept += 1
                    new_name = cols[5] + "_" + cols[8].replace("\n", "") + "_" + sample_name + "_ngs_te_mapper_sr_" + str(idx + 1)
                    outbed.write("\t".join([cols[0], cols[1], cols[2], new_name, "0", cols[4]]) + "\n")

    final_bed = out_dir + "/" + sample_name + "_ngs_te_mapper_nonredundant.bed"
    if kept >= 1:
        sorted_bed = out_dir + "/sorted.bed"
        mccutils.run_command_stdout(["bedtools", "sort", "-i", unsorted_bed], sorted_bed, log=log)
        with open(final_bed, "w") as outbed:
            outbed.write('track name="' + sample_name + '_ngs_te_mapper" description="' + sample_name + '_ngs_te_mapper"\n')
            with open(sorted_bed, "r") as inbed:
                for record in inbed:
                    outbed.write(record)
        mccutils.remove(sorted_bed)
    else:
        # no insertions passed the filter: create an empty placeholder so
        # downstream rules still find their expected input
        mccutils.run_command(["touch", final_bed])
    mccutils.remove(unsorted_bed)
def make_data_copies(methods, results_dir, out_dir):
    """Copy trimgalore and coverage results into <out_dir>/data for reporting.

    Any previous copies are removed first.  Coverage tables are given a
    uniform .txt extension (.csv/.cov are renamed).
    """
    mccutils.mkdir(out_dir + "/data/")

    if "trimgalore" in methods:
        dest = out_dir + "/data/trimgalore/"
        if os.path.exists(dest):
            mccutils.remove(dest)
        mccutils.mkdir(dest)
        # copy everything except the fastqc zip archives
        for entry in os.listdir(results_dir + "/trimgalore"):
            if ".zip" not in entry:
                mccutils.run_command(["cp", "-r", results_dir + "/trimgalore/" + entry, dest])

    if "coverage" in methods:
        dest = out_dir + "/data/coverage/"
        if os.path.exists(dest):
            mccutils.remove(dest)
        mccutils.mkdir(dest)
        # top-level coverage files (subdirectories are handled separately)
        for entry in os.listdir(results_dir + "/coverage/"):
            if not os.path.isdir(results_dir + "/coverage/" + entry):
                mccutils.run_command(["cp", results_dir + "/coverage/" + entry, dest])
        # per-TE depth files
        for entry in os.listdir(results_dir + "/coverage/te-depth-files/"):
            mccutils.run_command(["cp", results_dir + "/coverage/te-depth-files/" + entry, dest])
        # normalize table extensions to .txt
        for entry in os.listdir(dest):
            renamed = entry.replace(".csv", ".txt").replace(".cov", ".txt")
            mccutils.run_command(["mv", dest + entry, dest + renamed])
def map_reads(out_dir, fq1, fq2, threads=1, log=None):
    """Map paired reads against the TEFLoN mapping reference with bwa mem.

    Produces an indexed, coordinate-sorted BAM, removes the intermediate
    SAM and unsorted BAM, and returns the sorted BAM path.
    """
    # BUG FIX: the original built out_dir+"teflon.sam" (etc.) with no path
    # separator, which only worked when out_dir ended in "/"; os.path.join
    # is correct either way.
    reference_genome = os.path.join(out_dir, "teflon.prep_MP/teflon.mappingRef.fa")
    mccutils.run_command(["bwa", "index", reference_genome], log=log)

    out_sam = os.path.join(out_dir, "teflon.sam")
    # -Y: use soft clipping for supplementary alignments
    command = ["bwa", "mem", "-t", str(threads), "-Y", reference_genome, fq1, fq2]
    mccutils.run_command_stdout(command, out_sam, log=log)

    out_bam = os.path.join(out_dir, "teflon.bam")
    mccutils.run_command_stdout(["samtools", "view", "-Sb", out_sam], out_bam, log=log)

    sorted_bam = os.path.join(out_dir, "teflon.sorted.bam")
    mccutils.run_command(["samtools", "sort", "-@", str(threads), "-o", sorted_bam, out_bam], log=log)
    mccutils.run_command(["samtools", "index", sorted_bam], log=log)

    # only the sorted, indexed BAM is kept
    mccutils.remove(out_sam)
    mccutils.remove(out_bam)
    return sorted_bam
def run_repeatmasker(reference, ref_name, te_seqs, threads, log, outfile, outdir):
    """RepeatMask the reference with the TE library and move the .out file.

    Runs RepeatMasker in <outdir>/tmp/repeatmasker, locates the
    "*.fasta.out" annotation it produces, and moves it to outfile.
    Exits the process if no output file is found.
    """
    tmp_dir = outdir + "/tmp/repeatmasker"
    mccutils.remove(tmp_dir)  # clear leftovers from previous runs
    mccutils.mkdir(tmp_dir)
    os.chdir(tmp_dir)

    command = [
        "RepeatMasker", "-pa", str(threads), "-lib", te_seqs, "-dir", tmp_dir,
        "-s", "-nolow", "-no_is", reference
    ]
    mccutils.run_command(command, log=log)
    os.chdir(outdir)

    # Locate the RepeatMasker annotation file.  Simplified from the
    # redundant ("fasta.out" in f and f[-9:] == "fasta.out") check: the
    # suffix test alone implies both conditions.
    rm_out = ""
    for f in os.listdir(tmp_dir):
        if f.endswith("fasta.out"):
            rm_out = tmp_dir + "/" + f
    if rm_out == "":
        sys.exit("can't find Repeatmasker output in:" + tmp_dir + "\n")
    mccutils.run_command(["mv", rm_out, outfile])
def remove_intermediate_files(options, run_config_file, methods, ref_name, sample_name, outdir):
    """Delete intermediate files left by a run, honoring the keep `options`.

    `options` is a collection of keep-flags: "all" keeps everything,
    "methods" keeps every per-method directory, a method name keeps that
    method's directory, and "general" keeps the shared intermediate dir.
    Per-method paths and their "essential" file lists come from the run
    config JSON.  Files under an essential path (or exactly matching one)
    are preserved; empty directories are pruned afterwards.
    """
    # keep everything
    if "all" in options:
        return
    with open(run_config_file) as f:
        run_config_data = json.load(f)
    # directories of methods explicitly listed in `options`; their files are
    # also preserved inside the shared intermediate dir below
    keep_paths = []
    if "methods" not in options:
        for method in methods:
            # parent directory of the method's configured output file
            method_out = "/".join(run_config_data['out'][method].split("/")[:-1])+"/"
            if method not in options:
                essential_paths = run_config_data['essential'][method]
                if os.path.exists(method_out):
                    # delete all files not marked as essential
                    for root, subdirs, files in os.walk(method_out, topdown=False):
                        for f in files:
                            file_path = os.path.join(root, f)
                            is_essential = False
                            # essential entries may be directories (keep all
                            # files under them) or exact file paths
                            for essential_path in essential_paths:
                                if (os.path.isdir(essential_path) and essential_path in file_path) or (essential_path == file_path):
                                    is_essential = True
                            if not is_essential:
                                mccutils.remove(file_path)
                    # remove empty directories
                    for root, subdirs, files in os.walk(method_out, topdown=False):
                        for d in subdirs:
                            dir_path = os.path.join(root, d)
                            if len(os.listdir(dir_path)) < 1:
                                mccutils.remove(dir_path)
            else:
                keep_paths.append(method_out)
    if "general" not in options:
        intermediate_dir = outdir+"/"+sample_name+"/intermediate/"
        # delete shared intermediates, sparing anything under a kept method dir
        for root, subdirs, files in os.walk(intermediate_dir, topdown=False):
            for f in files:
                file_path = os.path.join(root, f)
                keep = False
                for keep_path in keep_paths:
                    if keep_path in file_path:
                        keep = True
                if not keep:
                    mccutils.remove(file_path)
        # remove empty directories
        for root, subdirs, files in os.walk(intermediate_dir, topdown=False):
            for d in subdirs:
                dir_path = os.path.join(root, d)
                if len(os.listdir(dir_path)) < 1:
                    mccutils.remove(dir_path)
        # drop the intermediate dir itself once it is empty
        if len(os.listdir(intermediate_dir)) < 1:
            mccutils.remove(intermediate_dir)
def main():
    """PopoolationTE preprocessing: format reads, map both mates, merge and
    sort the alignments.

    Writes COMPLETED/FAILED to the status log; on failure the snakemake
    output files are touched so downstream rules do not stall on missing
    inputs.
    """
    mccutils.log("popoolationte", "running PopoolationTE preprocessing steps")
    ref_fasta = snakemake.input.ref_fasta
    fq1 = snakemake.input.fq1
    fq2 = snakemake.input.fq2
    out_dir = snakemake.params.out_dir
    sample_name = snakemake.params.sample_name
    log = snakemake.params.log
    script_dir = snakemake.params.script_dir
    status_log = snakemake.params.status_log
    threads = snakemake.threads
    try:
        # ensures intermediate files from previous runs are removed
        for f in os.listdir(out_dir):
            mccutils.remove(out_dir + "/" + f)
        mccutils.log("popoolationte", "formatting read names")
        # fq1/fq2 are rebound to the reformatted copies
        fq1, fq2 = format_read_names(fq1, fq2, sample_name, out_dir)
        mccutils.log("popoolationte", "indexing popoolationTE reference fasta", log=log)
        index_fasta(ref_fasta, log=log)
        mccutils.log("popoolationte", "mapping fastq1 reads", log=log)
        sam1 = map_reads(fq1, ref_fasta, threads=threads, log=log)
        mccutils.log("popoolationte", "mapping fastq2 reads", log=log)
        sam2 = map_reads(fq2, ref_fasta, threads=threads, log=log)
        mccutils.log("popoolationte", "combining alignments", log=log)
        combined_sam = combine_alignments(sam1, sam2, fq1, fq2, script_dir, out_dir, log=log)
        mccutils.log("popoolationte", "sorting sam file", log=log)
        # SAM -> BAM -> sorted BAM -> sorted SAM (PopoolationTE consumes SAM)
        bam = sam_to_bam(combined_sam, threads=threads, log=log)
        sorted_bam = sort_bam(bam, threads=threads, log=log)
        sorted_sam = bam_to_sam(sorted_bam, threads=threads, log=log)
        with open(status_log, "w") as l:
            l.write("COMPLETED\n")
        mccutils.log("popoolationte", "PopoolationTE preprocessing complete")
    except Exception as e:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log, "a") as l:
            print(track, file=l)
        with open(status_log, "w") as l:
            l.write("FAILED\n")
        # create empty outputs so the workflow can continue past this rule
        mccutils.run_command(["touch", snakemake.output[0]])
        mccutils.run_command(["touch", snakemake.output[1]])
        mccutils.run_command(["touch", snakemake.output[2]])
def main():
    """PopoolationTE2 preprocessing: index the reference, reformat both
    fastqs, map each mate, and merge into the output BAM.

    Writes COMPLETED/FAILED to the status log; on failure the output BAM is
    touched so downstream rules do not stall on a missing input.
    """
    mccutils.log("popoolationte2", "setting up for PopoolationTE2")
    ref_fasta = snakemake.input.ref_fasta
    fq1 = snakemake.input.fq1
    fq2 = snakemake.input.fq2
    jar = snakemake.params.jar
    log = snakemake.params.log
    out_dir = snakemake.params.out_dir
    threads = snakemake.threads
    status_log = snakemake.params.status_log
    try:
        # ensures intermediate files from previous runs are removed
        for f in os.listdir(out_dir):
            mccutils.remove(out_dir + "/" + f)
        mccutils.mkdir(out_dir + "/tmp")
        index_fasta(ref_fasta, log=log)
        # fq1/fq2 are rebound to the reformatted copies
        fq1 = format_fastq(fq1, out_dir + "/reads_1.fastq", log=log)
        fq2 = format_fastq(fq2, out_dir + "/reads_2.fastq", log=log)
        sam1 = map_reads(ref_fasta, fq1, out_dir + "/mapped_1.sam", threads=threads, log=log)
        sam2 = map_reads(ref_fasta, fq2, out_dir + "/mapped_2.sam", threads=threads, log=log)
        bam = sam_to_bam(jar, fq1, fq2, sam1, sam2, snakemake.output.bam, out_dir, threads=threads, log=log)
        mccutils.remove(out_dir + "/tmp")
        mccutils.check_file_exists(snakemake.output.bam)
        with open(status_log, "w") as l:
            l.write("COMPLETED\n")
        mccutils.log("popoolationte2", "PopoolationTE2 preprocessing complete")
    except Exception as e:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log, "a") as l:
            print(track, file=l)
        mccutils.log("popoolationte2", "popoolationte2 preprocessing failed")
        with open(status_log, "w") as l:
            l.write("FAILED\n")
        # create an empty output so the workflow can continue past this rule
        mccutils.run_command(["touch", snakemake.output.bam])
def main():
    """Download, unpack, patch, and install PopoolationTE into the tools dir."""
    install_path = snakemake.config['paths']['install'] + "/tools/"
    zipfile = snakemake.params.zipfile
    log = snakemake.params.log

    # fresh md5-verified download, retried up to 3 times
    mccutils.remove(zipfile)
    download_success = mccutils.download(snakemake.params.url, zipfile, md5=snakemake.params.md5, max_attempts=3)
    if not download_success:
        print("popoolationte download failed... exiting...")
        print("try running --install with --clean for clean installation")
        sys.exit(1)

    # unpack into the install root, then move the contents into tools/popoolationte
    mccutils.remove(snakemake.config['paths']['install'] + "popoolationte")
    mccutils.run_command(["unzip", zipfile], log=log)
    mccutils.remove(install_path + "popoolationte")
    mccutils.mkdir(install_path + "popoolationte")
    for entry in os.listdir(snakemake.config['paths']['install'] + "popoolationte"):
        mccutils.run_command(
            ["mv", snakemake.config['paths']['install'] + "popoolationte/" + entry, install_path + "popoolationte"],
            log=log,
        )

    # apply McClintock's patches to the unpacked sources
    patch_targets = [
        (snakemake.params.patch1, "popoolationte/Modules/TEInsertUtility.pm"),
        (snakemake.params.patch2, "popoolationte/Modules/TEInsert.pm"),
        (snakemake.params.patch3, "popoolationte/samro.pl"),
        (snakemake.params.patch4, "popoolationte/identify-te-insertsites.pl"),
    ]
    for patch_file, target in patch_targets:
        mccutils.run_command(["patch", "-i", patch_file, install_path + target], log=log)

    # clean up the archive and the unpack directory
    mccutils.remove(zipfile)
    mccutils.remove(snakemake.config['paths']['install'] + "popoolationte")

    # write version to file (the archive md5 doubles as the version stamp)
    with open(snakemake.config['paths']['install'] + "/tools/popoolationte/version.log", "w") as version:
        version.write(snakemake.params.md5)
def run_workflow(args, sample_name, run_id, debug=False):
    """Assemble and run the snakemake command for one McClintock run.

    Expands the configured output-path templates, copies the Snakefile into
    a per-run snakemake directory, and invokes snakemake with each selected
    method's target plus the summary report.  Exits the process on failure.
    """
    # NOTE(review): `log` is assigned but never used in this function —
    # presumably intended for a logging hookup; verify against callers.
    log = args.out + "/mcclintock." + str(run_id) + ".log"
    results_dir = args.out + "/results/"
    input_dir = args.out + "/method_input/"
    # substitute the template placeholders in the configured output paths
    out_files = config.OUT_PATHS
    for key in out_files.keys():
        out_files[key] = out_files[key].replace(config.INPUT_DIR, input_dir)
        out_files[key] = out_files[key].replace(config.RESULTS_DIR, results_dir)
        out_files[key] = out_files[key].replace(config.SAMPLE_NAME, sample_name)
    path = os.path.dirname(os.path.abspath(__file__))
    mccutils.mkdir(args.out + "/snakemake")
    snakemake_path = args.out + "/snakemake/" + str(run_id)
    mccutils.mkdir(snakemake_path)
    mccutils.run_command(["cp", path + "/Snakefile", snakemake_path])
    # snakemake is run from its per-run directory
    os.chdir(snakemake_path)
    command = [
        "snakemake", "--use-conda", "--conda-prefix", path + "/install/envs/conda"
    ]
    # --quiet for normal runs, --reason to explain rule triggering when debugging
    if not debug:
        command.append("--quiet")
    else:
        command.append("--reason")
    command += [
        "--configfile", args.out + "/snakemake/config/config_" + str(run_id) + ".json"
    ]
    command += ["--cores", str(args.proc)]
    if args.clean:
        # NOTE(review): after --delete-all-output the function still proceeds
        # to a full run below — confirm whether clean should return/exit here.
        clean_command = command + ["--delete-all-output"]
        mccutils.run_command(clean_command)
        mccutils.remove(args.out + "/input")
    # one snakemake target per selected method, plus the summary report
    for method in args.methods:
        command.append(out_files[method])
    command.append(args.out + "/results/summary/summary_report.txt")
    # print(" ".join(command))
    try:
        mccutils.run_command(command)
        mccutils.check_file_exists(args.out + "/results/summary/summary_report.txt")
    except Exception as e:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        print(
            "McClintock Pipeline Failed... please open an issue at https://github.com/bergmanlab/mcclintock/issues if you are having trouble using McClintock",
            file=sys.stderr)
        sys.exit(1)
    mccutils.remove(args.out + "/tmp")
def main():
    """Run the TEMP2 module: insertion calling then absence calling.

    Logs the inputs, clears leftovers from previous runs, and records
    COMPLETED/FAILED in the status log.  On failure both snakemake outputs
    are touched so downstream rules do not stall on missing inputs.
    """
    fq1 = snakemake.input.fq1
    fq2 = snakemake.input.fq2
    bam = snakemake.input.bam
    reference = snakemake.input.reference
    twobit = snakemake.input.twobit
    consensus = snakemake.input.consensus
    ref_te_bed = snakemake.input.ref_te_bed
    taxonomy = snakemake.input.taxonomy
    median_insert_size_file = snakemake.input.median_insert_size
    log = snakemake.params.log
    # record the inputs in the run log for reproducibility
    with open(log,"a") as l:
        l.write("BAM: "+bam+"\n")
        l.write("2bit: "+twobit+"\n")
        l.write("consensus fasta: "+consensus+"\n")
        l.write("reference TE BED: "+ref_te_bed+"\n")
        l.write("Taxonomy TSV: "+taxonomy+"\n")
    threads = snakemake.threads
    out_dir = snakemake.params.out_dir
    script_dir = snakemake.params.script_dir
    sample_name = snakemake.params.sample_name
    status_log = snakemake.params.status_log
    # ensures intermediate files from previous runs are removed
    for f in os.listdir(out_dir):
        mccutils.remove(out_dir+"/"+f)
    mccutils.log("temp2","running TEMP2 Module")
    try:
        median_insert_size = get_median_insert_size(median_insert_size_file)
        run_temp2_insertion(fq1, fq2, bam, median_insert_size, reference, script_dir, consensus, ref_te_bed, threads, out_dir, config, log)
        run_temp2_absence(script_dir, bam, twobit, ref_te_bed, median_insert_size, threads, out_dir+"/absence", config, log)
        # surface the absence summary at the top level of the module output
        mccutils.run_command(["cp", out_dir+'/absence/'+sample_name+".absence.refined.bp.summary", out_dir], log=log)
        mccutils.check_file_exists(snakemake.output[0])
        mccutils.check_file_exists(snakemake.output[1])
        with open(status_log,"w") as l:
            l.write("COMPLETED\n")
        mccutils.log("temp2","TEMP2 run complete")
    except Exception as e:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log,"a") as l:
            print(track, file=l)
        mccutils.log("temp2","TEMP2 run failed")
        with open(status_log,"w") as l:
            l.write("FAILED\n")
        # create empty outputs so the workflow can continue past this rule
        mccutils.run_command(["touch", snakemake.output[0]])
        mccutils.run_command(["touch", snakemake.output[1]])
def get_genome_depth(non_te_bed, bam, run_id, out, log):
    """Compute the average sequencing depth over non-TE regions of the genome."""
    mccutils.log("coverage","determining the coverage depth of the genome", log=log)
    per_base_depths = out + "/input/" + run_id + "genome.depth"
    # -aa reports every position (even zero depth); -d 0 removes the depth cap
    samtools_cmd = ["samtools", "depth", "-aa", "-b", non_te_bed, bam, "-d", "0"]
    mccutils.run_command_stdout(samtools_cmd, per_base_depths, log=log)
    mean_depth = get_avg_depth(per_base_depths)
    mccutils.remove(per_base_depths)
    return mean_depth
def main():
    """Run ngs_te_mapper on the sample reads and collect its insertion BED."""
    consensus_fasta = snakemake.input.consensus_fasta
    reference_fasta = snakemake.input.reference_fasta
    fastq1 = snakemake.input.fastq1
    fastq2 = snakemake.input.fastq2
    log = snakemake.params.log

    # record the inputs used for this run
    with open(log, "a") as l:
        l.write("consensus fasta: " + consensus_fasta + "\n")
        l.write("reference fasta: " + reference_fasta + "\n")
        l.write("fastq1: " + fastq1 + "\n")
        l.write("fastq2: " + fastq2 + "\n")

    threads = snakemake.threads
    sample_name = snakemake.params.sample_name
    script_dir = snakemake.params.script_dir
    out_dir = snakemake.params.out_dir
    out_bed = snakemake.output[0]

    # ensures intermediate files from previous runs are removed
    for leftover in os.listdir(out_dir):
        mccutils.remove(out_dir + "/" + leftover)

    # single-end runs pass only fastq1; paired runs join both with ";"
    paired = snakemake.params.raw_fq2 != "None"
    sample_arg = "sample=" + fastq1 + ";" + fastq2 if paired else "sample=" + fastq1
    command = [
        'Rscript', "--vanilla", script_dir + "/ngs_te_mapper.R",
        "genome=" + reference_fasta,
        "teFile=" + consensus_fasta,
        "tsd=" + str(config.MAX_TSD),
        "thread=" + str(threads),
        "output=" + out_dir,
        "sourceCodeFolder=" + script_dir,
        sample_arg,
    ]

    mccutils.log("ngs_te_mapper", "running ngs_te_mapper", log=log)
    mccutils.run_command(command, log=log)
    mccutils.log("ngs_te_mapper", "ngs_te_mapper run complete", log=log)

    # copy the predicted-insertions BED to the expected output path
    raw_bed = ""
    for candidate in os.listdir(out_dir + "/bed_tsd/"):
        if "insertions.bed" in candidate:
            raw_bed = out_dir + "/bed_tsd/" + candidate
    mccutils.run_command(["cp", raw_bed, out_bed])
    mccutils.remove(out_dir + "/aligned_te/")
    mccutils.log("ngs_te_mapper", "ngs_te_mapper run complete")
def main():
    """Run TE-Locate on the prepared SAM and copy out the raw results.

    Symlinks the input SAM into a dedicated sam/ directory (TE-Locate takes
    a directory of SAM files), computes the pairing distance from the median
    insert size, and invokes the TE-Locate perl script.
    """
    te_gff = snakemake.input.te_gff
    sam = snakemake.input.sam
    ref_fasta = snakemake.input.ref
    median_insert_size_file = snakemake.input.median_insert_size
    log = snakemake.params.log
    mccutils.log("te-locate", "running TE-Locate", log=log)
    # record the inputs in the run log for reproducibility
    with open(log, "a") as l:
        l.write("TE GFF: " + te_gff + "\n")
        l.write("SAM: " + sam + "\n")
        l.write("reference fasta: " + ref_fasta + "\n")
    telocate = snakemake.params.run_script
    out_dir = snakemake.params.out_dir
    # ensures intermediate files from previous runs are removed
    for f in os.listdir(out_dir):
        mccutils.remove(out_dir + "/" + f)
    sam_dir = out_dir + "/sam/"
    mccutils.mkdir(sam_dir)
    te_locate_sam = sam_dir + "te-locate.sam"
    # replace any stale symlink before linking the current SAM
    if os.path.exists(te_locate_sam):
        os.remove(te_locate_sam)
    os.symlink(sam, te_locate_sam)
    # TE-Locate must run from its own script directory
    os.chdir(os.path.dirname(telocate))
    median_insert_size = mccutils.get_median_insert_size(
        median_insert_size_file)
    # read-pair grouping distance scales with the library insert size
    distance = (median_insert_size * config.MIN_DISTANCE)
    command = [
        "perl", telocate,
        str(config.MAX_MEM), sam_dir, te_gff, ref_fasta, out_dir,
        str(distance),
        str(config.MIN_SUPPORT_READS),
        str(config.MIN_SUPPORT_INDIVIDUALS)
    ]
    mccutils.run_command(command, log=log)
    # NOTE(review): these concatenate onto out_dir without a "/" — this only
    # produces the intended "<out_dir>/_<distance>_reads3_acc1.info" paths if
    # out_dir ends with "/"; verify against snakemake.params.out_dir.
    mccutils.run_command([
        "cp", out_dir + "_" + str(distance) + "_reads3_acc1.info",
        out_dir + "te-locate-raw.info"
    ])
    mccutils.log("te-locate", "TE-Locate complete")
def main():
    """Run the RetroSeq module: split the consensus library, build per-TE
    BEDs, and call insertions with RetroSeq.

    Writes COMPLETED/FAILED to the status log; on failure the snakemake
    output is touched so downstream rules do not stall on a missing input.
    """
    consensus_fasta = snakemake.input.consensus_fasta
    bam = snakemake.input.bam
    ref_fasta = snakemake.input.ref_fasta
    ref_te_bed = snakemake.input.ref_te_bed
    taxonomy = snakemake.input.taxonomy
    log = snakemake.params.log
    status_log = snakemake.params.status_log
    try:
        # record the inputs in the run log for reproducibility
        with open(log,"a") as l:
            l.write("consensus fasta: "+consensus_fasta+"\n")
            l.write("BAM: "+bam+"\n")
            l.write("reference fasta: "+ref_fasta+"\n")
            l.write("taxonomy TSV: "+ taxonomy+"\n")
        script_dir = snakemake.params.script_dir
        out_dir = snakemake.params.out_dir
        ref_name = snakemake.params.ref_name
        sample_name = snakemake.params.sample_name
        # ensures intermediate files from previous runs are removed
        for f in os.listdir(out_dir):
            mccutils.remove(out_dir+"/"+f)
        mccutils.log("retroseq","running RetroSeq", log=log)
        # one FASTA per consensus TE, then one BED of reference copies per TE
        elements = split_consensus_fasta(consensus_fasta, ref_name, out_dir)
        bed_location_file = make_consensus_beds(elements, ref_name, ref_te_bed, taxonomy, out_dir)
        run_retroseq(bam, bed_location_file, ref_fasta, script_dir, sample_name, out_dir, config.PARAMS, log=log)
        with open(status_log,"w") as l:
            l.write("COMPLETED\n")
        mccutils.log("retroseq","RetroSeq complete")
    except Exception as e:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log,"a") as l:
            print(track, file=l)
        mccutils.log("retroseq","RetroSeq run failed")
        with open(status_log,"w") as l:
            l.write("FAILED\n")
        # create an empty output so the workflow can continue past this rule
        mccutils.run_command(["touch", snakemake.output[0]])
def make_redundant_bed(insertions, sample_name, out_dir):
    """Write the redundant RelocaTE2 BED (all insertions, sorted) and return
    the insertions in sorted order with a unique index appended to each name.

    The insertions are written unsorted, sorted with bedtools, then read back;
    each sorted line is mapped back to its Insertion object via a dict keyed
    on the tab-joined BED fields.  NOTE: Insertion.name is mutated in place,
    and insertions producing identical BED lines collapse to one dict entry —
    presumably names are unique per line; verify with upstream callers.
    """
    tmp_bed = out_dir + "/tmp.bed"
    insertion_dict = {}
    out_inserts = []
    # key = the exact BED line (with "_" separators) so sorted lines can be
    # mapped back to their Insertion objects
    for insert in insertions:
        insertion_dict["_".join([
            insert.chromosome,
            str(insert.start - 1),
            str(insert.end), insert.name, "0", insert.strand
        ])] = insert
    with open(tmp_bed, "w") as out:
        for insert in insertions:
            # BED is 0-based half-open; insertion starts are stored 1-based
            out_line = "\t".join([
                insert.chromosome,
                str(insert.start - 1),
                str(insert.end), insert.name, "0", insert.strand
            ])
            out.write(out_line + "\n")
    sorted_bed = out_dir + "/sorted.bed"
    command = ["bedtools", "sort", "-i", tmp_bed]
    mccutils.run_command_stdout(command, sorted_bed)
    redundant_bed = out_dir + "/" + sample_name + "_relocate2_redundant.bed"
    with open(redundant_bed, "w") as outbed:
        # UCSC track header for genome-browser loading
        header = 'track name="' + sample_name + '_RelocaTE2" description="' + sample_name + '_RelocaTE2"\n'
        outbed.write(header)
        with open(sorted_bed, "r") as inbed:
            for x, line in enumerate(inbed):
                # outputs inserts in sorted order with unique number added to name
                key = line.replace("\t", "_")
                key = key.replace("\n", "")
                insert = insertion_dict[key]
                insert.name += str(x + 1)
                out_inserts.append(insert)
                # write to bed with unique number added to name
                split_line = line.split("\t")
                split_line[3] += str(x + 1)
                line = "\t".join(split_line)
                outbed.write(line)
    mccutils.remove(tmp_bed)
    mccutils.remove(sorted_bed)
    return out_inserts
def main():
    """Run the PopoolationTE2 calling chain on the prepared BAM.

    Stage order is fixed by the tool: physical pileup -> subsample ->
    identify signatures -> update strand -> estimate frequency -> pair up
    signatures into TE insertion calls.  The input BAM is removed at the end.
    """
    mccutils.log("popoolationte2", "running PopoolationTE2")
    ref_fasta = snakemake.input.ref_fasta
    bam = snakemake.input.bam
    taxonomy = snakemake.input.taxonomy
    jar = snakemake.params.jar
    out_dir = snakemake.params.out_dir
    sample_name = snakemake.params.sample_name
    log = snakemake.params.log
    # rebind taxonomy to the PopoolationTE2-formatted copy
    taxonomy = format_taxonomy(taxonomy, out_dir)
    ppileup = popoolationte2_ppileup(jar, config.ppileup, bam, taxonomy, out_dir, log=log)
    ppileup = popoolationte2_subsample(jar, config.subsampleppileup, ppileup, out_dir, log=log)
    signatures = popoolationte2_signatures(jar, config.identifySignatures, ppileup, out_dir, log=log)
    signatures = popoolationte2_strand(jar, config.updateStrand, signatures, bam, taxonomy, out_dir, log=log)
    signatures = popoolationte2_frequency(jar, ppileup, signatures, out_dir, log=log)
    te_insertions = popoolationte2_pairup(jar, config.pairupSignatures, signatures, ref_fasta, taxonomy, out_dir, log=log)
    # the BAM is no longer needed once calls have been made
    mccutils.remove(bam)
def main():
    """Coverage module: mask the genome, augment it with TE sequences, map
    reads, and produce per-TE depth tables and plots.

    Depth is normalized against the average coverage of non-TE regions of
    the reference.
    """
    mcc_out = snakemake.config["args"]['out']
    mccutils.mkdir(mcc_out+"/results/")
    coverage_out = mcc_out+"/results/coverage/"
    mccutils.mkdir(coverage_out)
    # ensures intermediate files from previous runs are removed
    for f in os.listdir(coverage_out):
        mccutils.remove(coverage_out+"/"+f)
    run_id = snakemake.config['args']['run_id']
    te_seqs = snakemake.input.consensus
    log = snakemake.params.log
    # always use consensus fasta for masking the genome
    mccutils.mkdir(coverage_out+"/input")
    mccutils.mkdir(coverage_out+"/te-depth-files")
    masked_reference, masked_gff = repeatmask_genome(snakemake.input.ref, te_seqs, snakemake.threads, run_id, coverage_out, log)
    # uses coverage fasta (if exists) for augmenting and coverage analysis
    if snakemake.config['in']['coverage_fasta'] != "None":
        te_seqs = snakemake.input.coverage_fa
    # append the TE sequences to the masked genome so reads can map to them
    augmented_reference = augment_genome(masked_reference, te_seqs, coverage_out)
    index_genome(snakemake.input.ref, log)
    index_genome(augmented_reference, log)
    # single-end vs paired-end mapping
    if snakemake.config['in']['fq2'] == "None":
        sam = map_reads(augmented_reference, snakemake.input.fq1, snakemake.threads, snakemake.params.sample, run_id, coverage_out, log)
    else:
        sam = map_reads(augmented_reference, snakemake.input.fq1, snakemake.threads, snakemake.params.sample, run_id, coverage_out, log, fq2=snakemake.input.fq2)
    bam = sam_to_bam(sam, augmented_reference, snakemake.params.sample, snakemake.threads, run_id, coverage_out, log)
    # genome-wide baseline depth is computed over non-TE regions only
    nonte_bed = make_nonte_bed(snakemake.input.ref, masked_gff, run_id, coverage_out, log)
    genome_depth = get_genome_depth(nonte_bed, bam, run_id, coverage_out, log)
    # optionally ignore TE edges (fixed length or estimated read length)
    edge_trim = 0
    if config.OMIT_EDGES:
        if config.OMIT_EDGES_READ_LENGTH:
            edge_trim = mccutils.estimate_read_length(snakemake.input.fq1)
        else:
            edge_trim = config.OMIT_EDGES_LENGTH
    te_names, all_coverage_files, uniq_coverage_files, avg_norm_te_depths = make_depth_table(te_seqs, bam, genome_depth, run_id, coverage_out, snakemake.output[0], log, trim_edges=edge_trim)
    make_plots(te_names, all_coverage_files, uniq_coverage_files, avg_norm_te_depths, genome_depth, snakemake.params.sample, coverage_out, trim_edges=edge_trim)
    mccutils.remove(sam)
def sam_to_bam(sam, reference, sample_name, threads, run_id, out, log):
    """Convert a SAM to a sorted, indexed BAM and return the sorted BAM path."""
    mccutils.log("coverage","converting SAM to BAM, and indexing", log=log)
    thread_count = str(threads)
    intermediate_bam = out + "/input/" + run_id + "_tmp.bam"
    # -t <ref>.fai supplies the header sequence dictionary
    view_cmd = ["samtools", "view", "-Sb", "-@", thread_count, "-t", reference + ".fai", sam]
    mccutils.run_command_stdout(view_cmd, intermediate_bam, log=log)
    final_bam = out + "/input/" + run_id + "_" + sample_name + ".bam"
    mccutils.run_command_stdout(["samtools", "sort", "-@", thread_count, intermediate_bam], final_bam, log=log)
    mccutils.run_command(["samtools", "index", final_bam], log=log)
    mccutils.remove(intermediate_bam)
    return final_bam
def main():
    """Entry point: validate modules and inputs, build the run config, run
    the workflow, and clean up the temp directory."""
    invocation = " ".join(["python3"] + sys.argv)
    launch_dir = os.getcwd()
    expected_configs = sysconfig.CONFIGS
    args = parse_args(expected_configs)

    # make the user-supplied config directory importable
    sys.path = [args.config] + sys.path

    for subdir in ("/logs", "/tmp"):
        mccutils.mkdir(args.out + subdir)

    # verify required tool installs before touching any input data
    check_installed_modules(
        args.methods,
        sysconfig.NO_INSTALL_METHODS,
        config_install.MD5,
        os.path.dirname(os.path.abspath(__file__)) + "/install/",
    )
    check_input_files(
        args.reference,
        args.consensus,
        args.first,
        fq2=args.second,
        locations=args.locations,
        taxonomy=args.taxonomy,
        coverage_fasta=args.coverage_fasta,
        augment_fasta=args.augment,
        annotations_only=args.make_annotations,
    )

    ref_name = mccutils.get_base_name(args.reference)
    run_id, out_files = make_run_config(args, args.sample_name, ref_name, invocation, launch_dir, debug=args.debug)
    run_workflow(args, args.sample_name, ref_name, run_id, out_files, debug=args.debug, annotations_only=args.make_annotations)
    mccutils.remove(args.out + "/tmp")
def main():
    """Download and unpack TE-Locate, recording the archive md5 as version."""
    tarball = snakemake.params.tar

    # fresh md5-verified download, retried up to 3 times
    mccutils.remove(tarball)
    if not mccutils.download(snakemake.params.url, tarball, md5=snakemake.params.md5, max_attempts=3):
        print("TE-Locate download failed... exiting...")
        print("try running --install with --clean for clean installation")
        sys.exit(1)

    install_dir = snakemake.config['paths']['install'] + "/tools/te-locate/"
    mccutils.run_command(["tar", "-xvf", tarball, "-C", install_dir], log=snakemake.params.log)

    # write version to file
    with open(install_dir + "version.log", "w") as version:
        version.write(snakemake.params.md5)
def main():
    """Index the reference, map reads, and emit a sorted BAM for PopoolationTE2."""
    mccutils.log("popoolationte2", "setting up for PopoolationTE2")
    ref_fasta = snakemake.input.ref_fasta
    fq1 = snakemake.input.fq1
    fq2 = snakemake.input.fq2
    log = snakemake.params.log
    out_dir = snakemake.params.out_dir
    threads = snakemake.threads

    index_fasta(ref_fasta, log=log)
    raw_sam = map_reads(ref_fasta, fq1, fq2, out_dir, threads=threads, log=log)
    unsorted_bam = sam_to_bam(raw_sam, out_dir + "/tmp.bam", threads=threads, log=log)
    sort_bam(unsorted_bam, snakemake.output.bam, threads=threads, log=log)

    # drop intermediates; only the sorted output BAM is kept
    mccutils.remove(raw_sam)
    mccutils.remove(unsorted_bam)
def main():
    """PopoolationTE preprocessing: format reads, map both mates, merge and
    sort the alignments into the SAM PopoolationTE consumes."""
    mccutils.log("popoolationte", "running PopoolationTE preprocessing steps")
    ref_fasta = snakemake.input.ref_fasta
    fq1 = snakemake.input.fq1
    fq2 = snakemake.input.fq2
    out_dir = snakemake.params.out_dir
    sample_name = snakemake.params.sample_name
    log = snakemake.params.log
    script_dir = snakemake.params.script_dir
    threads = snakemake.threads

    # ensures intermediate files from previous runs are removed
    for stale in os.listdir(out_dir):
        mccutils.remove(out_dir + "/" + stale)

    mccutils.log("popoolationte", "formatting read names")
    fq1, fq2 = format_read_names(fq1, fq2, sample_name, out_dir)

    mccutils.log("popoolationte", "indexing popoolationTE reference fasta", log=log)
    index_fasta(ref_fasta, log=log)

    mccutils.log("popoolationte", "mapping fastq1 reads", log=log)
    sam1 = map_reads(fq1, ref_fasta, threads=threads, log=log)
    mccutils.log("popoolationte", "mapping fastq2 reads", log=log)
    sam2 = map_reads(fq2, ref_fasta, threads=threads, log=log)

    mccutils.log("popoolationte", "combining alignments", log=log)
    combined_sam = combine_alignments(sam1, sam2, fq1, fq2, script_dir, out_dir, log=log)

    # SAM -> BAM -> sorted BAM -> sorted SAM (PopoolationTE needs SAM input)
    mccutils.log("popoolationte", "sorting sam file", log=log)
    bam = sam_to_bam(combined_sam, threads=threads, log=log)
    sorted_bam = sort_bam(bam, threads=threads, log=log)
    sorted_sam = bam_to_sam(sorted_bam, threads=threads, log=log)

    # everything except the final sorted SAM can go
    for intermediate in (sam1, sam2, combined_sam, bam, sorted_bam):
        mccutils.remove(intermediate)

    mccutils.log("popoolationte", "PopoolationTE preprocessing complete")
def main():
    """Prepare the input reads for McClintock: sanity-check the fastqs,
    optionally adapter-trim with Trim Galore, and place unzipped copies at
    the snakemake output paths.

    On failure the partially-written outputs are removed and the process
    exits non-zero.
    """
    fq1 = snakemake.input.fq1
    fq2 = snakemake.params.fq2
    methods = snakemake.params.methods.split(",")
    processors = snakemake.threads
    mcc_out = snakemake.params.out
    run_id = snakemake.params.run_id
    log = snakemake.params.log
    # now = datetime.now()
    # start = now.strftime("%Y-%m-%d %H:%M:%S")
    mccutils.log("processing", "prepping reads for McClintock")
    # trims adaptors of input fastq(s); defaults to the untrimmed inputs
    trimmedfq = fq1
    trimmedfq2 = fq2
    try:
        check_fastqs(fq1, fq2, mcc_out, min_length=30, log=log)
        if "trimgalore" in methods:
            mccutils.log("processing", "running trim_galore", log=log)
            # fq2 == "None" signals a single-end run
            if fq2 == "None":
                flags = trimgalore.SINGLE_END_FLAGS
                trimmedfq = run_trim_galore(fq1, run_id, log, mcc_out, cores=processors, flags=flags)
            else:
                flags = trimgalore.PAIRED_END_FLAGS
                trimmedfq, trimmedfq2 = run_trim_galore(fq1, run_id, log, mcc_out, fq2=fq2, cores=processors, flags=flags)
            run_multiqc(mcc_out+"/results/trimgalore/")
        # make unzipped copies in mcc input dir
        make_copies(trimmedfq, trimmedfq2, snakemake.output[0], snakemake.output[1])
        # removes trimmed read files from trimgalore directory
        if trimmedfq != fq1:
            mccutils.remove(trimmedfq)
        if trimmedfq2 != fq2:
            mccutils.remove(trimmedfq2)
    except Exception as e:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        print("ERROR processing of FastQ files failed...check that your FastQ files are formatted correctly...Exiting...", file=sys.stderr)
        # remove any partially-written outputs before exiting
        mccutils.remove(snakemake.output[0])
        mccutils.remove(snakemake.output[1])
        sys.exit(1)
    # now = datetime.now()
    # end = now.strftime("%Y-%m-%d %H:%M:%S")
    # mccutils.log("setup_reads", "start: "+start)
    # mccutils.log("setup_reads", "end: "+end)
    mccutils.log("processing", "read setup complete")