def get_non_absent_ref_tes(te_gff, absence_bed, sample, out, log):
    """Collect reference TEs that no absence call overlaps.

    ``bedtools subtract -A`` removes every feature of ``te_gff`` that
    overlaps an interval of ``absence_bed``; each remaining GFF record is
    turned into an ``output.Insertion`` flagged as ``"reference"``.

    Args:
        te_gff: path to the reference TE annotation (GFF).
        absence_bed: path to the BED file of absence calls.
        sample: sample name embedded in every insertion name.
        out: directory that receives the temporary subtracted GFF.
        log: log destination forwarded to the command runner.

    Returns:
        List of insertion objects in the order they appear in the
        subtracted GFF.
    """
    surviving_gff = out + "/tmp.ref_nonabs.gff"
    mccutils.run_command_stdout(
        ["bedtools", "subtract", "-A", "-a", te_gff, "-b", absence_bed],
        surviving_gff,
        log=log,
    )

    non_absent = []
    with open(surviving_gff, "r") as handle:
        for record in handle:
            # Any line containing '#' (anywhere) is skipped as a comment.
            if "#" in record:
                continue
            # Flatten the ';'-separated attribute column into extra fields.
            fields = record.replace(";", "\t").split("\t")
            te = output.Insertion(output.Temp())
            te.chromosome = fields[0]
            te.start = int(fields[3])
            te.end = int(fields[4])
            # Field 9 is the second attribute; its value (after '=') is the
            # leading component of the insertion name.
            te.name = fields[9].split("=")[1] + "|reference|NA|" + sample + "|temp|nonab|"
            te.strand = fields[6]
            te.type = "reference"
            non_absent.append(te)

    mccutils.remove(surviving_gff)
    return non_absent
def process_bed(bed, chromosomes, sample_name, log, out_dir, min_read_cutoff=0):
    """Filter, rename and sort ngs_te_mapper calls into the final BED track.

    Rows of ``bed`` are kept when field 7 (read as an int, after ';' has
    been turned into a tab) is strictly greater than ``min_read_cutoff``
    and the chromosome is listed in ``chromosomes``. Kept rows are written
    as six-column BED, sorted with ``bedtools sort`` and prefixed with a
    track header. When nothing is kept, an empty output file is created
    with ``touch`` instead (no header).

    Args:
        bed: path to the raw calls.
        chromosomes: collection of accepted chromosome names.
        sample_name: sample name used in record names and the output file.
        log: log destination forwarded to the command runner.
        out_dir: directory for temporary and final files.
        min_read_cutoff: rows must have strictly more reads than this.
    """
    raw_bed = out_dir + "/unsorted.bed"
    result_bed = out_dir + "/" + sample_name + "_ngs_te_mapper_nonredundant.bed"

    with open(raw_bed, "w") as writer, open(bed, "r") as reader:
        kept = 0
        # The numeric suffix is the 1-based row number in the input file,
        # so it is not contiguous once rows are filtered out.
        for row_number, row in enumerate(reader, start=1):
            cols = row.replace(";", "\t").split("\t")
            if not (int(cols[7]) > min_read_cutoff and cols[0] in chromosomes):
                continue
            kept += 1
            record_name = (
                cols[5] + "_" + cols[8].replace("\n", "") + "_" + sample_name
                + "_ngs_te_mapper_sr_" + str(row_number)
            )
            fields = [cols[0], cols[1], cols[2], record_name, "0", cols[4]]
            writer.write("\t".join(fields) + "\n")

    if kept >= 1:
        ordered_bed = out_dir + "/sorted.bed"
        mccutils.run_command_stdout(
            ["bedtools", "sort", "-i", raw_bed], ordered_bed, log=log)

        with open(result_bed, "w") as final:
            final.write(
                'track name="' + sample_name + '_ngs_te_mapper" description="'
                + sample_name + '_ngs_te_mapper"\n')
            with open(ordered_bed, "r") as ordered:
                for row in ordered:
                    final.write(row)
        mccutils.remove(ordered_bed)
    else:
        mccutils.run_command(["touch", result_bed])

    mccutils.remove(raw_bed)
def map_reads(out_dir, fq1, fq2, threads=1, log=None):
    """Align paired reads to the TEFLoN mapping reference.

    Steps: ``bwa index`` the reference, ``bwa mem -Y`` both FASTQ files,
    convert SAM to BAM, sort, index, then delete the unsorted SAM/BAM.

    Args:
        out_dir: TEFLoN working directory.
        fq1: first FASTQ file.
        fq2: second FASTQ file.
        threads: thread count for ``bwa mem`` and ``samtools sort``.
        log: log destination forwarded to the command runner.

    Returns:
        Path of the sorted, indexed BAM.
    """
    # NOTE(review): the reference is joined with "/" while the outputs are
    # not -- presumably out_dir ends with a path separator; confirm with
    # the caller before normalising.
    mapping_ref = out_dir + "/teflon.prep_MP/teflon.mappingRef.fa"
    sam_path = out_dir + "teflon.sam"
    bam_path = out_dir + "teflon.bam"
    sorted_bam = out_dir + "teflon.sorted.bam"

    mccutils.run_command(["bwa", "index", mapping_ref], log=log)

    align = ["bwa", "mem", "-t", str(threads), "-Y", mapping_ref, fq1, fq2]
    mccutils.run_command_stdout(align, sam_path, log=log)

    mccutils.run_command_stdout(
        ["samtools", "view", "-Sb", sam_path], bam_path, log=log)

    mccutils.run_command(
        ["samtools", "sort", "-@", str(threads), "-o", sorted_bam, bam_path],
        log=log)
    mccutils.run_command(["samtools", "index", sorted_bam], log=log)

    for intermediate in (sam_path, bam_path):
        mccutils.remove(intermediate)

    return sorted_bam
def make_nonte_bed(reference, masked_gff, run_id, out, log):
    """Create a BED file of the reference regions not annotated as TEs.

    The RepeatMasker GFF is converted to BED and sorted, a two-column
    genome file (name, length) is derived from ``reference + ".fai"``,
    and ``bedtools complement`` produces the non-TE intervals. The
    intermediate files are removed before returning.

    Args:
        reference: path to the reference FASTA; its ``.fai`` index must
            already exist next to it.
        masked_gff: RepeatMasker GFF of reference TEs.
        run_id: run identifier used as the file-name prefix.
        out: output root; files are written under ``out + "/input/"``.
        log: log destination forwarded to the command runner.

    Returns:
        Path of the non-TE BED file.
    """
    mccutils.log("coverage", "creating BED file of non-TE regions", log=log)
    prefix = out + "/input/" + run_id

    masked_bed = prefix + "_ref_tes.bed"
    repeatmasker_gff_to_bed(masked_gff, masked_bed)

    sorted_bed = prefix + "_ref_tes_sorted.bed"
    mccutils.run_command_stdout(
        ["bedtools", "sort", "-i", masked_bed], sorted_bed, log=log)

    # The previous implementation also scanned the whole reference FASTA to
    # collect chromosome names that were never used; the .fai index already
    # provides everything bedtools needs, so that pass was dropped.
    chrom_idx = prefix + "_ref.genome"
    with open(reference + ".fai", "r") as faidx, open(chrom_idx, "w") as genome:
        for line in faidx:
            split_line = line.split("\t")
            # keep only sequence name and length
            genome.write("\t".join([split_line[0], split_line[1]]) + "\n")

    non_te_bed = prefix + "_ref_nonte.bed"
    command = ["bedtools", "complement", "-i", sorted_bed, "-g", chrom_idx]
    mccutils.run_command_stdout(command, non_te_bed, log=log)

    for tmp_file in (masked_bed, sorted_bed, chrom_idx):
        mccutils.remove(tmp_file)

    return non_te_bed
def main():
    """Concatenate the three Snakemake inputs into the PopoolationTE reference.

    Runs ``cat`` on ``snakemake.input[0..2]`` and writes the result to
    ``snakemake.output[0]``.
    """
    mccutils.log("processing", "making PopoolationTE reference fasta")
    pieces = [snakemake.input[0], snakemake.input[1], snakemake.input[2]]
    mccutils.run_command_stdout(["cat"] + pieces, snakemake.output[0])
    mccutils.log("processing", "PopoolationTE reference fasta created")
def map_reads(ref, fq1, fq2, out, threads=1, log=None):
    """Align two FASTQ files with ``bwa bwasw`` and return the SAM path.

    Args:
        ref: reference FASTA passed to bwa.
        fq1: first FASTQ file.
        fq2: second FASTQ file.
        out: directory that receives ``mapped.sam``.
        threads: value for bwa's ``-t`` option.
        log: log destination forwarded to the command runner.

    Returns:
        Path of the SAM file (``out + "/mapped.sam"``).
    """
    mccutils.log("popoolationte2", "mapping reads", log=log)
    sam_path = out + "/" + "mapped.sam"
    bwasw = ["bwa", "bwasw", "-t", str(threads), ref, fq1, fq2]
    mccutils.run_command_stdout(bwasw, sam_path, log=log)
    return sam_path
def get_non_absent_ref_tes(te_gff, absence_bed, sample, out, log):
    """Collect reference TEs that no absence call overlaps.

    ``bedtools subtract -A`` removes every feature of ``te_gff`` that
    overlaps an interval of ``absence_bed``; each remaining GFF record
    becomes an ``mccutils.Insertion`` of type ``"reference"`` whose
    TEMP-specific fields are all set to the placeholder ``"!"``.

    Args:
        te_gff: path to the reference TE annotation (GFF).
        absence_bed: path to the BED file of absence calls.
        sample: sample name embedded in every insertion name.
        out: directory that receives the temporary subtracted GFF.
        log: log destination forwarded to the command runner.

    Returns:
        List of insertion objects in the order they appear in the
        subtracted GFF.
    """
    surviving_gff = out+"/tmp.ref_nonabs.gff"
    mccutils.run_command_stdout(
        ["bedtools", "subtract", "-A", "-a", te_gff, "-b", absence_bed],
        surviving_gff,
        log=log,
    )

    # TEMP fields have no value for reference TEs; "!" marks them as unset.
    placeholder_fields = (
        "support",
        "classification",
        "junction1Support",
        "junction2Support",
        "junction1",
        "junction2",
        "frequency",
    )

    non_absent = []
    with open(surviving_gff, "r") as handle:
        for record in handle:
            # Any line containing '#' (anywhere) is skipped as a comment.
            if "#" in record:
                continue
            # Flatten the ';'-separated attribute column into extra fields.
            fields = record.replace(";", "\t").split("\t")
            te = mccutils.Insertion()
            te.chromosome = fields[0]
            te.start = int(fields[3])
            te.end = int(fields[4])
            te.name = fields[9].split("=")[1]+"_reference_"+sample+"_temp_nonab_"
            te.strand = fields[6]
            for field in placeholder_fields:
                setattr(te.temp, field, "!")
            te.type = "reference"
            non_absent.append(te)

    mccutils.remove(surviving_gff)
    return non_absent
def make_nonredundant_bed(insertions, sample_name, out_dir):
    """Write the non-redundant TE-locate BED track.

    Insertions sharing chromosome and end coordinate are collapsed to a
    single record: the one with the highest ``read_pair_support`` (the
    first one seen wins ties). The survivors are written as six-column
    BED (start shifted by -1), sorted with ``bedtools sort`` and prefixed
    with a track header.

    Args:
        insertions: iterable of insertion objects exposing ``chromosome``,
            ``start``, ``end``, ``name``, ``strand`` and
            ``read_pair_support``.
        sample_name: sample name used in the header and output file name.
        out_dir: directory for temporary and final files.
    """
    best_by_site = {}
    for insert in insertions:
        key = "_".join([insert.chromosome, str(insert.end)])
        current = best_by_site.get(key)
        # Fix: the comparison used to be inverted, which replaced the stored
        # call whenever it had MORE support and so kept the weakest call.
        if current is None or insert.read_pair_support > current.read_pair_support:
            best_by_site[key] = insert

    tmp_bed = out_dir+"/tmp_telocate_nonredundant.bed"
    with open(tmp_bed, "w") as outbed:
        for insert in best_by_site.values():
            out_line = "\t".join([
                insert.chromosome,
                str(insert.start-1),
                str(insert.end),
                insert.name,
                "0",
                insert.strand,
            ])
            outbed.write(out_line+"\n")

    sorted_bed = out_dir+"/sorted.bed"
    mccutils.run_command_stdout(["bedtools", "sort", "-i", tmp_bed], sorted_bed)

    nonredundant_bed = out_dir+"/"+sample_name+"_telocate_nonredundant.bed"
    with open(sorted_bed, "r") as inbed, open(nonredundant_bed, "w") as outbed:
        header = 'track name="'+sample_name+'_TE-locate" description="'+sample_name+'_TE-locate"\n'
        outbed.write(header)
        for line in inbed:
            outbed.write(line)

    mccutils.remove(tmp_bed)
    mccutils.remove(sorted_bed)
def sort_bam(bam, threads=1, log=None):
    """Coordinate-sort a BAM file with ``samtools sort``.

    The output name is the input name with its last dot-separated
    component replaced by ``sorted.bam``.

    Args:
        bam: path of the BAM file to sort.
        threads: value for samtools' ``-@`` option.
        log: log destination forwarded to the command runner.

    Returns:
        Path of the sorted BAM.
    """
    name_parts = bam.split(".")
    name_parts[-1] = "sorted.bam"
    sorted_bam = ".".join(name_parts)

    # samtools writes the result itself via -o. The earlier version also
    # redirected stdout to the same path, so the file was opened twice.
    # Options precede the input so parsing does not depend on getopt
    # argument permutation.
    command = ["samtools", "sort", "-@", str(threads), "-o", sorted_bam, bam]
    mccutils.run_command(command, log=log)

    return sorted_bam
def map_reads(fq, fasta, threads=1, log=None):
    """Align one FASTQ file to ``fasta`` with ``bwa bwasw``.

    The SAM path is the FASTQ path with its last dot-separated component
    replaced by ``sam``.

    Args:
        fq: FASTQ file to align.
        fasta: reference FASTA passed to bwa.
        threads: value for bwa's ``-t`` option.
        log: log destination forwarded to the command runner.

    Returns:
        Path of the SAM file.
    """
    sam_path = ".".join(fq.split(".")[:-1] + ["sam"])
    mccutils.run_command_stdout(
        ["bwa", "bwasw", "-t", str(threads), fasta, fq], sam_path, log=log)
    return sam_path
def bam_to_sam(bam, threads=1, log=None):
    """Dump a BAM file as SAM text with ``samtools view``.

    The SAM path is the BAM path with its last dot-separated component
    replaced by ``sam``. No ``-h`` flag is passed to samtools.

    Args:
        bam: BAM file to convert.
        threads: value for samtools' ``-@`` option.
        log: log destination forwarded to the command runner.

    Returns:
        Path of the SAM file.
    """
    sam_path = ".".join(bam.split(".")[:-1] + ["sam"])
    view = ["samtools", "view", "-@", str(threads), bam]
    mccutils.run_command_stdout(view, sam_path, log=log)
    return sam_path
def sam_to_bam(sam, threads=1, log=None):
    """Convert a SAM file to BAM with ``samtools view -Sb``.

    The BAM path is the SAM path with its last dot-separated component
    replaced by ``bam``.

    Args:
        sam: SAM file to convert.
        threads: value for samtools' ``-@`` option.
        log: log destination forwarded to the command runner.

    Returns:
        Path of the BAM file.
    """
    bam_path = ".".join(sam.split(".")[:-1] + ["bam"])
    view = ["samtools", "view", "-Sb", "-@", str(threads), sam]
    mccutils.run_command_stdout(view, bam_path, log=log)
    return bam_path
def _write_normalized_cov(depth_file, normalized_file, genome_depth):
    """Write ``depth_file`` to ``normalized_file`` with depths divided by ``genome_depth``."""
    chrom, positions, depths = read_samtools_depth_file(depth_file)
    with open(normalized_file, "w") as covfile:
        for i, pos in enumerate(positions):
            cov = str(round(depths[i]/genome_depth, 2))
            covfile.write("\t".join([chrom, str(pos), cov])+"\n")


def make_depth_table(te_fasta, bam, genome_depth, run_id, out, depth_csv, log, trim_edges=0):
    """Compute per-TE depth files and the normalized depth table.

    For every header line of ``te_fasta`` the function runs
    ``samtools depth`` twice on ``bam`` (``-Q 1`` for the "highQ" file,
    ``-Q 0`` for the "allQ" file), writes a genome-depth-normalized copy
    of each, and appends one row to ``depth_csv`` with the normalized
    average depths rounded to two decimals.

    Args:
        te_fasta: FASTA of TE sequences; each header (minus ``>`` and the
            newline) is used as the samtools region and file-name stem.
        bam: BAM file to measure.
        genome_depth: divisor used for normalization.
        run_id: unused; kept for interface compatibility.
        out: output root; depth files go to ``out + "/te-depth-files"``.
        depth_csv: path of the CSV table (overwritten).
        log: log destination forwarded to the command runner.
        trim_edges: forwarded to ``get_avg_depth``.

    Returns:
        Tuple ``(te_names, all_coverage_files, uniq_coverage_files,
        avg_norm_depths)`` of parallel lists, one entry per TE.
    """
    mccutils.log("coverage","creating TE depth coverage table", log=log)
    with open(depth_csv, "w") as table:
        table.write("TE-Family,Normalized-Depth,Normalized-Unique-Depth"+"\n")

    # Created once up front; it used to be re-created for every TE.
    depth_dir = out+"/te-depth-files"
    mccutils.mkdir(depth_dir)

    te_names = []
    uniq_coverage_files = []
    all_coverage_files = []
    avg_norm_depths = []

    with open(te_fasta,"r") as fa:
        for line in fa:
            if ">" not in line:
                continue

            te_name = line.replace("\n","").replace(">","")
            stem = depth_dir+"/"+te_name

            highQ = stem+".highQ.cov"
            command = ["samtools", "depth", "-aa", "-r", te_name, bam, "-d", "0", "-Q", "1"]
            mccutils.run_command_stdout(command, highQ, log=log)

            allQ = stem+".allQ.cov"
            command = ["samtools", "depth", "-aa", "-r", te_name, bam, "-d", "0", "-Q", "0"]
            mccutils.run_command_stdout(command, allQ, log=log)

            # make normalized coverage files
            _write_normalized_cov(allQ, stem+".allQ.normalized.cov", genome_depth)
            _write_normalized_cov(highQ, stem+".highQ.normalized.cov", genome_depth)

            avg_norm_depth = get_avg_depth(allQ, trim_edges=trim_edges)/genome_depth
            avg_uniq_norm_depth = get_avg_depth(highQ, trim_edges=trim_edges)/genome_depth

            with open(depth_csv, "a") as table:
                table.write(te_name+","+str(round(avg_norm_depth,2))+","+str(round(avg_uniq_norm_depth,2))+"\n")

            te_names.append(te_name)
            uniq_coverage_files.append(highQ)
            all_coverage_files.append(allQ)
            avg_norm_depths.append(avg_norm_depth)

    return te_names, all_coverage_files, uniq_coverage_files, avg_norm_depths
def get_genome_depth(non_te_bed, bam, run_id, out, log):
    """Return the average depth of ``bam`` over the non-TE regions.

    ``samtools depth -aa -b non_te_bed`` is written to a temporary file,
    averaged with ``get_avg_depth`` and the file is removed.

    Args:
        non_te_bed: BED file restricting the depth calculation.
        bam: BAM file to measure.
        run_id: run identifier used in the temporary file name.
        out: output root; the temporary file lives under ``out + "/input/"``.
        log: log destination forwarded to the command runner.

    Returns:
        Whatever ``get_avg_depth`` returns for the depth file.
    """
    mccutils.log("coverage","determining the coverage depth of the genome", log=log)
    tmp_depth = out+"/input/"+run_id+"genome.depth"
    mccutils.run_command_stdout(
        ["samtools", "depth", "-aa", "-b", non_te_bed, bam, "-d", "0"],
        tmp_depth,
        log=log,
    )
    average = get_avg_depth(tmp_depth)
    mccutils.remove(tmp_depth)
    return average
def map_reads(reference, fq1, threads, sample_name, run_id, out, log, fq2=None):
    """Align reads to the augmented reference with ``bwa mem``.

    A read group whose ID and SM are both ``sample_name`` is attached.
    ``fq2`` is appended to the command only when it is not ``None``.

    Args:
        reference: reference FASTA passed to bwa.
        fq1: first (or only) FASTQ file.
        threads: value for bwa's ``-t`` option.
        sample_name: sample name used in the read group and file name.
        run_id: run identifier used in the file name.
        out: output root; the SAM goes under ``out + "/input/"``.
        log: log destination forwarded to the command runner.
        fq2: optional second FASTQ file.

    Returns:
        Path of the SAM file.
    """
    mccutils.log("coverage","mapping reads to augmented reference genome", log=log)
    read_group = "@RG\\tID:"+sample_name+"\\tSM:"+sample_name
    reads = [fq1] if fq2 is None else [fq1, fq2]
    bwa_mem = ["bwa", "mem", "-t", str(threads), "-R", read_group, reference] + reads

    sam_path = out+"/input/"+run_id+"_"+sample_name+".sam"
    mccutils.run_command_stdout(bwa_mem, sam_path, log=log)
    return sam_path
def main():
    """Sort the input SAM with the system ``sort`` for TE-locate.

    ``sort -S 1G`` is run on ``snakemake.input[0]`` with its temporary
    directory set to ``<out>/tmp`` (from the Snakemake config), and the
    result is written to ``snakemake.output[0]``.
    """
    log = snakemake.params.log
    mccutils.log("processing", "sorting SAM file for compatibility with TE-locate", log=log)

    scratch_dir = snakemake.config['args']['out'] + "/tmp"
    sort_cmd = ["sort", "-S", "1G", "--temporary-directory=" + scratch_dir, snakemake.input[0]]
    mccutils.run_command_stdout(sort_cmd, snakemake.output[0], log=log)

    mccutils.log("processing", "TE-locate SAM created")
def main():
    """Convert the mapped SAM to a sorted, indexed BAM and write flagstat.

    Four samtools steps run in order (view, sort, index, flagstat). Any
    exception in a step prints the traceback and a step-specific message
    to stderr and exits with status 1.
    """
    log = snakemake.params.log
    mccutils.log("processing","Converting sam to bam", log=log)

    def abort(message, path):
        # Called from inside an except block, so format_exc() still sees
        # the active exception.
        print(traceback.format_exc(), file=sys.stderr)
        print(message, path, file=sys.stderr)
        sys.exit(1)

    # 1. SAM -> unsorted BAM
    try:
        view = ["samtools","view", "-@", str(snakemake.threads), "-Sb",
                "-t", snakemake.input.ref_idx, snakemake.input.sam]
        mccutils.run_command_stdout(view, snakemake.output.tmp_bam, log=log)
        mccutils.check_file_exists(snakemake.output.tmp_bam)
    except Exception:
        abort("ERROR...unable convert sam to bam using SAMtools...sam file:", snakemake.input.sam)

    # 2. sort (the output is given as a prefix without the ".bam" suffix)
    try:
        sort = ["samtools", "sort", "-@", str(snakemake.threads),
                snakemake.output.tmp_bam, snakemake.output.bam.replace(".bam", "")]
        mccutils.run_command(sort, log=log)
        mccutils.check_file_exists(snakemake.output.bam)
    except Exception:
        abort("ERROR...falied to sort the bam file using samtools sort...bam file:", snakemake.output.tmp_bam)

    # 3. index
    try:
        mccutils.run_command(["samtools", "index", snakemake.output.bam], log=log)
    except Exception:
        abort("ERROR...falied to index the bam file using samtools index...bam file:", snakemake.output.bam)

    # 4. flagstat
    try:
        flagstat = ["samtools", "flagstat", snakemake.output.bam]
        mccutils.run_command_stdout(flagstat, snakemake.output.flagstat, log=log)
        mccutils.check_file_exists(snakemake.output.flagstat)
    except Exception:
        abort("ERROR...falied to generate flagstat file using samtools flagstat...bam file:", snakemake.output.bam)

    mccutils.log("processing","sam to bam converted")
def make_redundant_bed(insertions, sample_name, out_dir):
    """Write the redundant RelocaTE2 BED track and number the insertions.

    Every insertion is written as six-column BED (start shifted by -1),
    the file is sorted with ``bedtools sort``, and each sorted row gets
    its 1-based rank appended to the name, both in the output BED and on
    the matching insertion object (mutated in place).

    Args:
        insertions: iterable of insertion objects exposing ``chromosome``,
            ``start``, ``end``, ``name`` and ``strand``.
        sample_name: sample name used in the header and output file name.
        out_dir: directory for temporary and final files.

    Returns:
        The insertion objects in sorted order, with numbered names.
    """
    def bed_fields(ins):
        return [ins.chromosome, str(ins.start - 1), str(ins.end), ins.name, "0", ins.strand]

    unsorted_path = out_dir + "/tmp.bed"

    # Map the '_'-joined BED fields back to the object so sorted rows can
    # be matched up again; a later duplicate overwrites an earlier one.
    by_fields = {"_".join(bed_fields(ins)): ins for ins in insertions}

    with open(unsorted_path, "w") as out:
        for ins in insertions:
            out.write("\t".join(bed_fields(ins)) + "\n")

    sorted_path = out_dir + "/sorted.bed"
    mccutils.run_command_stdout(["bedtools", "sort", "-i", unsorted_path], sorted_path)

    numbered = []
    final_path = out_dir + "/" + sample_name + "_relocate2_redundant.bed"
    with open(final_path, "w") as outbed:
        outbed.write(
            'track name="' + sample_name + '_RelocaTE2" description="' + sample_name + '_RelocaTE2"\n')
        with open(sorted_path, "r") as inbed:
            for rank, row in enumerate(inbed, start=1):
                suffix = str(rank)

                # Recover the object for this row and number its name.
                matched = by_fields[row.replace("\t", "_").replace("\n", "")]
                matched.name += suffix
                numbered.append(matched)

                # Apply the same suffix to the name column of the BED row.
                cols = row.split("\t")
                cols[3] += suffix
                outbed.write("\t".join(cols))

    mccutils.remove(unsorted_path)
    mccutils.remove(sorted_path)
    return numbered
def make_copies(fq1, fq2, fq1copy, fq2copy):
    """Create uncompressed working copies of the FASTQ inputs.

    A file whose last dot-separated component contains ``gz`` is
    decompressed with ``zcat``; anything else is copied with ``cp``. When
    ``fq2`` is the string ``"None"`` an empty ``fq2copy`` is created with
    ``touch``.

    Args:
        fq1: first FASTQ file.
        fq2: second FASTQ file, or the string ``"None"``.
        fq1copy: destination for the first copy.
        fq2copy: destination for the second copy.

    Returns:
        Tuple ``(fq1copy, fq2copy)``.
    """
    def materialize(source, destination):
        if "gz" in source.split(".")[-1]:
            mccutils.run_command_stdout(["zcat", source], destination)
        else:
            mccutils.run_command(["cp", source, destination])

    materialize(fq1, fq1copy)

    if fq2 == "None":
        mccutils.run_command(["touch", fq2copy])
    else:
        materialize(fq2, fq2copy)

    return fq1copy, fq2copy
def sam_to_bam(sam, reference, sample_name, threads, run_id, out, log):
    """Convert a SAM file to a sorted, indexed BAM.

    ``samtools view`` writes a temporary BAM, ``samtools sort`` streams
    the sorted result to the final BAM, ``samtools index`` indexes it and
    the temporary BAM is removed.

    Args:
        sam: SAM file to convert.
        reference: reference FASTA; ``reference + ".fai"`` is passed to
            ``samtools view -t``.
        sample_name: sample name used in the output file name.
        threads: value for samtools' ``-@`` option.
        run_id: run identifier used in the file names.
        out: output root; files go under ``out + "/input/"``.
        log: log destination forwarded to the command runner.

    Returns:
        Path of the sorted BAM.
    """
    mccutils.log("coverage","converting SAM to BAM, and indexing", log=log)
    n_threads = str(threads)
    prefix = out+"/input/"+run_id

    unsorted_bam = prefix+"_tmp.bam"
    mccutils.run_command_stdout(
        ["samtools", "view", "-Sb", "-@", n_threads, "-t", reference+".fai", sam],
        unsorted_bam,
        log=log,
    )

    final_bam = prefix+"_"+sample_name+".bam"
    mccutils.run_command_stdout(
        ["samtools", "sort", "-@", n_threads, unsorted_bam], final_bam, log=log)

    mccutils.run_command(["samtools", "index", final_bam], log=log)
    mccutils.remove(unsorted_bam)
    return final_bam
def main():
    """Map the sample reads to the reference with ``bwa mem``.

    The command is assembled from the Snakemake inputs and parameters:
    ``-C`` is added when the ``save_comments`` config value is truthy,
    a read group carrying the sample name is attached, and the second
    FASTQ is appended unless the configured ``fq2`` is ``"None"``. On any
    failure the traceback and a descriptive message are printed to stderr
    and the process exits with status 1.
    """
    import ast  # local import: only needed to parse the config flag

    mccutils.log("processing", "mapping reads to reference", log=snakemake.log[0])
    paired = snakemake.config['in']['fq2'] != "None"

    try:
        command = ["bwa", "mem"]
        # The flag is stored in the run config as the string of a Python
        # literal. It used to be passed to eval(), which executes arbitrary
        # code from the config file; literal_eval accepts the same literals
        # without executing anything.
        if ast.literal_eval(snakemake.config['args']['save_comments']):
            command.append("-C")

        command += [
            "-t", str(snakemake.threads),
            "-R", "@RG\\tID:" + snakemake.params.sample + "\\tSM:" + snakemake.params.sample,
            snakemake.input.ref,
            snakemake.input.fq1,
        ]
        if paired:
            command.append(snakemake.input.fq2)

        mccutils.run_command_stdout(command, snakemake.output[0], log=snakemake.log[0])
        mccutils.check_file_exists(snakemake.output[0])

    except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        if paired:
            print(
                "ERROR...unable to map reads (bwa mem) using reference fasta:",
                snakemake.input.ref, "and reads:", snakemake.input.fq1,
                snakemake.input.fq2, file=sys.stderr)
        else:
            print(
                "ERROR...unable to map reads (bwa mem) using reference fasta:",
                snakemake.input.ref, "and reads:", snakemake.input.fq1,
                file=sys.stderr)
        sys.exit(1)

    mccutils.log("processing", "read mapping complete")
def get_avg_coverage(ref, bam, out):
    """Return the mean depth over the positions reported by ``samtools depth``.

    Only depth lines whose sequence name is a record id of ``ref`` are
    counted. The depth output is staged in ``out + "/tmp"`` and removed
    afterwards.

    Args:
        ref: reference FASTA whose record ids select the depth lines.
        bam: BAM file to measure.
        out: directory for the temporary depth file.

    Returns:
        Mean depth rounded to three decimals, or ``0.0`` when no depth
        line matches a reference sequence.
    """
    # A set gives constant-time membership tests in the per-line loop.
    chrom = {str(record.id) for record in SeqIO.parse(ref, "fasta")}

    tmp = out + "/tmp"
    mccutils.run_command_stdout(['samtools', 'depth', bam], tmp)

    cov_total = 0
    pos = 0
    with open(tmp, "r") as depth:
        for line in depth:
            split_line = line.split("\t")
            if split_line[0] in chrom:
                pos += 1
                cov_total += int(split_line[2])

    mccutils.remove(tmp)

    # No matching position used to raise ZeroDivisionError.
    if pos == 0:
        return 0.0
    return round(cov_total / pos, 3)
def make_run_config(args, sample_name, ref_name, full_command, current_directory):
    """Write the JSON run configuration consumed by the Snakemake workflow.

    A random seven-digit run id is drawn, the snakemake/config and log
    directories are created under ``args.out``, the git commit of the
    installation is looked up (falling back to ``"?"``), the path
    templates from ``config`` / ``config_install`` are expanded, and
    everything is dumped to
    ``<out>/snakemake/config/config_<run_id>.json``.

    Args:
        args: parsed command-line arguments (``out``, ``methods``,
            ``reference``, ``proc`` and others are read).
        sample_name: sample name substituted into path templates.
        ref_name: reference name substituted into path templates.
        full_command: command line stored verbatim in the config.
        current_directory: directory stored as ``call_directory``.

    Returns:
        The integer run id.
    """
    run_id = random.randint(1000000, 9999999)
    run_tag = str(run_id)

    mccutils.mkdir(args.out + "/snakemake")
    mccutils.mkdir(args.out + "/snakemake/config")
    run_config = args.out + "/snakemake/config/config_" + run_tag + ".json"
    input_dir = args.out + "/method_input/"
    results_dir = args.out + "/results/"
    mcc_path = os.path.dirname(os.path.abspath(__file__))

    # Look up the git commit hash for the summary report.
    # NOTE(review): this changes the working directory of the process and
    # never restores it -- confirm callers rely on (or tolerate) that.
    git_commit = "?"
    try:
        os.chdir(mcc_path)
        commit_file = args.out + "/git-commit.txt"
        mccutils.run_command_stdout(["git", "rev-parse", "HEAD"], commit_file)
        with open(commit_file, "r") as handle:
            for row in handle:
                git_commit = row.replace("\n", "")
        mccutils.remove(commit_file)
    except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        print("Could not locate git commit hash...using '?' ", file=sys.stderr)
        git_commit = "?"

    mccutils.log("SETUP", "McClintock Version: " + git_commit)

    # Expand the output-path templates.
    # NOTE(review): config.OUT_PATHS is rewritten in place, not copied.
    out_files = config.OUT_PATHS
    for key in out_files:
        out_files[key] = (
            out_files[key]
            .replace(config.INPUT_DIR, input_dir)
            .replace(config.RESULTS_DIR, results_dir)
            .replace(config.SAMPLE_NAME, sample_name)
        )
    out_files_to_make = [out_files[method] for method in args.methods]

    now = datetime.now()
    log_dir = args.out + "/logs/" + now.strftime("%Y%m%d.%H%M%S") + "." + run_tag + "/"
    mccutils.mkdir(log_dir)

    chromosomes = [
        mccutils.replace_special_chars(str(record.id))
        for record in SeqIO.parse(args.reference, "fasta")
    ]

    max_threads = max(
        1,
        calculate_max_threads(
            args.proc, args.methods, config.MULTI_THREAD_METHODS, slow=args.slow),
    )

    data = {}
    data['args'] = {
        'proc': str(args.proc),
        'out': str(args.out),
        'log_dir': log_dir,
        'augment_fasta': str(args.augment),
        'mcc_path': mcc_path,
        'commit': git_commit,
        'sample_name': sample_name,
        'ref_name': ref_name,
        'run_id': run_tag,
        'methods': ",".join(args.methods),
        'out_files': ",".join(out_files_to_make),
        'save_comments': str(args.comments),
        'max_threads_per_rule': max_threads,
        'full_command': full_command,
        'call_directory': current_directory,
        'time': now.strftime("%Y-%m-%d %H:%M:%S"),
        "chromosomes": ",".join(chromosomes),
    }

    # Paths of the user-supplied input files.
    data["in"] = {
        'reference': str(args.reference),
        'consensus': str(args.consensus),
        'fq1': str(args.first),
        'fq2': str(args.second),
        'locations': str(args.locations),
        'taxonomy': str(args.taxonomy),
        'coverage_fasta': str(args.coverage_fasta),
    }

    # Where McClintock's intermediate copies are stored (also rewritten in
    # place on the config module's dict).
    data["mcc"] = config.INTERMEDIATE_PATHS
    for key in data["mcc"]:
        data["mcc"][key] = (
            data["mcc"][key]
            .replace(config.INPUT_DIR, input_dir)
            .replace(config.REF_NAME, ref_name)
            .replace(config.SAMPLE_NAME, sample_name)
        )

    # Recomputed here (after the chdir above) exactly as before, rather
    # than reusing mcc_path.
    env_path = os.path.dirname(os.path.abspath(__file__)) + "/install/envs/"
    data["envs"] = config_install.ENV
    for key in data["envs"]:
        data['envs'][key] = data['envs'][key].replace(config_install.ENV_PATH, env_path)

    with open(run_config, "w") as conf:
        json.dump(data, conf, indent=4)

    return run_id
def run_popoolationte(sam, reference, taxon, read_len, insert_size, max_dist, ref_inserts, script_dir, out_dir, log=None, identify_min_count=3, identify_min_qual=15, crosslink_site_shift=100, update_te_inserts_site_shift=100, estimate_polymorphism_min_qual=15, filter_min_count=5):
    """Run the six PoPoolationTE Perl scripts in sequence.

    Order: identify-te-insertsites, genomic-N-2gtf, crosslink-te-sites,
    update-teinserts-with-knowntes, estimate-polymorphism,
    filter-teinserts. Each step writes into ``out_dir`` and feeds the
    next. Nothing is returned; the final file is
    ``out_dir + "te-poly-filtered.txt"``.

    Args:
        sam: SAM file of mapped reads.
        reference: reference FASTA (input of genomic-N-2gtf).
        taxon: TE hierarchy file.
        read_len: read length, also used as narrow range and min distance.
        insert_size: value for ``--insert-distance``.
        max_dist: value for ``--max-dist``.
        ref_inserts: known TE insertions (``--known``).
        script_dir: prefix prepended to each script name.
        out_dir: prefix prepended to each output file name.
        log: log destination forwarded to the command runner.
        identify_min_count: ``--min-count`` of the identify step.
        identify_min_qual: ``--min-map-qual`` of the identify step.
        crosslink_site_shift: ``--single-site-shift`` of the crosslink step.
        update_te_inserts_site_shift: ``--single-site-shift`` of the
            update step.
        estimate_polymorphism_min_qual: ``--min-map-qual`` of the
            polymorphism step.
        filter_min_count: ``--min-count`` of the filter step.
    """
    # NOTE(review): script_dir and out_dir are concatenated without a
    # separator -- presumably both end with "/"; confirm with the caller.
    def perl(script):
        return ["perl", script_dir + script]

    hierarchy_file = ["--te-hierarchy-file", taxon, "--te-hierarchy-level", "family"]

    insert_sites = out_dir + "te-fwd-rev.txt"
    poly_n = out_dir + "poly_n.gtf"
    crosslinked = out_dir + "te-inserts.txt"
    updated_inserts = out_dir + "te-insertions-updated.txt"
    te_polymorphism = out_dir + "te-polymorphism"
    filtered = out_dir + "te-poly-filtered.txt"

    mccutils.log("popoolationte", "identify-te-insertsites.pl")
    mccutils.run_command(
        perl("identify-te-insertsites.pl") + ["--input", sam] + hierarchy_file + [
            "--narrow-range", str(read_len),
            "--min-count", str(identify_min_count),
            "--min-map-qual", str(identify_min_qual),
            "--output", insert_sites,
            "--insert-distance", str(insert_size),
            "--read-length", str(read_len),
        ],
        log=log)

    mccutils.log("popoolationte", "genomic-N-2gtf.pl")
    mccutils.run_command_stdout(
        perl("genomic-N-2gtf.pl") + ["--input", reference], poly_n, log=log)

    mccutils.log("popoolationte", "crosslink-te-sites.pl")
    mccutils.run_command(
        perl("crosslink-te-sites.pl") + [
            "--directional-insertions", insert_sites,
            "--min-dist", str(read_len),
            "--max-dist", str(max_dist),
            "--output", crosslinked,
            "--single-site-shift", str(crosslink_site_shift),
            "--poly-n", poly_n,
            "--te-hierarchy", taxon,
            "--te-hier-level", "family",
        ],
        log=log)

    mccutils.log("popoolationte", "update-teinserts-with-knowntes.pl")
    mccutils.run_command(
        perl("update-teinserts-with-knowntes.pl") + [
            "--known", ref_inserts,
            "--output", updated_inserts,
        ] + hierarchy_file + [
            "--max-dist", str(max_dist),
            "--te-insertions", crosslinked,
            "--single-site-shift", str(update_te_inserts_site_shift),
        ],
        log=log)

    mccutils.log("popoolationte", "estimate-polymorphism.pl")
    mccutils.run_command(
        perl("estimate-polymorphism.pl") + [
            "--sam-file", sam,
            "--te-insert-file", updated_inserts,
        ] + hierarchy_file + [
            "--min-map-qual", str(estimate_polymorphism_min_qual),
            "--output", te_polymorphism,
        ],
        log=log)

    mccutils.log("popoolationte", "filter-teinserts.pl")
    mccutils.run_command(
        perl("filter-teinserts.pl") + [
            "--te-insertions", te_polymorphism,
            "--output", filtered,
            "--discard-overlapping",
            "--min-count", str(filter_min_count),
        ],
        log=log)
def main():
    """Stage the (optionally adapter-trimmed) FASTQ files for McClintock.

    When ``trimgalore`` is among the requested methods the reads are
    trimmed first. The resulting files are then decompressed (``zcat``)
    or copied (``cp``) to ``snakemake.output[0]`` / ``[1]``; for
    single-end runs (``fq2 == "None"``) the second output is an empty
    file. Trimmed intermediates are deleted. On any failure the outputs
    are removed and the process exits with status 1.
    """
    raw_fq1 = snakemake.input.fq1
    raw_fq2 = snakemake.params.fq2
    requested = snakemake.params.methods.split(",")
    cores = snakemake.threads
    out_root = snakemake.params.out
    run_id = snakemake.params.run_id
    log = snakemake.params.log

    def stage(source, destination):
        # decompress when the last extension mentions gz, otherwise copy
        if "gz" in source.split(".")[-1]:
            mccutils.run_command_stdout(["zcat", source], destination)
        else:
            mccutils.run_command(["cp", source, destination])

    mccutils.log("processing", "prepping reads for McClintock")
    try:
        # Trim adaptors of the input FASTQ file(s) when requested.
        reads1, reads2 = raw_fq1, raw_fq2
        if "trimgalore" in requested:
            mccutils.log("processing", "running trim_galore", log=log)
            if raw_fq2 == "None":
                reads1 = run_trim_galore(
                    raw_fq1, run_id, log, out_root,
                    cores=cores, flags=trimgalore.SINGLE_END_FLAGS)
            else:
                reads1, reads2 = run_trim_galore(
                    raw_fq1, run_id, log, out_root, fq2=raw_fq2,
                    cores=cores, flags=trimgalore.PAIRED_END_FLAGS)

        # Make unzipped copies in the McClintock input directory.
        stage(reads1, snakemake.output[0])
        if reads2 == "None":
            mccutils.run_command(["touch", snakemake.output[1]])
        else:
            stage(reads2, snakemake.output[1])

        # Remove trimmed read files left in the trimgalore directory.
        if reads1 != raw_fq1:
            mccutils.remove(reads1)
        if reads2 != raw_fq2:
            mccutils.remove(reads2)

    except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        print(
            "ERROR processing of FastQ files failed...check that your FastQ files are formatted correctly...Exiting...",
            file=sys.stderr)
        mccutils.remove(snakemake.output[0])
        mccutils.remove(snakemake.output[1])
        sys.exit(1)

    mccutils.log("processing", "read setup complete")
def run_popoolationte(sam, reference, taxon, read_len, insert_size, max_dist, ref_inserts, script_dir, out_dir, params, log=None):
    """Run the six PoPoolationTE Perl scripts in sequence.

    Order: identify-te-insertsites, genomic-N-2gtf, crosslink-te-sites,
    update-teinserts-with-knowntes, estimate-polymorphism,
    filter-teinserts. Apart from genomic-N-2gtf, every step appends the
    flag/value pairs found in ``params[<script name>]`` to its command.
    Nothing is returned; the final file is
    ``out_dir + "te-poly-filtered.txt"``.

    Args:
        sam: SAM file of mapped reads.
        reference: reference FASTA (input of genomic-N-2gtf).
        taxon: TE hierarchy file.
        read_len: read length, also used as narrow range and min distance.
        insert_size: value for ``--insert-distance``.
        max_dist: value for ``--max-dist``.
        ref_inserts: known TE insertions (``--known``).
        script_dir: prefix prepended to each script name.
        out_dir: prefix prepended to each output file name.
        params: mapping from script name to a mapping of extra
            command-line flags and their values.
        log: log destination forwarded to the command runner.
    """
    # NOTE(review): script_dir and out_dir are concatenated without a
    # separator -- presumably both end with "/"; confirm with the caller.
    def build(script, fixed_args):
        # fixed arguments first, then the user-configurable flag/value pairs
        command = ["perl", script_dir + script] + fixed_args
        for flag, value in params[script].items():
            command += [flag, str(value)]
        return command

    hierarchy_file = ["--te-hierarchy-file", taxon, "--te-hierarchy-level", "family"]

    mccutils.log("popoolationte", "identify-te-insertsites.pl")
    insert_sites = out_dir + "te-fwd-rev.txt"
    mccutils.run_command(
        build("identify-te-insertsites.pl", ["--input", sam] + hierarchy_file + [
            "--narrow-range", str(read_len),
            "--output", insert_sites,
            "--insert-distance", str(insert_size),
            "--read-length", str(read_len),
        ]),
        log=log)

    mccutils.log("popoolationte", "genomic-N-2gtf.pl")
    poly_n = out_dir + "poly_n.gtf"
    mccutils.run_command_stdout(
        ["perl", script_dir + "genomic-N-2gtf.pl", "--input", reference],
        poly_n, log=log)

    mccutils.log("popoolationte", "crosslink-te-sites.pl")
    crosslinked = out_dir + "te-inserts.txt"
    mccutils.run_command(
        build("crosslink-te-sites.pl", [
            "--directional-insertions", insert_sites,
            "--min-dist", str(read_len),
            "--max-dist", str(max_dist),
            "--output", crosslinked,
            "--poly-n", poly_n,
            "--te-hierarchy", taxon,
            "--te-hier-level", "family",
        ]),
        log=log)

    mccutils.log("popoolationte", "update-teinserts-with-knowntes.pl")
    updated_inserts = out_dir + "te-insertions-updated.txt"
    mccutils.run_command(
        build("update-teinserts-with-knowntes.pl", [
            "--known", ref_inserts,
            "--output", updated_inserts,
        ] + hierarchy_file + [
            "--max-dist", str(max_dist),
            "--te-insertions", crosslinked,
        ]),
        log=log)

    mccutils.log("popoolationte", "estimate-polymorphism.pl")
    te_polymorphism = out_dir + "te-polymorphism"
    mccutils.run_command(
        build("estimate-polymorphism.pl", [
            "--sam-file", sam,
            "--te-insert-file", updated_inserts,
        ] + hierarchy_file + [
            "--output", te_polymorphism,
        ]),
        log=log)

    mccutils.log("popoolationte", "filter-teinserts.pl")
    filtered = out_dir + "te-poly-filtered.txt"
    mccutils.run_command(
        build("filter-teinserts.pl", [
            "--te-insertions", te_polymorphism,
            "--output", filtered,
            "--discard-overlapping",
        ]),
        log=log)
def sam_to_bam(sam, bam, threads=1, log=None):
    """Convert ``sam`` to BAM at the given path with ``samtools view -Sb``.

    Args:
        sam: SAM file to convert.
        bam: destination BAM path.
        threads: value for samtools' ``-@`` option.
        log: log destination forwarded to the command runner.

    Returns:
        ``bam``, unchanged.
    """
    mccutils.log("popoolationte2", "converting SAM to BAM", log=log)
    view = ["samtools", "view", "-@", str(threads), "-Sb", sam]
    mccutils.run_command_stdout(view, bam, log=log)
    return bam
def make_nonredundant_bed(insertions, sample, out, log, acceptable_classes=("1p1",), frequency_theshold=0.1):
    """Write the non-redundant TEMP BED track.

    Reference insertions are always kept; other insertions must have a
    classification in ``acceptable_classes`` and a frequency strictly
    above ``frequency_theshold``. Kept insertions are grouped (reference:
    chromosome + start + end; otherwise chromosome + end) and the best
    supported member of each group is written. The BED is then sorted
    with ``bedtools sort`` and prefixed with a track header.

    Args:
        insertions: iterable of insertion objects exposing ``type``,
            ``classification``, ``frequency``, ``support``,
            ``chromosome``, ``start``, ``end``, ``name`` and ``direction``.
        sample: sample name used in file names and the track header.
        out: output directory.
        log: log destination forwarded to the command runner.
        acceptable_classes: classifications accepted for non-reference
            insertions (an immutable default replaces the former list).
        frequency_theshold: exclusive lower bound on the frequency of
            non-reference insertions.
    """
    unsorted_nonredundant_bed = out + "/" + sample + "_temp_unsorted_nonredundant.bed"

    # Collapse all inserts that share the same site and pass the thresholds.
    collapsed_insertions = {}
    for insert in insertions:
        is_reference = insert.type == "reference"
        if not (is_reference or (insert.classification in acceptable_classes
                                 and insert.frequency > frequency_theshold)):
            continue
        if is_reference:
            # Reference TEs are only redundant when both start and end match.
            key = insert.chromosome + "_" + str(insert.start) + "_" + str(insert.end)
        else:
            key = insert.chromosome + "_" + str(insert.end)
        collapsed_insertions.setdefault(key, []).append(insert)

    with open(unsorted_nonredundant_bed, "w") as bed:
        for candidates in collapsed_insertions.values():
            best = candidates[0]
            for insert in candidates[1:]:
                # "!" is a placeholder for "no support value". Comparing it
                # with a number would raise TypeError, so both sides are
                # checked; previously only the incumbent was.
                if (best.support != "!" and insert.support != "!"
                        and insert.support > best.support):
                    best = insert
            line = "\t".join([
                best.chromosome,
                str(best.start),
                str(best.end),
                best.name,
                "0",
                best.direction,
            ])
            bed.write(line + "\n")

    tmp_bed = out + "/" + sample + "_temp_nonredundant.bed.tmp"
    command = ["bedtools", "sort", "-i", unsorted_nonredundant_bed]
    mccutils.run_command_stdout(command, tmp_bed, log=log)

    nonredundant_bed = out + "/" + sample + "_temp_nonredundant.bed"
    with open(nonredundant_bed, "w") as outbed, open(tmp_bed, "r") as inbed:
        header = 'track name="%s_TEMP" description="%s_TEMP"' % (sample, sample)
        outbed.write(header + "\n")
        for line in inbed:
            outbed.write(line)

    mccutils.remove(tmp_bed)
    mccutils.remove(unsorted_nonredundant_bed)
def map_reads(ref, fq, outsam, threads=1, log=None):
    """Align one FASTQ file with ``bwa bwasw`` into ``outsam``.

    Args:
        ref: reference FASTA passed to bwa.
        fq: FASTQ file to align.
        outsam: destination SAM path.
        threads: value for bwa's ``-t`` option.
        log: log destination forwarded to the command runner.

    Returns:
        ``outsam``, unchanged.
    """
    mccutils.log("popoolationte2", "mapping reads", log=log)
    bwasw = ["bwa", "bwasw", "-t", str(threads), ref, fq]
    mccutils.run_command_stdout(bwasw, outsam, log=log)
    return outsam