def make_local_css_js_copies(css_dir, js_dir, out_dir):
    """Create the html/css/js folders under ``out_dir`` and copy assets in.

    Every entry of ``css_dir`` is copied with ``cp`` into ``out_dir/css/`` and
    every entry of ``js_dir`` into ``out_dir/js/``. ``out_dir/html/`` is only
    created here; nothing is copied into it.
    """
    mccutils.mkdir(out_dir + "/html/")

    # CSS is handled before JS, each into its own freshly created subfolder.
    for source_dir, subfolder in ((css_dir, "/css/"), (js_dir, "/js/")):
        destination = out_dir + subfolder
        mccutils.mkdir(destination)
        for asset in os.listdir(source_dir):
            mccutils.run_command(["cp", source_dir + "/" + asset, destination])
def make_consensus_beds(elements, ref_name, te_bed, taxon, out):
    """Split a TE BED file into one BED file per TE family.

    Args:
        elements: collection of family names to emit; families found in the
            taxonomy file but absent from it are ignored.
        ref_name: prefix used for every file written.
        te_bed: path to a tab-delimited BED file whose 4th column holds the
            TE instance name.
        taxon: path to a tab-delimited file; column 1 is the TE instance
            name, column 2 the family it belongs to.
        out: directory under which ``split_bed/`` is created.

    Returns:
        Path of the ``.locationlist`` file, which holds one
        ``family<TAB>bed path`` line per family written.
    """
    out_dir = out + "/split_bed/"
    mccutils.mkdir(out_dir)
    location_file = out_dir + ref_name + ".locationlist"

    # Characters replaced by "_" when a family name becomes a file name.
    # The table is built once and applied in a single pass per name.
    unsafe_chars = ';&()|*?[]~{}<!^"\'\\$/'
    sanitize = str.maketrans(unsafe_chars, "_" * len(unsafe_chars))

    # family -> set of member instance names (set gives O(1) membership
    # tests while scanning the BED file below).
    family_members = {}
    with open(taxon, "r") as taxonomy:
        for line in taxonomy:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing newline at EOF)
                continue
            fields = line.split("\t")
            family = fields[1].replace("\n", "")
            family_members.setdefault(family, set()).add(fields[0])

    with open(location_file, "w") as locations:
        for family, members in family_members.items():
            if family not in elements:
                continue

            bed_name = (out_dir + ref_name + "_" +
                        family.translate(sanitize) + ".bed")
            locations.write(family + "\t" + bed_name + "\n")

            with open(bed_name, "w") as outbed, open(te_bed, "r") as inbed:
                for line in inbed:
                    fields = line.split("\t")
                    # Strip the newline so a BED file with exactly four
                    # columns still matches; skip lines too short to carry
                    # a name column.
                    if len(fields) > 3 and fields[3].rstrip("\n") in members:
                        outbed.write(line)

    return location_file
def main():
    """Download the pinned ngs_te_mapper archive and install it under tools/.

    The archive is fetched (with md5 verification, up to three attempts),
    unzipped, and the contents of the extracted commit-named directory are
    moved into ``<install>/tools/ngs_te_mapper``. The zip file and the
    extracted directory are removed afterwards. Exits with status 1 when the
    download fails.
    """
    params = snakemake.params
    base_dir = snakemake.config['paths']['install']
    tools_dir = base_dir + "/tools/"
    raw_name = "ngs_te_mapper-f9f48996ac346ac86d57edbd00534aa1227b753e"
    extracted_dir = base_dir + raw_name      # where the unzipped tree is expected
    staged_dir = tools_dir + raw_name        # same tree after moving into tools/
    final_dir = tools_dir + "ngs_te_mapper"

    # start from a clean slate so a stale/partial archive is never reused
    mccutils.remove(params.zipfile)
    if not mccutils.download(params.url, params.zipfile, md5=params.md5,
                             max_attempts=3):
        print("ngs_te_mapper download failed... exiting...")
        print("try running --install with --clean for clean installation")
        sys.exit(1)

    mccutils.remove(extracted_dir)
    mccutils.run_command(["unzip", params.zipfile], log=params.log)

    mccutils.remove(staged_dir)
    mccutils.run_command(["mv", extracted_dir, tools_dir], log=params.log)

    # replace any previous installation with the freshly unpacked files
    mccutils.remove(final_dir)
    mccutils.mkdir(final_dir)
    for entry in os.listdir(staged_dir):
        mccutils.run_command(["mv", staged_dir + "/" + entry, final_dir],
                             log=params.log)

    mccutils.remove(staged_dir)
    mccutils.remove(params.zipfile)
def main():
    """Run the TE coverage analysis for one sample (snakemake script entry).

    Steps, in order: clear the coverage output folder, mask the reference
    with the consensus TEs, augment the masked genome with the TE sequences
    used for coverage, index, map reads, convert to BAM, compute the non-TE
    genome depth, build the per-TE depth table, draw the plots, and finally
    delete the SAM and BAM files.
    """
    run_args = snakemake.config['args']
    results_root = snakemake.config["args"]['out'] + "/results/"
    mccutils.mkdir(results_root)
    coverage_out = results_root + "coverage/"
    mccutils.mkdir(coverage_out)

    # ensures intermediate files from previous runs are removed
    for leftover in os.listdir(coverage_out):
        mccutils.remove(coverage_out + "/" + leftover)

    run_id = run_args['run_id']
    te_seqs = snakemake.input.consensus
    log = snakemake.params.log

    # always use consensus fasta for masking the genome
    mccutils.mkdir(coverage_out + "/input")
    masked_reference, masked_gff = repeatmask_genome(
        snakemake.input.ref, te_seqs, snakemake.threads, run_id,
        coverage_out, log)

    # uses coverage fasta (if exists) for augmenting and coverage analysis
    if snakemake.config['in']['coverage_fasta'] != "None":
        te_seqs = snakemake.input.coverage_fa

    augmented_reference = augment_genome(masked_reference, te_seqs, run_id,
                                         coverage_out)
    index_genome(snakemake.input.ref, log)
    index_genome(augmented_reference, log)

    # the second fastq is only passed along for paired-end runs
    paired_kwargs = {}
    if snakemake.config['in']['fq2'] != "None":
        paired_kwargs["fq2"] = snakemake.input.fq2
    sam = map_reads(augmented_reference, snakemake.input.fq1,
                    snakemake.threads, snakemake.params.sample, run_id,
                    coverage_out, log, **paired_kwargs)

    bam = sam_to_bam(sam, augmented_reference, snakemake.params.sample,
                     snakemake.threads, run_id, coverage_out, log)
    nonte_bed = make_nonte_bed(snakemake.input.ref, masked_gff, run_id,
                               coverage_out, log)
    genome_depth = get_genome_depth(nonte_bed, bam, run_id, coverage_out, log)

    # number of positions ignored at each TE edge when averaging depth
    if not config.OMIT_EDGES:
        edge_trim = 0
    elif config.OMIT_EDGES_READ_LENGTH:
        edge_trim = mccutils.estimate_read_length(snakemake.input.fq1)
    else:
        edge_trim = config.OMIT_EDGES_LENGTH

    depth_results = make_depth_table(te_seqs, bam, genome_depth, run_id,
                                     coverage_out, snakemake.output[0], log,
                                     trim_edges=edge_trim)
    te_names, all_coverage_files, uniq_coverage_files, avg_norm_te_depths = depth_results

    make_plots(te_names, all_coverage_files, uniq_coverage_files,
               avg_norm_te_depths, genome_depth, snakemake.params.sample,
               coverage_out, trim_edges=edge_trim)

    for intermediate in (sam, bam):
        mccutils.remove(intermediate)
def parse_args():
    """Parse and validate the McClintock command line.

    When ``--install`` is given, the dependencies are installed and the
    process exits with status 0 without looking at the other options.
    Otherwise the reference, consensus and first fastq are required; all
    supplied paths are converted with ``mccutils.get_abs_path``, defaults are
    filled in for ``--proc`` (1) and ``--out`` (current directory), the output
    directory is created, and the requested methods are lower-cased and
    checked against the methods valid for single- or paired-end input.

    Returns:
        The populated ``argparse.Namespace``; ``args.methods`` is a list.

    Exits with status 1 when the output directory cannot be created, an
    unknown method is requested, or a GFF is supplied without a taxonomy file.
    """
    # the main inputs are only mandatory for an actual run, not for --install
    needs_inputs = '--install' not in sys.argv

    parser = argparse.ArgumentParser(
        prog='McClintock',
        description="Meta-pipeline to identify transposable element insertions using next generation sequencing data")

    ## required ##
    parser.add_argument(
        "-r", "--reference", type=str,
        help="A reference genome sequence in fasta format",
        required=needs_inputs)
    parser.add_argument(
        "-c", "--consensus", type=str,
        help="The consensus sequences of the TEs for the species in fasta format",
        required=needs_inputs)
    parser.add_argument(
        "-1", "--first", type=str,
        help="The path of the first fastq file from paired end read sequencing or the fastq file from single read sequencing",
        required=needs_inputs)

    ## optional ##
    parser.add_argument(
        "-2", "--second", type=str,
        help="The path of the second fastq file from a paired end read sequencing",
        required=False)
    parser.add_argument(
        "-p", "--proc", type=int,
        help="The number of processors to use for parallel stages of the pipeline [default = 1]",
        required=False)
    parser.add_argument(
        "-o", "--out", type=str,
        help="An output folder for the run. [default = '.']",
        required=False)
    parser.add_argument(
        "-m", "--methods", type=str,
        help="A comma-delimited list containing the software you want the pipeline to use for analysis. e.g. '-m relocate,TEMP,ngs_te_mapper' will launch only those three methods",
        required=False)
    parser.add_argument(
        "-g", "--locations", type=str,
        help="The locations of known TEs in the reference genome in GFF 3 format. This must include a unique ID attribute for every entry",
        required=False)
    parser.add_argument(
        "-t", "--taxonomy", type=str,
        help="A tab delimited file with one entry per ID in the GFF file and two columns: the first containing the ID and the second containing the TE family it belongs to. The family should correspond to the names of the sequences in the consensus fasta file",
        required=False)
    parser.add_argument(
        "-s", "--coverage_fasta", type=str,
        help="A fasta file that will be used for TE-based coverage analysis, if not supplied then the consensus sequences of the TEs will be used for the analysis",
        required=False)
    # NOTE: the -d/--coverage, -D/--coverage_only, -b/--keep_bam and
    # -i/--remove_intermediate options are currently disabled.
    parser.add_argument(
        "-T", "--comments", action="store_true",
        help="If this option is specified then fastq comments (e.g. barcode) will be incorporated to SAM output. Warning: do not use this option if the input fastq files do not have comments",
        required=False)
    parser.add_argument(
        "-a", "--augment", type=str,
        help="A fasta file of TE sequences that will be included as extra chromosomes in the reference file (useful if the organism is known to have TEs that are not present in the reference strain)",
        required=False)
    parser.add_argument(
        "--clean", action="store_true",
        help="This option will make sure mcclintock runs from scratch and doesn't reuse files already created",
        required=False)
    parser.add_argument(
        "--install", action="store_true",
        help="This option will install the dependencies of mcclintock",
        required=False)
    parser.add_argument(
        "--debug", action="store_true",
        help="This option will allow snakemake to print progress to stdout",
        required=False)
    parser.add_argument(
        "--slow", action="store_true",
        help="This option runs without attempting to optimize thread usage to run rules concurrently. Each multithread rule will use the max processors designated by -p/--proc",
        required=False)

    args = parser.parse_args()

    # store_true flags (--debug, --comments, ...) default to False, so they
    # never need a None check.

    if args.install:
        mccutils.log("installation", "installing dependencies")
        mccutils.log("installation", "WARNING: this could take awhile")
        install(clean=args.clean, debug=args.debug)
        sys.exit(0)

    # check -r, -c, -1
    args.reference = mccutils.get_abs_path(args.reference)
    args.consensus = mccutils.get_abs_path(args.consensus)
    args.first = mccutils.get_abs_path(args.first)

    # check -2
    if args.second is not None:
        args.second = mccutils.get_abs_path(args.second)

    # check -p
    if args.proc is None:
        args.proc = 1

    # check -o
    args.out = os.path.abspath("." if args.out is None else args.out)
    try:
        mccutils.mkdir(args.out)
    except Exception:
        # top-level boundary: report the failure and stop the run
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        print("cannot create output directory: ", args.out, "exiting...",
              file=sys.stderr)
        sys.exit(1)

    # check -m
    # With a single fastq the data is treated as single-ended and only the
    # methods listed for single-end input are allowed.
    if args.second is None:
        valid_methods = config.SINGLE_END_METHODS  # from config.py
    else:
        valid_methods = config.ALL_METHODS  # from config.py

    if args.methods is None:
        # copy so later changes to args.methods cannot alter the config list
        args.methods = list(valid_methods)
    else:
        requested = args.methods.split(",")
        args.methods = []
        for method in requested:
            normalized = method.lower()
            if normalized not in valid_methods:
                sys.stderr.write(" ".join([
                    "Method:", method, "not a valid method...",
                    "Valid methods:", " ".join(valid_methods), "\n"
                ]))
                sys.exit(1)
            args.methods.append(normalized)

    # check -g
    if args.locations is not None:
        args.locations = mccutils.get_abs_path(args.locations)
        if args.taxonomy is None:
            sys.stderr.write(
                "If a GFF file is supplied (-g/--locations) then a TE taxonomy file that links it to the fasta consensus is also needed (-t/--taxonomy)...exiting...\n"
            )
            sys.exit(1)

    # check -t
    if args.taxonomy is not None:
        args.taxonomy = mccutils.get_abs_path(args.taxonomy)

    # check -s
    if args.coverage_fasta is not None:
        args.coverage_fasta = mccutils.get_abs_path(args.coverage_fasta)

    # check -a
    if args.augment is not None:
        args.augment = mccutils.get_abs_path(args.augment)

    return args
def make_run_config(args, sample_name, ref_name, full_command, current_directory):
    """Write the snakemake JSON config for this run and return its run id.

    A random 7-digit run id is drawn, the config/log directories are created
    under ``args.out``, the path templates from ``config`` and
    ``config_install`` have their placeholders filled in, and everything is
    dumped to ``<out>/snakemake/config/config_<run_id>.json``.

    NOTE(review): ``config.OUT_PATHS``, ``config.INTERMEDIATE_PATHS`` and
    ``config_install.ENV`` are updated in place (the module-level dicts
    themselves are modified) — confirm whether other code relies on that.

    Returns:
        The integer run id.
    """
    run_id = random.randint(1000000, 9999999)
    run_tag = str(run_id)

    config_dir = args.out + "/snakemake/config"
    mccutils.mkdir(args.out + "/snakemake")
    mccutils.mkdir(config_dir)
    run_config = config_dir + "/config_" + run_tag + ".json"

    input_dir = args.out + "/method_input/"
    results_dir = args.out + "/results/"

    def fill(paths, substitutions):
        # replace each placeholder, in order, in every value of `paths`
        for name in list(paths):
            value = paths[name]
            for placeholder, replacement in substitutions:
                value = value.replace(placeholder, replacement)
            paths[name] = value

    out_files = config.OUT_PATHS
    fill(out_files, ((config.INPUT_DIR, input_dir),
                     (config.RESULTS_DIR, results_dir),
                     (config.SAMPLE_NAME, sample_name)))
    out_files_to_make = [out_files[method] for method in args.methods]

    now = datetime.now()
    log_dir = (args.out + "/logs/" + now.strftime("%Y%m%d.%H%M%S") + "." +
               run_tag + "/")
    mccutils.mkdir(log_dir)

    # chromosome names of the reference, with special characters replaced
    chromosomes = [
        mccutils.replace_special_chars(str(record.id))
        for record in SeqIO.parse(args.reference, "fasta")
    ]

    script_dir = os.path.dirname(os.path.abspath(__file__))
    threads_per_rule = max(
        1,
        calculate_max_threads(args.proc, args.methods,
                              config.MULTI_THREAD_METHODS, slow=args.slow))

    data = {}
    data['args'] = {
        'proc': str(args.proc),
        'out': str(args.out),
        'log_dir': log_dir,
        'augment_fasta': str(args.augment),
        'mcc_path': script_dir,
        'sample_name': sample_name,
        'ref_name': ref_name,
        'run_id': run_tag,
        'methods': ",".join(args.methods),
        'out_files': ",".join(out_files_to_make),
        'save_comments': str(args.comments),
        'max_threads_per_rule': threads_per_rule,
        'full_command': full_command,
        'call_directory': current_directory,
        'time': now.strftime("%Y-%m-%d %H:%M:%S"),
        "chromosomes": ",".join(chromosomes)
    }

    # input paths for files
    data["in"] = {
        'reference': str(args.reference),
        'consensus': str(args.consensus),
        'fq1': str(args.first),
        'fq2': str(args.second),
        'locations': str(args.locations),
        'taxonomy': str(args.taxonomy),
        'coverage_fasta': str(args.coverage_fasta),
    }

    # where mcc copies will be stored
    data["mcc"] = config.INTERMEDIATE_PATHS
    fill(data["mcc"], ((config.INPUT_DIR, input_dir),
                       (config.REF_NAME, ref_name),
                       (config.SAMPLE_NAME, sample_name)))

    env_path = script_dir + "/install/envs/"
    data["envs"] = config_install.ENV
    fill(data["envs"], ((config_install.ENV_PATH, env_path),))

    with open(run_config, "w") as conf:
        json.dump(data, conf, indent=4)

    return run_id
def main():
    """Run the PopoolationTE2 steps for one sample (snakemake script entry).

    If the status file says the previous step failed, the expected output is
    simply touched. Otherwise the ppileup, subsample, signature, strand,
    frequency and pairup steps run in sequence. On success ``COMPLETED`` is
    written to the status log; on any exception the traceback is printed and
    appended to the log, ``FAILED`` is written, and the output is touched.
    """
    mccutils.log("popoolationte2", "running PopoolationTE2")

    inputs = snakemake.input
    params = snakemake.params
    ref_fasta = inputs.ref_fasta
    bam = inputs.bam
    taxonomy = inputs.taxonomy
    jar = params.jar
    out_dir = params.out_dir
    sample_name = params.sample_name  # read from params; not used below
    log = params.log
    status_log = params.status_log

    if not mccutils.check_status_file(status_log):
        # previous step failed: only create the placeholder output
        mccutils.run_command(["touch", snakemake.output[0]])
        return

    try:
        tmp_dir = out_dir + "/tmp"
        mccutils.mkdir(tmp_dir)
        taxonomy = format_taxonomy(taxonomy, out_dir)

        ppileup = popoolationte2_ppileup(
            jar, config.PARAMS["ppileup"], bam, taxonomy, out_dir, log=log)
        ppileup = popoolationte2_subsample(
            jar, config.PARAMS["subsampleppileup"], ppileup, out_dir, log=log)

        signatures = popoolationte2_signatures(
            jar, config.PARAMS["identifySignatures"], ppileup, out_dir,
            log=log)
        signatures = popoolationte2_strand(
            jar, config.PARAMS["updateStrand"], signatures, bam, taxonomy,
            out_dir, log=log)
        signatures = popoolationte2_frequency(
            jar, ppileup, signatures, out_dir, log=log)

        te_insertions = popoolationte2_pairup(
            jar, config.PARAMS["pairupSignatures"], signatures, ref_fasta,
            taxonomy, out_dir, log=log)

        mccutils.remove(tmp_dir)
        mccutils.check_file_exists(snakemake.output[0])

        with open(status_log, "w") as status:
            status.write("COMPLETED\n")
        mccutils.log("popoolationte2", "popoolationte2 run complete")

    except Exception:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log, "a") as log_handle:
            print(track, file=log_handle)
        mccutils.log("popoolationte2", "popoolationte2 run failed")
        with open(status_log, "w") as status:
            status.write("FAILED\n")

        # leave an (empty) output behind after the failure
        mccutils.run_command(["touch", snakemake.output[0]])
def install(clean=False, debug=False):
    """Install the conda environments and scripts for every method.

    Writes ``install/config.json`` next to this file, optionally wipes the
    existing conda environments and tools (``clean=True``), then invokes
    snakemake twice per installable method (environment creation, then the
    install target) and once more for the processing environment.

    Args:
        clean: remove existing conda envs and installed tools first.
        debug: when False, ``--quiet`` is added to every snakemake call.

    NOTE(review): ``config_install.ENV`` and ``config_install.OUTPUT`` are
    updated in place while the placeholders are filled in.
    """
    mcc_path = os.path.dirname(os.path.abspath(__file__))
    install_path = mcc_path + "/install/"
    install_config = install_path + "/config.json"
    log_dir = install_path + "/log/"
    conda_env_dir = install_path + "/envs/conda"

    data = {
        'paths': {
            'mcc_path': mcc_path,
            'install': install_path,
            'log_dir': log_dir
        },
        'URLs': config_install.URL,
        'MD5s': config_install.MD5,
        'ENVs': config_install.ENV,
        'output': config_install.OUTPUT,
    }

    env_paths = data['ENVs']
    for method in env_paths.keys():
        env_paths[method] = env_paths[method].replace(
            config_install.ENV_PATH, install_path + "envs/")

    targets = data['output']
    for method in targets.keys():
        targets[method] = targets[method].replace(
            config_install.INSTALL_PATH, install_path)

    with open(install_config, "w") as c:
        json.dump(data, c, indent=4)

    old_log = install_path + "install.log"
    if os.path.exists(old_log):
        os.remove(old_log)

    # removes installed tools and conda environments
    if clean:
        mccutils.log("install", "Removing conda envs from: " + conda_env_dir)
        mccutils.log(
            "install",
            "Removing installed tools from: " + install_path + "tools")
        mccutils.remove(conda_env_dir)
        mccutils.remove(install_path + "/tools")

    mccutils.mkdir(conda_env_dir)
    os.chdir(install_path)
    mccutils.mkdir(log_dir)

    def run_snakemake(target, envs_only):
        # build and run one snakemake invocation for `target`
        command = [
            "snakemake", "--use-conda", "--conda-prefix", conda_env_dir,
            "--configfile", install_config, "--cores", "1", "--nolock"
        ]
        if envs_only:
            command.append("--create-envs-only")
        command.append(target)
        if not debug:
            command.append("--quiet")
        mccutils.run_command(command)

    for env in config.ALL_METHODS:
        if env in config.NO_INSTALL_METHODS:
            continue

        mccutils.log("install", "Installing conda environment for: " + env)
        run_snakemake(data['output'][env], True)

        mccutils.log("install", "Installing scripts for:" + env)
        run_snakemake(data['output'][env], False)

    mccutils.log("install", "Installing conda environment for processing steps")
    run_snakemake(data['output']['processing'], True)
def main():
    """Download, unpack, patch and install the pinned RelocaTE release.

    The archive is fetched (md5-verified, up to three attempts), unzipped,
    and its contents moved into ``<install>/tools/relocate``. The supplied
    patch is applied to ``relocaTE_insertionFinder.pl``, every ``.pl`` script
    is rewritten line by line (see below), and the archive md5 is written to
    ``version.log``. Exits with status 1 when the download fails.
    """
    params = snakemake.params
    base_dir = snakemake.config['paths']['install']
    install_path = base_dir + "/tools/"
    raw_name = "RelocaTE-ce3a2066e15f5c14e2887fdf8dce0485e1750e5b"
    scripts_dir = install_path + "relocate/scripts/"

    mccutils.remove(params.zipfile)
    download_success = mccutils.download(params.url, params.zipfile,
                                         md5=params.md5, max_attempts=3)
    if not download_success:
        print("relocaTE download failed... exiting...")
        print("try running --install with --clean for clean installation")
        sys.exit(1)

    mccutils.remove(base_dir + raw_name)
    mccutils.run_command(["unzip", params.zipfile], log=params.log)

    mccutils.remove(install_path + raw_name)
    mccutils.run_command(["mv", base_dir + raw_name, install_path],
                         log=params.log)

    # replace any previous installation with the freshly unpacked files
    mccutils.remove(install_path + "relocate")
    mccutils.mkdir(install_path + "relocate/")
    for f in os.listdir(install_path + raw_name):
        mccutils.run_command(
            ["mv", install_path + raw_name + "/" + f,
             install_path + "relocate/"],
            log=params.log)

    mccutils.run_command(
        ["patch", "-i", params.patch,
         scripts_dir + "relocaTE_insertionFinder.pl"],
        log=params.log)

    mccutils.remove(install_path + raw_name)
    mccutils.remove(params.zipfile)

    # The previous `which perl` subprocess was dropped: its result was never
    # used (the shebang is rewritten to `/usr/bin/env perl`) and the child
    # process and its pipes were never cleaned up.

    tmp_script = install_path + "tmp"
    for f in os.listdir(scripts_dir):
        if "pl" != f.split(".")[-1]:
            continue

        with open(tmp_script, "w") as tmp, open(scripts_dir + f, "r") as script:
            for line in script:
                if "#!/usr/bin/perl" in line:
                    # use whichever perl is first on PATH
                    line = "#!/usr/bin/env perl\n"
                elif "defined @" in line:
                    # drop `defined` in front of arrays
                    line = line.replace("defined @", "@")
                elif ("$scripts/" in line and "perl" not in line
                      and "relocaTE.pl" in f):
                    # have relocaTE.pl launch its helper scripts through perl
                    line = line.replace("$scripts/", "perl $scripts/")
                tmp.write(line)

        mccutils.run_command(["mv", tmp_script, scripts_dir + f])

    # write version to file
    with open(base_dir + "/tools/relocate/version.log", "w") as version:
        version.write(params.md5)
def main():
    """Run TE-Locate for one sample (snakemake script entry).

    Clears the output directory, symlinks the SAM file into ``<out>/sam/``,
    changes into the TE-Locate script's directory and runs it with the
    configured parameters. The raw ``.info`` result is then copied to
    ``te-locate-raw.info``. ``COMPLETED`` or ``FAILED`` is written to the
    status log; on failure the traceback is logged and the expected output
    is touched.
    """
    inputs = snakemake.input
    params = snakemake.params

    te_gff = inputs.te_gff
    sam = inputs.sam
    ref_fasta = inputs.ref
    median_insert_size_file = inputs.median_insert_size
    log = params.log
    status_log = params.status_log

    mccutils.log("te-locate", "running TE-Locate", log=log)
    with open(log, "a") as log_handle:
        log_handle.write("TE GFF: " + te_gff + "\n")
        log_handle.write("SAM: " + sam + "\n")
        log_handle.write("reference fasta: " + ref_fasta + "\n")

    telocate = params.run_script
    out_dir = params.out_dir

    try:
        # ensures intermediate files from previous runs are removed
        for leftover in os.listdir(out_dir):
            mccutils.remove(out_dir + "/" + leftover)

        sam_dir = out_dir + "/sam/"
        mccutils.mkdir(sam_dir)
        te_locate_sam = sam_dir + "te-locate.sam"
        if os.path.exists(te_locate_sam):
            os.remove(te_locate_sam)
        os.symlink(sam, te_locate_sam)

        # the script is launched from its own directory
        os.chdir(os.path.dirname(telocate))

        median_insert_size = mccutils.get_median_insert_size(
            median_insert_size_file)
        distance = (median_insert_size * config.PARAMS["min_distance"])

        settings = config.PARAMS
        command = ["perl", telocate, str(settings["max_mem"]), sam_dir,
                   te_gff, ref_fasta, out_dir, str(distance)]
        command.append(str(settings["min_support_reads"]))
        command.append(str(settings["min_support_individuals"]))
        mccutils.run_command(command, log=log)

        # NOTE(review): the "_reads3_acc1" suffix is hard-coded; presumably it
        # mirrors min_support_reads / min_support_individuals — confirm.
        raw_info = out_dir + "_" + str(distance) + "_reads3_acc1.info"
        mccutils.check_file_exists(raw_info)
        mccutils.run_command(["cp", raw_info, out_dir + "te-locate-raw.info"])

        mccutils.log("te-locate", "TE-Locate complete")
        with open(status_log, "w") as status:
            status.write("COMPLETED\n")

    except Exception:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log, "a") as log_handle:
            print(track, file=log_handle)
        mccutils.log("telocate", "TE-locate run failed")
        with open(status_log, "w") as status:
            status.write("FAILED\n")

        # leave an (empty) output behind after the failure
        mccutils.run_command(["touch", snakemake.output[0]])
def main():
    """Run RelocaTE for one sample (snakemake script entry).

    Inputs are symlinked into ``<out_dir>/input/`` under fixed names,
    ``relocaTE.pl`` is run from inside ``out_dir`` with the options from
    ``config.RELOCATE``, the per-TE GFFs are combined into the output GFF,
    and the fastq links plus any ``te_containing_fq`` folders are removed.
    """
    params = snakemake.params
    sample_name = params.sample_name
    log = params.log
    is_paired = params.raw_fq2 != "None"
    script_dir = params.script_dir
    out_dir = params.out_dir
    out_gff = snakemake.output[0]

    mccutils.log("relocate", "running RelocaTE", log=log)

    # fresh input folder holding symlinks to every input file
    input_dir = snakemake.params.out_dir + "/input/"
    mccutils.remove(input_dir)
    mccutils.mkdir(input_dir)
    fq_dir = input_dir + "fastq/"
    mccutils.mkdir(fq_dir)

    consensus_fasta = input_dir + "consensus.fasta"
    te_gff = input_dir + "te.gff"
    reference_fasta = input_dir + "reference.fasta"
    os.symlink(snakemake.input.consensus_fasta, consensus_fasta)
    os.symlink(snakemake.input.te_gff, te_gff)
    os.symlink(snakemake.input.reference_fasta, reference_fasta)

    # fastq link names carry the suffixes handed to relocaTE.pl below
    if is_paired:
        os.symlink(snakemake.input.fq1, fq_dir + sample_name + "_1.fq")
        os.symlink(snakemake.input.fq2, fq_dir + sample_name + "_2.fq")
        read_flags = ["-1", "_1", "-2", "_2"]
    else:
        os.symlink(snakemake.input.fq1, fq_dir + sample_name + ".unPaired.fq")
        read_flags = ["-u", "unPaired"]

    annotation = make_annotation_file(te_gff, out_dir)

    # relocaTE.pl writes to "." so it has to run from the output directory
    os.chdir(out_dir)

    command = [
        "perl", script_dir + "/relocaTE.pl",
        "-t", consensus_fasta,
        "-d", fq_dir,
        "-g", reference_fasta,
        "-o", ".",
        "-r", annotation,
    ]
    for option in ("l", "m", "bm", "bt", "f"):
        command += ["-" + option, str(config.RELOCATE[option])]
    command += read_flags

    mccutils.run_command(command, log=log)
    combine_gffs(out_dir, out_gff)

    mccutils.remove(out_dir + "/input/fastq")
    # entries are checked relative to the current directory (out_dir)
    for entry in os.listdir(out_dir):
        fq_subdir = entry + "/te_containing_fq/"
        if os.path.exists(fq_subdir):
            mccutils.remove(fq_subdir)

    mccutils.log("relocate", "RelocaTE run complete")
def make_depth_table(te_fasta, bam, genome_depth, run_id, out, depth_csv, log, trim_edges=0):
    """Compute per-TE depth, write coverage files and the summary CSV.

    For every header line in ``te_fasta`` two ``samtools depth`` runs are
    made against ``bam`` (``-Q 1`` -> ``<name>.highQ.cov`` and ``-Q 0`` ->
    ``<name>.allQ.cov``), each is also written in a form divided by
    ``genome_depth`` (``*.normalized.cov``), and one row with the normalized
    average depths is added to ``depth_csv``.

    Args:
        te_fasta: fasta file whose header lines give the TE names.
        bam: alignment file handed to ``samtools depth``.
        genome_depth: value every depth is divided by.
        run_id: accepted for interface compatibility; not used here.
        out: directory under which ``te-depth-files/`` is created.
        depth_csv: path of the CSV table to (over)write.
        log: log file passed to the command runners.
        trim_edges: forwarded to ``get_avg_depth``.

    Returns:
        ``(te_names, all_coverage_files, uniq_coverage_files,
        avg_norm_depths)`` — parallel lists, one entry per TE, where the
        depths are the normalized all-quality averages.
    """
    mccutils.log("coverage", "creating TE depth coverage table", log=log)

    depth_dir = out + "/te-depth-files"
    # created once up front instead of once per TE
    mccutils.mkdir(depth_dir)

    def run_depth(te_name, min_mapq, cov_file):
        # per-position depth for one TE at the given minimum mapping quality
        command = [
            "samtools", "depth", "-aa", "-r", te_name, bam, "-d", "0", "-Q",
            min_mapq
        ]
        mccutils.run_command_stdout(command, cov_file, log=log)

    def write_normalized(cov_file, normalized_file):
        # rewrite a samtools depth file with depths divided by genome_depth
        chrom, positions, coverages = read_samtools_depth_file(cov_file)
        with open(normalized_file, "w") as covfile:
            for pos, depth in zip(positions, coverages):
                cov = str(round(depth / genome_depth, 2))
                covfile.write("\t".join([chrom, str(pos), cov]) + "\n")

    te_names = []
    uniq_coverage_files = []
    all_coverage_files = []
    avg_norm_depths = []

    # the table stays open for the whole run instead of being reopened in
    # append mode for every TE
    with open(depth_csv, "w") as table, open(te_fasta, "r") as fa:
        table.write("TE-Family,Normalized-Depth,Normalized-Unique-Depth" + "\n")

        for line in fa:
            if not line.startswith(">"):
                continue

            te_name = line.replace("\n", "").replace(">", "")
            prefix = depth_dir + "/" + te_name

            highQ = prefix + ".highQ.cov"
            run_depth(te_name, "1", highQ)
            allQ = prefix + ".allQ.cov"
            run_depth(te_name, "0", allQ)

            # make normalized coverage files
            write_normalized(allQ, prefix + ".allQ.normalized.cov")
            write_normalized(highQ, prefix + ".highQ.normalized.cov")

            avg_norm_depth = (get_avg_depth(allQ, trim_edges=trim_edges)
                              / genome_depth)
            avg_uniq_norm_depth = (get_avg_depth(highQ, trim_edges=trim_edges)
                                   / genome_depth)

            table.write(te_name + "," + str(round(avg_norm_depth, 2)) + "," +
                        str(round(avg_uniq_norm_depth, 2)) + "\n")

            te_names.append(te_name)
            uniq_coverage_files.append(highQ)
            all_coverage_files.append(allQ)
            avg_norm_depths.append(avg_norm_depth)

    return te_names, all_coverage_files, uniq_coverage_files, avg_norm_depths
def install(methods, resume=False, debug=False):
    """Install (or reinstall) the given methods via snakemake.

    Writes ``install/config.json`` next to this file, then for every method
    removes its existing conda env and tool directory (unless ``resume`` is
    set) and runs the snakemake install target for it.

    Args:
        methods: names of the methods to install. The list is not modified;
            ``te-locate`` is added internally when ``temp`` is requested.
        resume: keep existing conda envs and installed tools.
        debug: when False, ``--quiet`` is added to the snakemake call.

    NOTE(review): ``config_install.ENV`` and ``config_install.OUTPUT`` are
    updated in place while the placeholders are filled in.
    """
    mcc_path = os.path.dirname(os.path.abspath(__file__))
    install_path = mcc_path+"/install/"
    install_config = install_path+"/config.json"
    log_dir = install_path+"/log/"
    conda_env_dir = install_path+"/envs/conda"

    data = {}
    data['paths'] = {
        'mcc_path': mcc_path,
        'install' : install_path,
        'log_dir': log_dir
    }
    data['URLs'] = config_install.URL
    data['MD5s'] = config_install.MD5
    data['ENVs'] = config_install.ENV
    data['output'] = config_install.OUTPUT

    for method in data['ENVs'].keys():
        data['ENVs'][method] = data['ENVs'][method].replace(config_install.ENV_PATH, install_path+"envs/")

    for method in data['output'].keys():
        data['output'][method] = data['output'][method].replace(config_install.INSTALL_PATH, install_path)

    with open(install_config,"w") as c:
        json.dump(data, c, indent=4)

    if os.path.exists(install_path+"install.log"):
        os.remove(install_path+"install.log")

    # finding existing conda yamls
    existing_envs = get_conda_envs(conda_env_dir)

    mccutils.mkdir(conda_env_dir)
    os.chdir(install_path)
    mccutils.mkdir(log_dir)

    # work on a copy so the caller's list is never modified
    methods = list(methods)

    # temp requires te-locate scripts to make taxonomy file
    if "temp" in methods and "te-locate" not in methods:
        methods.append("te-locate")

    for env in methods:
        if not resume:
            # remove existing envs (the yaml and the directory named after it)
            if env in existing_envs:
                mccutils.log("install","Removing existing conda env for: "+env)
                mccutils.remove(existing_envs[env])
                mccutils.remove(existing_envs[env].replace(".yaml",""))

            # remove existing src code
            tool_dir = install_path+"/tools/"+env
            if os.path.exists(tool_dir):
                mccutils.log("install","Removing existing installation of: "+env)
                print(tool_dir)
                mccutils.remove(tool_dir)

        # reinstall src code
        mccutils.log("install","Installing scripts for:"+env)
        command = [
            "snakemake", "--use-conda", "--conda-prefix", conda_env_dir,
            "--configfile", install_config, "--cores", "1", "--nolock",
            data['output'][env]
        ]
        if not debug:
            command.append("--quiet")
        mccutils.run_command(command)
def main():
    """Run RelocaTE for one sample and record the outcome in a status log.

    The output directory is emptied, inputs are symlinked into
    ``<out_dir>/input/`` (fastq links get a random 5-digit id in their name),
    ``relocaTE.pl`` is run from inside ``out_dir`` with every entry of
    ``config.PARAMS`` appended as ``key value``, and the GFFs are combined.
    ``COMPLETED`` or ``FAILED`` is written to the status log; on failure the
    traceback is logged and the expected output is touched.
    """
    params = snakemake.params
    sample_name = params.sample_name
    log = params.log
    is_paired = params.raw_fq2 != "None"
    script_dir = params.script_dir
    out_dir = params.out_dir
    status_log = params.status_log
    out_gff = snakemake.output[0]

    try:
        # ensures intermediate files from previous runs are removed
        for leftover in os.listdir(out_dir):
            mccutils.remove(out_dir + "/" + leftover)

        mccutils.log("relocate", "running RelocaTE", log=log)

        input_dir = snakemake.params.out_dir + "/input/"
        mccutils.remove(input_dir)
        mccutils.mkdir(input_dir)
        fq_dir = input_dir + "fastq/"
        mccutils.mkdir(fq_dir)

        consensus_fasta = input_dir + "consensus.fasta"
        te_gff = input_dir + "te.gff"
        reference_fasta = input_dir + "reference.fasta"

        # draw ids until one is found that does not appear in the fastq path
        while True:
            uniq_id = str(random.randint(10000, 99999))
            if uniq_id not in fq_dir:
                break
            mccutils.log("relocate",
                         "unique id: " + uniq_id +
                         " occurs in file path... selecting a new one...",
                         log=log)

        fq1_uniq_id = uniq_id + "_mcc_relocate_1"
        fq2_uniq_id = uniq_id + "_mcc_relocate_2"
        unpaired_id = uniq_id + "_unPaired"

        os.symlink(snakemake.input.consensus_fasta, consensus_fasta)
        os.symlink(snakemake.input.te_gff, te_gff)
        os.symlink(snakemake.input.reference_fasta, reference_fasta)

        link_prefix = fq_dir + sample_name + "."
        if is_paired:
            os.symlink(snakemake.input.fq1, link_prefix + fq1_uniq_id + ".fq")
            os.symlink(snakemake.input.fq2, link_prefix + fq2_uniq_id + ".fq")
        else:
            os.symlink(snakemake.input.fq1, link_prefix + unpaired_id + ".fq")

        annotation = make_annotation_file(te_gff, out_dir)

        # relocaTE.pl writes to "." so it has to run from the output directory
        os.chdir(out_dir)

        command = [
            "perl", script_dir + "/relocaTE.pl",
            "-t", consensus_fasta,
            "-d", fq_dir,
            "-g", reference_fasta,
            "-o", ".",
            "-r", annotation,
        ]
        for option, value in config.PARAMS.items():
            command += [option, str(value)]

        if is_paired:
            command += ["-1", fq1_uniq_id, "-2", fq2_uniq_id]
        else:
            command += ["-u", unpaired_id]

        mccutils.run_command(command, log=log)
        combine_gffs(out_dir, out_gff)
        mccutils.check_file_exists(out_gff)

        mccutils.log("relocate", "RelocaTE run complete")
        with open(status_log, "w") as status:
            status.write("COMPLETED\n")

    except Exception:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log, "a") as log_handle:
            print(track, file=log_handle)
        mccutils.log("relocate", "RelocaTE run failed")
        with open(status_log, "w") as status:
            status.write("FAILED\n")

        # leave an (empty) output behind after the failure
        mccutils.run_command(["touch", snakemake.output[0]])
def main():
    """Run RelocaTE2 for one sample from inside a Snakemake rule.

    Reads its inputs, parameters and thread count from the global
    ``snakemake`` object. The output directory is emptied, the reference,
    consensus, RepeatMasker output and fastq files are symlinked into
    ``<out_dir>/input/``, and ``relocaTE2.py`` (located on ``PATH`` with
    ``which``) is run under ``python2`` with the settings in
    ``config.RELOCATE2``.
    """
    sample_name = snakemake.params.sample_name
    threads = snakemake.threads
    out_dir = snakemake.params.out_dir
    median_insert_size_file = snakemake.input.median_insert_size
    log = snakemake.params.log

    # ensures intermediate files from previous runs are removed
    for f in os.listdir(out_dir):
        mccutils.remove(out_dir + "/" + f)

    # a second fastq of "None" (string) marks a single-end run
    is_paired = snakemake.params.raw_fq2 != "None"

    input_dir = snakemake.params.out_dir + "/input/"
    mccutils.remove(input_dir)
    mccutils.mkdir(input_dir)
    fq_dir = snakemake.params.out_dir + "/input/fastq/"
    mccutils.mkdir(fq_dir)

    reference = input_dir + "reference.fasta"
    te_seqs = input_dir + "consensus.fasta"
    rm_out = input_dir + "repeatmasker.out"

    os.symlink(snakemake.input.reference, reference)
    os.symlink(snakemake.input.te_seqs, te_seqs)
    os.symlink(snakemake.input.rm_out, rm_out)

    # the fastq suffixes below must match the -1/-2/-u values passed later
    if is_paired:
        fq1 = fq_dir + sample_name + "_1.fq"
        fq2 = fq_dir + sample_name + "_2.fq"
        os.symlink(snakemake.input.fq1, fq1)
        os.symlink(snakemake.input.fq2, fq2)
    else:
        fq1 = fq_dir + sample_name + ".unPaired.fq"
        os.symlink(snakemake.input.fq1, fq1)

    median_insert_size = get_median_insert_size(median_insert_size_file)

    # subprocess.run waits for the child and closes both pipes; the previous
    # bare Popen + stdout.read() never reaped the process or closed stderr.
    which = subprocess.run(["which", "relocaTE2.py"],
                           stdout=subprocess.PIPE,
                           stderr=subprocess.PIPE)
    # empty string when `which` finds nothing
    script = which.stdout.decode().replace("\n", "")

    mccutils.log("relocate2", "running RelocaTE2", log=log)
    command = [
        "python2", script,
        "-t", te_seqs,
        "-g", reference,
        "-r", rm_out,
        "-o", out_dir,
        "-s", str(median_insert_size),
        "--run",
        "-v", "4",
        "-c", str(threads),
        "--aligner", config.RELOCATE2["aligner"],
        "--len_cut_match", str(config.RELOCATE2["len_cut_match"]),
        "--len_cut_trim", str(config.RELOCATE2["len_cut_trim"]),
        "--mismatch", str(config.RELOCATE2["mismatch"]),
        "--mismatch_junction", str(config.RELOCATE2["mismatch_junction"]),
        "-d", fq_dir,
    ]

    if is_paired:
        command += ["-1", "_1", "-2", "_2"]
    else:
        command += ["-u", ".unPaired"]

    mccutils.run_command(command, log=log)
    mccutils.log("relocate2", "RelocaTE2 run complete")
def run_workflow(args, sample_name, ref_name, run_id, debug=False, annotations_only=False):
    """Assemble and launch the snakemake command for one McClintock run.

    Placeholders in ``config.OUT_PATHS`` are substituted in place, the
    Snakefile is copied into ``<out>/snakemake/<run_id>`` (which becomes the
    working directory) and snakemake is run for the requested targets.

    Without ``args.resume`` the run aborts when the reference or sample
    directory already holds files. With ``args.resume`` every earlier config
    in ``<out>/snakemake/config/`` is compared against this run's config via
    ``config_compatibility``; the run aborts when one is incompatible or when
    no earlier config exists.

    If running snakemake raises, the traceback and a pointer to the issue
    tracker are printed to stderr and the process exits with status 1.
    Afterwards the sample ``tmp`` directory is removed and
    ``remove_intermediate_files`` is called.
    """
    input_dir = args.out
    reference_dir = args.out + "/" + ref_name + "/"
    sample_dir = args.out + "/" + sample_name + "/"
    results_dir = sample_dir + "results/"
    config_dir = "/snakemake/config/"
    run_config_name = "config_" + str(run_id) + ".json"
    run_config = args.out + config_dir + run_config_name

    # fill the placeholders of the shared output-path table (mutated in place)
    substitutions = (
        (config.INPUT_DIR, input_dir),
        (config.REF_DIR, reference_dir),
        (config.SAM_DIR, sample_dir),
        (config.RESULTS_DIR, results_dir),
        (config.SAMPLE_NAME, sample_name),
    )
    out_files = config.OUT_PATHS
    for key in out_files:
        resolved = out_files[key]
        for placeholder, value in substitutions:
            resolved = resolved.replace(placeholder, value)
        out_files[key] = resolved

    # resolved before the chdir below
    path = os.path.dirname(os.path.abspath(__file__))
    mccutils.mkdir(args.out + "/snakemake")
    snakemake_path = args.out + "/snakemake/" + str(run_id)
    mccutils.mkdir(snakemake_path)
    mccutils.run_command(["cp", path + "/Snakefile", snakemake_path])
    os.chdir(snakemake_path)

    command = [
        "snakemake", "--use-conda",
        "--conda-prefix", path + "/install/envs/conda",
    ]
    command.append("--reason" if debug else "--quiet")
    command += ["--configfile", run_config]
    command += ["--cores", str(args.proc)]

    if args.resume:
        # check that previous runs are compatible
        mccutils.log(
            "setup",
            "Checking config files to ensure previous intermediate files are compatible with this run"
        )
        config_found = False
        for prev_config in os.listdir(input_dir + config_dir):
            if prev_config == run_config_name:
                continue
            config_found = True
            if not config_compatibility(run_config, args.out + config_dir + prev_config):
                sys.exit(1)

        if not config_found:
            sys.exit(
                "ERROR: Unable to resume run. No config files from previous runs found in:"
                + input_dir
                + "/snakemake/config/ Remove --resume for clean run\n")
    else:
        # a fresh run must not write into populated directories
        if os.path.exists(reference_dir) and os.listdir(reference_dir):
            sys.exit(
                "ERROR: output directory:" + reference_dir +
                " is not empty. If wanting to resume a previous run, use --resume, otherwise please delete this directory or change your -o/--output\n"
            )
        if os.path.exists(sample_dir) and os.listdir(sample_dir):
            sys.exit(
                "ERROR: output directory:" + sample_dir +
                " is not empty. If wanting to resume a previous run, use --resume, otherwise please delete this directory or change your -o/--output or --sample_name\n"
            )

    # snakemake targets
    if annotations_only:
        command += [
            reference_dir + "reference_te_locations/inrefTEs.gff",
            reference_dir + "te_taxonomy/taxonomy.tsv",
        ]
    else:
        command.extend(out_files[method] for method in args.methods)
        command.append(sample_dir + "results/summary/data/run/summary_report.txt")

    try:
        sys.stdout.flush()
        mccutils.mkdir(sample_dir)
        mccutils.mkdir(sample_dir + "tmp")
        mccutils.run_command(command)
    except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        print(
            "McClintock Pipeline Failed... please open an issue at https://github.com/bergmanlab/mcclintock/issues if you are having trouble using McClintock",
            file=sys.stderr)
        sys.exit(1)

    mccutils.remove(sample_dir + "tmp")
    remove_intermediate_files(args.keep_intermediate, run_config,
                              args.methods, ref_name, sample_name, args.out)
def install(methods, resume=False, debug=False):
    """Install the conda environments and tool scripts McClintock needs.

    Writes ``install/config.json`` next to this file, then drives snakemake
    once per method to create its conda environment and once more to install
    its scripts. The environments for the ``setup_reads`` and ``processing``
    steps are created afterwards.

    Args:
        methods: names of the methods to install. The sequence is copied and
            never modified; ``te-locate`` is added to the copy when ``temp``
            is requested without it.
        resume: when False (default) the existing conda environments and the
            ``tools`` directory are removed first for a clean installation.
        debug: when False (default) snakemake is run with ``--quiet``.
    """
    # Work on a copy: appending "te-locate" below used to mutate the caller's
    # list, which may be a shared configuration list.
    methods = list(methods)

    mcc_path = os.path.dirname(os.path.abspath(__file__))
    install_path = mcc_path + "/install/"
    install_config = install_path + "/config.json"
    log_dir = install_path + "/log/"
    conda_env_dir = install_path + "/envs/conda"

    def _snakemake_command(target, envs_only):
        # Build one snakemake invocation; envs_only selects the
        # "create conda environments only" variant (mamba frontend).
        command = ["snakemake", "--use-conda"]
        if envs_only:
            command += ["--conda-frontend", "mamba"]
        command += [
            "--conda-prefix", conda_env_dir,
            "--configfile", install_config,
            "--cores", "1",
            "--nolock",
        ]
        if envs_only:
            command.append("--conda-create-envs-only")
        command.append(target)
        if not debug:
            command.append("--quiet")
        return command

    data = {}
    data['paths'] = {
        'mcc_path': mcc_path,
        'install': install_path,
        'log_dir': log_dir
    }
    data['URLs'] = config_install.URL
    data['MD5s'] = config_install.MD5
    data['ENVs'] = config_install.ENV
    data['output'] = config_install.OUTPUT

    # resolve the path placeholders in the install configuration
    for method in data['ENVs'].keys():
        data['ENVs'][method] = data['ENVs'][method].replace(
            config_install.ENV_PATH, install_path + "envs/")

    for method in data['output'].keys():
        data['output'][method] = data['output'][method].replace(
            config_install.INSTALL_PATH, install_path)

    with open(install_config, "w") as c:
        json.dump(data, c, indent=4)

    if os.path.exists(install_path + "install.log"):
        os.remove(install_path + "install.log")

    # removes installed tools and conda environments
    if not resume:
        mccutils.log(
            "install",
            "Removing all previously installed McClintock conda envs and tools"
        )
        mccutils.log(
            "install",
            "Use the --resume option if you don't want to perform a clean installation"
        )
        mccutils.log("install", "Removing conda envs from: " + conda_env_dir)
        mccutils.log(
            "install",
            "Removing installed tools from: " + install_path + "tools")
        mccutils.remove(conda_env_dir)
        mccutils.remove(install_path + "/tools")

    mccutils.mkdir(conda_env_dir)
    os.chdir(install_path)
    mccutils.mkdir(log_dir)

    # temp requires te-locate scripts to make taxonomy file
    if "temp" in methods and "te-locate" not in methods:
        methods.append("te-locate")

    for env in methods:
        if env in config.NO_INSTALL_METHODS:
            continue

        mccutils.log("install", "Installing conda environment for: " + env)
        mccutils.run_command(
            _snakemake_command(data['output'][env], envs_only=True))

        mccutils.log("install", "Installing scripts for:" + env)
        mccutils.run_command(
            _snakemake_command(data['output'][env], envs_only=False))

    mccutils.log("install",
                 "Installing conda environment for setup_reads steps")
    mccutils.run_command(
        _snakemake_command(data['output']['setup_reads'], envs_only=True))

    mccutils.log("install",
                 "Installing conda environment for processing steps")
    mccutils.run_command(
        _snakemake_command(data['output']['processing'], envs_only=True))
def main():
    """Run RelocaTE2 for one sample from inside a Snakemake rule.

    Reads its inputs, parameters, outputs and thread count from the global
    ``snakemake`` object. The output directory is emptied, the inputs are
    symlinked into ``<out_dir>/input/``, and ``relocaTE2.py`` (located on
    ``PATH`` with ``which``) is run under ``python2`` with the extra options
    in ``config.PARAMS``.

    On success both rule outputs are checked and ``COMPLETED`` is written to
    the status log. On any exception the traceback is printed to stderr and
    appended to the log, ``FAILED`` is written to the status log and both
    outputs are touched.
    """
    sample_name = snakemake.params.sample_name
    threads = snakemake.threads
    out_dir = snakemake.params.out_dir
    median_insert_size_file = snakemake.input.median_insert_size
    log = snakemake.params.log
    status_log = snakemake.params.status_log

    try:
        # ensures intermediate files from previous runs are removed
        for f in os.listdir(out_dir):
            mccutils.remove(out_dir + "/" + f)

        # a second fastq of "None" (string) marks a single-end run
        is_paired = snakemake.params.raw_fq2 != "None"

        input_dir = snakemake.params.out_dir + "/input/"
        mccutils.remove(input_dir)
        mccutils.mkdir(input_dir)
        fq_dir = snakemake.params.out_dir + "/input/fastq/"
        mccutils.mkdir(fq_dir)

        reference = input_dir + "reference.fasta"
        te_seqs = input_dir + "consensus.fasta"
        rm_out = input_dir + "repeatmasker.out"

        os.symlink(snakemake.input.reference, reference)
        os.symlink(snakemake.input.te_seqs, te_seqs)
        os.symlink(snakemake.input.rm_out, rm_out)

        # the fastq suffixes below must match the -1/-2/-u values passed later
        if is_paired:
            fq1 = fq_dir + sample_name + "_1.fq"
            fq2 = fq_dir + sample_name + "_2.fq"
            os.symlink(snakemake.input.fq1, fq1)
            os.symlink(snakemake.input.fq2, fq2)
        else:
            fq1 = fq_dir + sample_name + ".unPaired.fq"
            os.symlink(snakemake.input.fq1, fq1)

        median_insert_size = get_median_insert_size(median_insert_size_file)

        # subprocess.run waits for the child and closes both pipes; the
        # previous bare Popen + stdout.read() never reaped the process or
        # closed stderr.
        which = subprocess.run(["which", "relocaTE2.py"],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
        # empty string when `which` finds nothing
        script = which.stdout.decode().replace("\n", "")

        mccutils.log("relocate2", "running RelocaTE2", log=log)
        command = [
            "python2", script,
            "-t", te_seqs,
            "-g", reference,
            "-r", rm_out,
            "-o", out_dir,
            "-s", str(median_insert_size),
            "--run",
            "-v", "4",
            "-c", str(threads),
            "-d", fq_dir,
        ]

        for param, value in config.PARAMS.items():
            command.append(param)
            command.append(str(value))

        if is_paired:
            command += ["-1", "_1", "-2", "_2"]
        else:
            command += ["-u", ".unPaired"]

        mccutils.run_command(command, log=log)

        mccutils.check_file_exists(snakemake.output[0])
        mccutils.check_file_exists(snakemake.output[1])
        with open(status_log, "w") as l:
            l.write("COMPLETED\n")
        mccutils.log("relocate2", "RelocaTE2 run complete")

    except Exception:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log, "a") as l:
            print(track, file=l)
        mccutils.log("relocate2", "RelocaTE2 run failed")
        with open(status_log, "w") as l:
            l.write("FAILED\n")

        # leave (empty) outputs behind so the rule's output files exist
        mccutils.run_command(["touch", snakemake.output[0]])
        mccutils.run_command(["touch", snakemake.output[1]])
def parse_args(expected_configs):
    """Parse and validate the McClintock command line.

    ``expected_configs`` is a mapping whose values are iterables of config
    file names; each must exist under the config directory (``--config`` or
    the ``config/`` directory next to this file), otherwise the process exits.

    ``-r`` and ``-c`` are required unless ``--install`` is on the command
    line; ``-1`` is required unless ``--install`` or ``--make_annotations`` is
    present. When ``--install`` is given, ``install`` is called and the
    process exits with status 0 before any input path is resolved.

    Returns the argparse namespace with input paths resolved through
    ``mccutils.get_abs_path``, ``out`` made absolute, defaults filled in
    (``proc`` = 1, ``out`` = current directory, ``keep_intermediate`` =
    ``["general"]``), ``methods`` and ``keep_intermediate`` turned into lists
    and ``sample_name`` set. Invalid values terminate the process via
    ``sys.exit``.
    """
    parser = argparse.ArgumentParser(prog='McClintock', description="Meta-pipeline to identify transposable element insertions using next generation sequencing data")

    ## required ##
    # "required" is decided by looking at the raw command line, so these
    # options can be omitted for --install (and -1 also for --make_annotations)
    parser.add_argument("-r", "--reference", type=str, help="A reference genome sequence in fasta format", required=('--install' not in sys.argv))
    parser.add_argument("-c", "--consensus", type=str, help="The consensus sequences of the TEs for the species in fasta format", required='--install' not in sys.argv)
    parser.add_argument("-1", "--first", type=str, help="The path of the first fastq file from paired end read sequencing or the fastq file from single read sequencing", required=(('--install' not in sys.argv) and ('--make_annotations' not in sys.argv)))

    ## optional ##
    parser.add_argument("-2", "--second", type=str, help="The path of the second fastq file from a paired end read sequencing", required=False)
    parser.add_argument("-p", "--proc", type=int, help="The number of processors to use for parallel stages of the pipeline [default = 1]", required=False)
    parser.add_argument("-o", "--out", type=str, help="An output folder for the run. [default = '.']", required=False)
    parser.add_argument("-m", "--methods", type=str, help="A comma-delimited list containing the software you want the pipeline to use for analysis. e.g. '-m relocate,TEMP,ngs_te_mapper' will launch only those three methods", required=False)
    parser.add_argument("-g", "--locations", type=str, help="The locations of known TEs in the reference genome in GFF 3 format. This must include a unique ID attribute for every entry", required=False)
    parser.add_argument("-t", "--taxonomy", type=str, help="A tab delimited file with one entry per ID in the GFF file and two columns: the first containing the ID and the second containing the TE family it belongs to. The family should correspond to the names of the sequences in the consensus fasta file", required=False)
    parser.add_argument("-s", "--coverage_fasta", type=str, help="A fasta file that will be used for TE-based coverage analysis, if not supplied then the consensus sequences of the TEs will be used for the analysis", required=False)
    parser.add_argument("-T", "--comments", action="store_true", help="If this option is specified then fastq comments (e.g. barcode) will be incorporated to SAM output. Warning: do not use this option if the input fastq files do not have comments", required=False)
    # parser.add_argument("-b", "--keep_bam", action="store_true", help="Retain the sorted and indexed BAM file of the paired end data aligned to the reference genome", required=False)
    # parser.add_argument("-i", "--remove_intermediate", action="store_true", help="If this option is specified then all sample specific intermediate files will be removed, leaving only the overall results. The default is to leave sample specific intermediate files", required=False)
    parser.add_argument("-a", "--augment", type=str, help="A fasta file of TE sequences that will be included as extra chromosomes in the reference file (useful if the organism is known to have TEs that are not present in the reference strain)", required=False)
    parser.add_argument("--sample_name", type=str, help="The sample name to use for output files [default: fastq1 name]", required=False)
    parser.add_argument("--resume", action="store_true", help="This option will attempt to use existing intermediate files from a previous McClintock run", required=False)
    parser.add_argument("--install", action="store_true", help="This option will install the dependencies of mcclintock", required=False)
    parser.add_argument("--debug", action="store_true", help="This option will allow snakemake to print progress to stdout", required=False)
    parser.add_argument("--slow", action="store_true", help="This option runs without attempting to optimize thread usage to run rules concurrently. Each multithread rule will use the max processors designated by -p/--proc", required=False)
    parser.add_argument("--make_annotations", action="store_true", help="This option will only run the pipeline up to the creation of the repeat annotations", required=False)
    parser.add_argument("-k","--keep_intermediate", type=str, help="This option determines which intermediate files are preserved after McClintock completes [default: general][options: minimal, general, methods, <list,of,methods>, all]", required=False)
    parser.add_argument("--config", type=str, help="This option determines which config files to use for your mcclintock run [default: config in McClintock Repository]", required=False)

    args = parser.parse_args()

    # check --config: default to the config/ directory next to this file and
    # make sure every expected config file is present
    if args.config is None:
        args.config = os.path.dirname(os.path.abspath(__file__)) + "/config/"
    else:
        args.config = os.path.abspath(args.config)+"/"
    for key in expected_configs.keys():
        for config_file in expected_configs[key]:
            if not os.path.exists(args.config+"/"+config_file):
                sys.exit("Error: can't find config file: "+args.config+"/"+config_file+"\n Check that --config is set correctly...exiting...\n")

    # NOTE(review): --debug is a store_true flag, which argparse defaults to
    # False, so this guard is presumably never taken — confirm before removing
    if args.debug is None:
        args.debug = False

    # check -m
    # If only one fastq has been supplied (and this is not an install),
    # only the single-end methods are accepted
    if args.second is None and not args.install:
        valid_methods = sysconfig.SINGLE_END_METHODS #from config.py
    else:
        valid_methods = sysconfig.ALL_METHODS #from config.py

    # used to preserve trimgalore and mapped reads output if they are explicitly called by the user
    trimgalore_called = False
    map_reads_called = False
    if args.methods is None:
        # no -m given: use every valid method (same list object as the config)
        args.methods = valid_methods
    else:
        args.methods = args.methods.split(",")
        # these membership tests run before the names are lower-cased below
        if "trimgalore" in args.methods:
            trimgalore_called = True
        if "map_reads" in args.methods:
            map_reads_called = True
        # lower-case each requested method in place and reject unknown ones
        for x,method in enumerate(args.methods):
            args.methods[x] = method.lower()
            if args.methods[x] not in valid_methods:
                sys.stderr.write(" ".join(["Method:",method, "not a valid method...", "Valid methods:"," ".join(valid_methods),"\n"]))
                sys.exit(1)

    # --install short-circuits everything below: install and exit
    if args.install:
        mccutils.log("install","installing dependencies")
        mccutils.log("install","WARNING: this could take awhile")
        install(args.methods, resume=args.resume, debug=args.debug)
        sys.exit(0)

    # check -r
    args.reference = mccutils.get_abs_path(args.reference)
    # check -c
    args.consensus = mccutils.get_abs_path(args.consensus)

    # fastq inputs are only resolved when reads will actually be processed
    if args.make_annotations != True:
        # check -1
        args.first = mccutils.get_abs_path(args.first)
        # check -2
        if args.second is not None:
            args.second = mccutils.get_abs_path(args.second)

    # check -p
    if args.proc is None:
        args.proc = 1

    # check -o
    if args.out is None:
        args.out = os.path.abspath(".")
    else:
        args.out = os.path.abspath(args.out)
        try:
            mccutils.mkdir(args.out)
        except Exception as e:
            track = traceback.format_exc()
            print(track, file=sys.stderr)
            print("cannot create output directory: ",args.out,"exiting...", file=sys.stderr)
            sys.exit(1)

    # check -g: a locations GFF is only usable together with a taxonomy file
    if args.locations is not None:
        args.locations = mccutils.get_abs_path(args.locations)

        if args.taxonomy is None:
            sys.stderr.write("If a GFF file is supplied (-g/--locations) then a TE taxonomy file that links it to the fasta consensus is also needed (-t/--taxonomy)...exiting...\n")
            sys.exit(1)

    # check -t
    if args.taxonomy is not None:
        args.taxonomy = mccutils.get_abs_path(args.taxonomy)

    # check -s
    if args.coverage_fasta is not None:
        args.coverage_fasta = mccutils.get_abs_path(args.coverage_fasta)

    # check -T
    # NOTE(review): store_true flag, presumably never None — confirm
    if args.comments is None:
        args.comments = False

    # check -a
    if args.augment is not None:
        args.augment = mccutils.get_abs_path(args.augment)

    # check sample name: "tmp" is rejected as a user-supplied name because it
    # is the name assigned below for --make_annotations runs
    if args.sample_name is not None:
        if "/" in args.sample_name or args.sample_name == "tmp":
            sys.exit(args.sample_name+" is not a valid sample name...\n")
    else:
        if not args.make_annotations:
            args.sample_name = mccutils.get_base_name(args.first)
        else:
            args.sample_name = "tmp"

    # check -k: each entry must be a keyword or one of the selected methods
    keep_intermediate_options = ["minimal","general", "methods", "all"] + args.methods
    if args.keep_intermediate is None:
        args.keep_intermediate = ["general"]
    else:
        args.keep_intermediate = args.keep_intermediate.split(",")
        for option in args.keep_intermediate:
            if option not in keep_intermediate_options:
                sys.stderr.write("keep_intermediate option: "+option+" is not valid. Valid options: "+" ".join(keep_intermediate_options)+"\nExample:(--keep_intermediate general,methods)\n")
                sys.exit(1)

    # explicitly requested trimgalore / map_reads output is always kept
    if trimgalore_called:
        args.keep_intermediate.append("trimgalore")

    if map_reads_called:
        args.keep_intermediate.append("map_reads")

    return args
def make_method_pages(jinja_env, methods, consensus, out_file_map, chromosomes, out_dir):
    """Write the per-method data tables and HTML report pages.

    Only methods not listed in ``NO_PRED_METHODS`` are reported; nothing is
    written when none remain. For each remaining method three CSV-style
    tables are written under ``<out_dir>/data/methods/<method>/``
    (predictions per family, per contig, and all predictions) and the
    ``method.html`` template is rendered to ``<out_dir>/html/<method>.html``.

    Family names are taken from the header lines of the ``consensus`` fasta.
    """
    def _tally(predictions):
        # count predictions typed "Reference" versus everything else
        reference_total = 0
        nonreference_total = 0
        for item in predictions:
            if item.type == "Reference":
                reference_total += 1
            else:
                nonreference_total += 1
        return reference_total, nonreference_total

    prediction_methods = [m for m in methods if m not in NO_PRED_METHODS]
    if not prediction_methods:
        return

    # family names are the fasta header lines without ">" and newline
    families = []
    with open(consensus, "r") as fasta:
        for record_line in fasta:
            if record_line.startswith(">"):
                families.append(record_line.replace(">", "").replace("\n", ""))

    mccutils.mkdir(out_dir + "/data/methods/")

    for method in prediction_methods:
        template = jinja_env.get_template('method.html')
        method_dir = out_dir + "/data/methods/" + method
        mccutils.mkdir(method_dir)
        predictions_file = out_file_map[method]

        # --- predictions per family ---
        reference_family_counts = []
        nonreference_family_counts = []
        for family in families:
            n_ref, n_nonref = _tally(
                get_predictions(predictions_file, family=family))
            reference_family_counts.append(n_ref)
            nonreference_family_counts.append(n_nonref)

        with open(method_dir + "/family_predictions.txt", "w") as table:
            table.write(",".join(["Family", "Reference", "Non-Reference"]) + "\n")
            for name, n_ref, n_nonref in zip(families,
                                             reference_family_counts,
                                             nonreference_family_counts):
                table.write(",".join([name, str(n_ref), str(n_nonref)]) + "\n")

        # determine height of family counts plot, makes sure there is enough room for each bar
        height_per_entry = 20
        min_height = 500
        family_plot_height = max(len(families) * height_per_entry, min_height)

        # --- predictions per contig ---
        reference_chromosome_counts = []
        nonreference_chromosome_counts = []
        for chromosome in chromosomes:
            n_ref, n_nonref = _tally(
                get_predictions(predictions_file, chromosome=chromosome))
            reference_chromosome_counts.append(n_ref)
            nonreference_chromosome_counts.append(n_nonref)

        with open(method_dir + "/contig_predictions.txt", "w") as table:
            table.write(",".join(["Contig", "Reference", "Non-Reference"]) + "\n")
            for name, n_ref, n_nonref in zip(chromosomes,
                                             reference_chromosome_counts,
                                             nonreference_chromosome_counts):
                table.write(",".join([name, str(n_ref), str(n_nonref)]) + "\n")

        # determine height of plot of predictions per contig
        chrom_plot_height = max(len(chromosomes) * height_per_entry, min_height)

        # --- every prediction ---
        predictions = get_predictions(predictions_file)
        with open(method_dir + "/all_predictions.txt", "w") as table:
            table.write(
                ",".join(["Contig", "Family", "Type", "Start", "End", "Strand"])
                + "\n")
            for item in predictions:
                fields = [
                    item.chrom, item.family, item.type,
                    str(item.start), str(item.end), item.strand
                ]
                table.write(",".join(fields) + "\n")

        rendered_lines = template.render(
            methods=prediction_methods,
            method=method,
            families=families,
            family_plot_height=family_plot_height,
            reference_family_counts=reference_family_counts,
            nonreference_family_counts=nonreference_family_counts,
            chromosomes=chromosomes,
            chrom_plot_height=chrom_plot_height,
            reference_chromosome_counts=reference_chromosome_counts,
            nonreference_chromosome_counts=nonreference_chromosome_counts,
            predictions=predictions)

        with open(out_dir + "/html/" + method + ".html", "w") as page:
            page.writelines(rendered_lines)
def make_run_config(args, sample_name, ref_name, full_command, current_directory):
    """Write the JSON run configuration consumed by snakemake.

    A random 7-digit run id is drawn and the configuration is written to
    ``<out>/snakemake/config/config_<run_id>.json``. It records the run
    arguments, the input paths, the intermediate-file locations and the
    conda environment paths. A timestamped log directory is created under
    ``<out>/logs/``.

    The git commit of the McClintock checkout is looked up with
    ``git rev-parse HEAD`` (run from the checkout, which becomes the working
    directory); ``"?"`` is recorded when that fails.

    Placeholders in ``config.OUT_PATHS``, ``config.INTERMEDIATE_PATHS`` and
    ``config_install.ENV`` are substituted in place.

    Returns the run id.
    """
    run_id = random.randint(1000000, 9999999)
    run_tag = str(run_id)

    mccutils.mkdir(args.out + "/snakemake")
    mccutils.mkdir(args.out + "/snakemake/config")
    run_config = args.out + "/snakemake/config/config_" + run_tag + ".json"
    input_dir = args.out + "/method_input/"
    results_dir = args.out + "/results/"
    mcc_path = os.path.dirname(os.path.abspath(__file__))

    # get git commit hash to provide in summary report
    git_commit = "?"
    try:
        os.chdir(mcc_path)
        git_commit_file = args.out + "/git-commit.txt"
        mccutils.run_command_stdout(["git", "rev-parse", "HEAD"],
                                    git_commit_file)
        with open(git_commit_file, "r") as commit_handle:
            commit_lines = commit_handle.readlines()
        # the last line of the file wins
        if commit_lines:
            git_commit = commit_lines[-1].replace("\n", "")
        mccutils.remove(git_commit_file)
    except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        print("Could not locate git commit hash...using '?' ", file=sys.stderr)
        git_commit = "?"

    mccutils.log("SETUP", "McClintock Version: " + git_commit)

    # fill the placeholders of the shared output-path table (mutated in place)
    out_files = config.OUT_PATHS
    for key in out_files:
        out_files[key] = (out_files[key]
                          .replace(config.INPUT_DIR, input_dir)
                          .replace(config.RESULTS_DIR, results_dir)
                          .replace(config.SAMPLE_NAME, sample_name))

    out_files_to_make = [out_files[method] for method in args.methods]

    now = datetime.now()
    log_dir = (args.out + "/logs/" + now.strftime("%Y%m%d.%H%M%S") + "."
               + run_tag + "/")
    mccutils.mkdir(log_dir)

    # contig names of the reference, with special characters replaced
    chromosomes = [
        mccutils.replace_special_chars(str(record.id))
        for record in SeqIO.parse(args.reference, "fasta")
    ]

    max_threads = max(
        1,
        calculate_max_threads(args.proc, args.methods,
                              config.MULTI_THREAD_METHODS, slow=args.slow))

    data = {}
    data['args'] = {
        'proc': str(args.proc),
        'out': str(args.out),
        'log_dir': log_dir,
        'augment_fasta': str(args.augment),
        'mcc_path': mcc_path,
        'commit': git_commit,
        'sample_name': sample_name,
        'ref_name': ref_name,
        'run_id': run_tag,
        'methods': ",".join(args.methods),
        'out_files': ",".join(out_files_to_make),
        'save_comments': str(args.comments),
        'max_threads_per_rule': max_threads,
        'full_command': full_command,
        'call_directory': current_directory,
        'time': now.strftime("%Y-%m-%d %H:%M:%S"),
        "chromosomes": ",".join(chromosomes)
    }

    # input paths for files
    data["in"] = {
        'reference': str(args.reference),
        'consensus': str(args.consensus),
        'fq1': str(args.first),
        'fq2': str(args.second),
        'locations': str(args.locations),
        'taxonomy': str(args.taxonomy),
        'coverage_fasta': str(args.coverage_fasta),
    }

    # where mcc copies will be stored
    intermediate = config.INTERMEDIATE_PATHS
    data["mcc"] = intermediate
    for key in intermediate:
        intermediate[key] = (intermediate[key]
                             .replace(config.INPUT_DIR, input_dir)
                             .replace(config.REF_NAME, ref_name)
                             .replace(config.SAMPLE_NAME, sample_name))

    # resolved again here, after the chdir above, exactly as before
    env_path = os.path.dirname(os.path.abspath(__file__)) + "/install/envs/"
    envs = config_install.ENV
    data["envs"] = envs
    for key in envs:
        envs[key] = envs[key].replace(config_install.ENV_PATH, env_path)

    with open(run_config, "w") as conf:
        json.dump(data, conf, indent=4)

    return run_id
def main():
    """Run RelocaTE for one sample from inside a Snakemake rule.

    Reads its inputs, parameters and outputs from the global ``snakemake``
    object. The output directory is emptied, the inputs are symlinked into
    ``<out_dir>/input/``, ``relocaTE.pl`` is run from inside ``out_dir`` with
    the settings in ``config.RELOCATE`` and ``combine_gffs`` is called to
    produce the rule's first output. The fastq symlink directory and any
    ``te_containing_fq`` sub-directories are removed afterwards.
    """
    params = snakemake.params
    sample_name = params.sample_name
    log = params.log
    # a second fastq of "None" (string) marks a single-end run
    is_paired = params.raw_fq2 != "None"
    script_dir = params.script_dir
    out_dir = params.out_dir
    out_gff = snakemake.output[0]

    # start from a clean slate: drop anything left by an earlier run
    for leftover in os.listdir(out_dir):
        mccutils.remove(out_dir+"/"+leftover)

    mccutils.log("relocate","running RelocaTE", log=log)

    input_dir = out_dir+"/input/"
    mccutils.remove(input_dir)
    mccutils.mkdir(input_dir)
    fq_dir = input_dir+"fastq/"
    mccutils.mkdir(fq_dir)

    consensus_fasta = input_dir+"consensus.fasta"
    te_gff = input_dir+"te.gff"
    reference_fasta = input_dir+"reference.fasta"

    # draw a random id until one is found that is not a substring of the
    # fastq directory path
    while True:
        uniq_id = str(random.randint(10000,99999))
        if uniq_id not in fq_dir:
            break
        mccutils.log("relocate","unique id: "+uniq_id+" occurs in file path... selecting a new one...", log=log)

    fq1_uniq_id = uniq_id+"_mcc_relocate_1"
    fq2_uniq_id = uniq_id+"_mcc_relocate_2"
    unpaired_id = uniq_id+"_unPaired"

    os.symlink(snakemake.input.consensus_fasta, consensus_fasta)
    os.symlink(snakemake.input.te_gff, te_gff)
    os.symlink(snakemake.input.reference_fasta, reference_fasta)

    fq_prefix = fq_dir+sample_name+"."
    if is_paired:
        os.symlink(snakemake.input.fq1, fq_prefix+fq1_uniq_id+".fq")
        os.symlink(snakemake.input.fq2, fq_prefix+fq2_uniq_id+".fq")
    else:
        os.symlink(snakemake.input.fq1, fq_prefix+unpaired_id+".fq")

    annotation = make_annotation_file(te_gff, out_dir)

    # RelocaTE is invoked with "-o ." so it must run from out_dir; the
    # relative paths in the cleanup loop below rely on this as well
    os.chdir(out_dir)

    command = [
        "perl", script_dir+"/relocaTE.pl",
        "-t", consensus_fasta,
        "-d", fq_dir,
        "-g", reference_fasta,
        "-o", ".",
        "-r", annotation,
    ]
    for option in ("l", "m", "bm", "bt", "f"):
        command += ["-"+option, str(config.RELOCATE[option])]
    command.extend(
        ["-1", fq1_uniq_id, "-2", fq2_uniq_id] if is_paired
        else ["-u", unpaired_id])

    mccutils.run_command(command, log=log)
    combine_gffs(out_dir, out_gff)

    # remove the fastq symlinks and the per-entry te_containing_fq directories
    mccutils.remove(out_dir+"/input/fastq")
    for entry in os.listdir(out_dir):
        te_fq_dir = entry+"/te_containing_fq/"
        if os.path.exists(te_fq_dir):
            mccutils.remove(te_fq_dir)

    mccutils.log("relocate","RelocaTE run complete")
def main():
    """Run ngs_te_mapper for one sample from inside a Snakemake rule.

    Reads its inputs, parameters, outputs and thread count from the global
    ``snakemake`` object. The output directory is emptied,
    ``ngs_te_mapper.R`` is run through ``Rscript``, and the
    ``*insertions.bed*`` file found in ``<out_dir>/bed_tsd/`` is copied to the
    rule's first output.

    On success ``COMPLETED`` is written to the status log. On any exception
    the traceback is printed to stderr and appended to the log, ``FAILED`` is
    written to the status log, ``<out_dir>/bed_tsd/`` is created and the
    output bed is touched.
    """
    consensus_fasta = snakemake.input.consensus_fasta
    reference_fasta = snakemake.input.reference_fasta
    fastq1 = snakemake.input.fastq1
    fastq2 = snakemake.input.fastq2
    status_log = snakemake.params.status_log
    log = snakemake.params.log
    # Bound before the try block: the failure handler below needs both, and
    # previously an early failure left them undefined (NameError in the
    # handler, so the placeholder output was never created).
    out_dir = snakemake.params.out_dir
    out_bed = snakemake.output[0]

    try:
        with open(log,"a") as l:
            l.write("consensus fasta: "+consensus_fasta+"\n")
            l.write("reference fasta: "+reference_fasta+"\n")
            l.write("fastq1: "+fastq1+"\n")
            l.write("fastq2: "+fastq2+"\n")

        threads = snakemake.threads
        sample_name = snakemake.params.sample_name
        script_dir = snakemake.params.script_dir

        # ensures intermediate files from previous runs are removed
        for f in os.listdir(out_dir):
            mccutils.remove(out_dir+"/"+f)

        # a second fastq of "None" (string) marks a single-end run
        is_paired = snakemake.params.raw_fq2 != "None"

        command = [
            'Rscript', "--vanilla", script_dir+"/ngs_te_mapper.R",
            "genome="+reference_fasta,
            "teFile="+consensus_fasta,
            "tsd="+str(config.PARAMS["tsd="]),
            "thread="+str(threads),
            "output="+out_dir,
            "sourceCodeFolder="+script_dir,
        ]

        # paired reads are passed as a single ";"-separated sample argument
        if is_paired:
            command.append("sample="+fastq1+";"+fastq2)
        else:
            command.append("sample="+fastq1)

        mccutils.log("ngs_te_mapper","running ngs_te_mapper", log=log)
        mccutils.run_command(command, log=log)
        mccutils.log("ngs_te_mapper","ngs_te_mapper run complete", log=log)

        # pick the insertions bed (the last match wins if there are several);
        # raw_bed stays "" when none is found and the check below handles it
        raw_bed = ""
        for f in os.listdir(out_dir+"/bed_tsd/"):
            if "insertions.bed" in f:
                raw_bed = out_dir+"/bed_tsd/"+f

        mccutils.check_file_exists(raw_bed)
        mccutils.run_command(["cp", raw_bed, out_bed])

        mccutils.log("ngs_te_mapper","ngs_te_mapper run complete")
        with open(status_log,"w") as l:
            l.write("COMPLETED\n")

    except Exception:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        with open(log,"a") as l:
            print(track, file=l)
        mccutils.log("ngs_te_mapper","ngs_te_mapper run failed")
        with open(status_log,"w") as l:
            l.write("FAILED\n")

        # leave the expected directory and an (empty) output bed behind
        mccutils.mkdir(out_dir+"/bed_tsd/")
        mccutils.run_command(["touch", out_bed])