def format_taxonomy(in_taxonomy, gff_ids, consensus_ids, consensus_fasta, locations_gff):
    """Validate the TE taxonomy TSV.

    Each non-blank line must have exactly two tab-separated columns
    (feature ID, TE family). Names must be free of problematic symbols,
    every ID must appear in the locations GFF IDs, and every family must
    appear in the consensus fasta sequence names. Exits the program with
    an error message on the first violation.

    Args:
        in_taxonomy: path to the taxonomy TSV being checked
        gff_ids: feature IDs collected from the locations GFF
        consensus_ids: sequence names collected from the consensus fasta
        consensus_fasta: path to the consensus fasta (for error messages)
        locations_gff: path to the locations GFF (for error messages)
    """
    mccutils.log("setup","checking taxonomy TSV: "+in_taxonomy)
    with open(in_taxonomy, "r") as tsv:
        for line in tsv:
            # tolerate blank lines (e.g. a trailing newline at EOF) instead of
            # rejecting the whole file as "does not have two columns"
            if line.strip() == "":
                continue
            split_line = line.split("\t")
            if len(split_line) != 2:
                sys.exit(in_taxonomy+" does not have two columns. Should be tab-separated file with feature ID and TE family as columns\n")
            else:
                te_id = split_line[0]
                masked_te_id = mccutils.replace_special_chars(te_id)
                if masked_te_id != te_id:
                    mccutils.log("setup", in_taxonomy+": ERROR problematic symbol in feature name: "+te_id+" ... reformat this feature name for compatibility with McClintock")
                    print("Problematic symbols:"," ".join(mccutils.INVALID_SYMBOLS))
                    sys.exit(1)
                te_family = split_line[1].replace("\n","")
                # RepeatMasker uses '#' as a name/class separator, so the
                # family name is truncated at the first '#'
                if "#" in te_family:
                    org_te_family = te_family
                    te_family = te_family[:(te_family.find("#"))]
                    mccutils.log("setup", in_taxonomy+": replacing "+org_te_family+" with "+te_family+" for compatibility with RepeatMasker")
                masked_te_family = mccutils.replace_special_chars(te_family)
                if masked_te_family != te_family:
                    mccutils.log("setup", in_taxonomy+": ERROR problematic symbol in feature name: "+te_family+" ... reformat this feature name for compatibility with McClintock")
                    print("Problematic symbols:"," ".join(mccutils.INVALID_SYMBOLS))
                    sys.exit(1)
                if masked_te_id not in gff_ids:
                    sys.exit("TE ID: "+masked_te_id+" not found in IDs from GFF: "+locations_gff+"\nplease make sure each ID in: "+in_taxonomy+" is found in:"+locations_gff+"\n")
                if masked_te_family not in consensus_ids:
                    sys.exit("TE Family: "+masked_te_family+" not found in sequence names from: "+consensus_fasta+"\nplease make sure each family in: "+in_taxonomy+" is found in: "+consensus_fasta+"\n")
def format_gff(ingff):
    """Validate the locations GFF and return the list of its feature IDs.

    Each non-comment line must have at least 9 tab-separated columns and an
    ``ID=`` attribute in column 9. IDs must be unique and free of problematic
    symbols. Exits the program with an error message on the first violation.

    Args:
        ingff: path to the locations GFF being checked

    Returns:
        list of (special-character-masked) feature IDs, in file order
    """
    mccutils.log("setup","checking locations gff: "+ingff)
    gff_ids = []
    with open(ingff,"r") as gff:
        for line in gff:
            if "#" not in line[0]:
                split_line = line.split("\t")
                if len(split_line) < 9:
                    sys.exit(ingff+" appears to be a malformed GFF file..exiting...\n")
                else:
                    feats = split_line[8]
                    split_feats = feats.split(";")
                    gff_id = ""
                    for feat in split_feats:
                        if feat[:3] == "ID=":
                            gff_id = feat.split("=")[1].replace("\n","")
                            masked_gff_id = mccutils.replace_special_chars(gff_id)
                            if gff_id != masked_gff_id:
                                mccutils.log("setup", ingff+": ERROR problematic symbol in feature name: "+gff_id+" ... reformat this feature name for compatibility with McClintock")
                                print("Problematic symbols:"," ".join(mccutils.INVALID_SYMBOLS))
                                sys.exit(1)
                            if masked_gff_id not in gff_ids:
                                gff_ids.append(masked_gff_id)
                            else:
                                sys.exit("ID: "+masked_gff_id+" is not unique. please ensure each feature has a unique ID\n")
                    # BUG FIX: the original tested masked_gff_id here, which is
                    # undefined (NameError) on the first line lacking an ID=
                    # attribute, or stale from a previous line so the error
                    # could never fire; gff_id is reset to "" for every line
                    if gff_id == "":
                        sys.exit("GFF line: "+line+" is missing an ID attribute (ex. ID=chr1_TY1s1)\n")
    return gff_ids
def format_fasta(in_fasta):
    """Validate a fasta file and return the list of its sequence names.

    Sequence names are truncated at the first '#' (RepeatMasker separator),
    must be free of problematic symbols, and must be unique. Exits the
    program with an error message on the first violation, on a parse
    failure, or if the file contains no sequences.

    Args:
        in_fasta: path to the fasta file being checked

    Returns:
        list of (special-character-masked) sequence names, in file order
    """
    mccutils.log("setup","checking fasta: "+in_fasta)
    names = []
    try:
        with open(in_fasta,"r") as handle:
            for rec in SeqIO.parse(handle, "fasta"):
                name = str(rec.id)
                hash_pos = name.find("#")
                if hash_pos != -1:
                    original = name
                    name = name[:hash_pos]
                    mccutils.log("setup", in_fasta+": replacing "+original+" with "+name+" for compatibility with RepeatMasker")
                masked = mccutils.replace_special_chars(name)
                if masked != name:
                    mccutils.log("setup", in_fasta+": ERROR problematic symbol in feature name: "+name+" ... reformat this feature name for compatibility with McClintock")
                    print("Problematic symbols:"," ".join(mccutils.INVALID_SYMBOLS))
                    sys.exit(1)
                if masked in names:
                    sys.exit(in_fasta+": Duplicate sequence name:"+masked+"...exiting...\n")
                names.append(masked)
    except Exception as err:
        print(err)
        sys.exit(in_fasta+" appears to be a malformed FastA file..exiting...\n")
    if not names:
        sys.exit(in_fasta+" contains no sequences... exiting...\n")
    return names
def format_ref_te_gff(ref_tes, run_id, out):
    """Rewrite the reference TE GFF into a normalized temporary copy.

    For every data line, the feature ID (from the ``ID=`` attribute) is
    special-character-masked, written into the GFF 'type' column, and the
    attribute column is replaced by ``ID=...;Name=...;Alias=...``.

    Args:
        ref_tes: path to the input reference TE GFF
        run_id: run identifier string used in the temp file name
        out: output directory (must contain a ``tmp`` subdirectory)

    Returns:
        path to the reformatted GFF written under ``out/tmp/``
    """
    format_ref_tes = out + "/tmp/" + run_id + "tmpreferenceTEs1.gff"
    with open(ref_tes, "r") as ingff:
        with open(format_ref_tes, "w") as outgff:
            for line in ingff:
                # BUG FIX: the original tested `"#" not in line`, which
                # silently dropped any *data* line containing '#' anywhere
                # (e.g. inside an attribute value) and crashed on blank
                # lines; only skip blank lines and comment lines
                if line.strip() == "" or line.startswith("#"):
                    continue
                split_line = line.split("\t")
                features = split_line[8].replace("\n", "")
                split_feats = features.split(";")
                te_id = "MISSING"
                for feat in split_feats:
                    # prefix match: a substring test ("ID=" in feat) would
                    # also match attributes like "OtherID=..."
                    if feat[:3] == "ID=":
                        te_id = feat.split("=")[1]
                te_id = mccutils.replace_special_chars(te_id)
                # use the feature ID as the GFF type column and keep only
                # the ID/Name/Alias attributes
                split_line[2] = te_id
                features = ";".join(["ID=" + te_id, "Name=" + te_id, "Alias=" + te_id])
                line = "\t".join(split_line[0:8]) + "\t" + features + "\n"
                outgff.write(line)
    return format_ref_tes
def make_run_config(args, sample_name, ref_name, full_command, current_directory):
    """Write the snakemake run-config JSON for this run.

    Builds the config dict (run arguments, input paths, intermediate paths,
    conda env paths), substitutes the placeholder tokens from ``config`` /
    ``config_install`` with run-specific values, and dumps it to
    ``<out>/snakemake/config/config_<run_id>.json``.

    NOTE(review): this definition is shadowed by a later function of the
    same name in this file; consider removing one of them.

    Returns:
        the randomly generated integer run_id
    """
    run_id = random.randint(1000000, 9999999)
    mccutils.mkdir(args.out + "/snakemake")
    mccutils.mkdir(args.out + "/snakemake/config")
    run_config = args.out + "/snakemake/config/config_" + str(run_id) + ".json"
    input_dir = args.out + "/method_input/"
    results_dir = args.out + "/results/"

    out_files_to_make = []
    # BUG FIX: copy the dict — the original rewrote config.OUT_PATHS in
    # place, so a second call in the same process would see the placeholder
    # tokens already replaced with the previous run's values
    out_files = dict(config.OUT_PATHS)
    for key in out_files.keys():
        out_files[key] = out_files[key].replace(config.INPUT_DIR, input_dir)
        out_files[key] = out_files[key].replace(config.RESULTS_DIR, results_dir)
        out_files[key] = out_files[key].replace(config.SAMPLE_NAME, sample_name)
    for method in args.methods:
        out_files_to_make.append(out_files[method])

    now = datetime.now()
    now_str = now.strftime("%Y%m%d.%H%M%S")
    log_dir = args.out + "/logs/" + now_str + "." + str(run_id) + "/"
    mccutils.mkdir(log_dir)

    # chromosome names, masked for compatibility with downstream tools
    chromosomes = []
    for record in SeqIO.parse(args.reference, "fasta"):
        chrom = str(record.id)
        chrom = mccutils.replace_special_chars(chrom)
        chromosomes.append(chrom)

    data = {}
    # general run arguments shared by every snakemake rule
    data['args'] = {
        'proc': str(args.proc),
        'out': str(args.out),
        'log_dir': log_dir,
        'augment_fasta': str(args.augment),
        'mcc_path': os.path.dirname(os.path.abspath(__file__)),
        'sample_name': sample_name,
        'ref_name': ref_name,
        'run_id': str(run_id),
        'methods': ",".join(args.methods),
        'out_files': ",".join(out_files_to_make),
        'save_comments': str(args.comments),
        'max_threads_per_rule': max(1, calculate_max_threads(args.proc, args.methods, config.MULTI_THREAD_METHODS, slow=args.slow)),
        'full_command': full_command,
        'call_directory': current_directory,
        'time': now.strftime("%Y-%m-%d %H:%M:%S"),
        "chromosomes": ",".join(chromosomes)
    }

    # input paths for files
    data["in"] = {
        'reference': str(args.reference),
        'consensus': str(args.consensus),
        'fq1': str(args.first),
        'fq2': str(args.second),
        'locations': str(args.locations),
        'taxonomy': str(args.taxonomy),
        'coverage_fasta': str(args.coverage_fasta),
    }

    # where mcc copies will be stored (copy: avoid mutating the shared
    # config.INTERMEDIATE_PATHS module dict)
    data["mcc"] = dict(config.INTERMEDIATE_PATHS)
    for key in data["mcc"].keys():
        data["mcc"][key] = data["mcc"][key].replace(config.INPUT_DIR, input_dir)
        data["mcc"][key] = data["mcc"][key].replace(config.REF_NAME, ref_name)
        data["mcc"][key] = data["mcc"][key].replace(config.SAMPLE_NAME, sample_name)

    env_path = os.path.dirname(os.path.abspath(__file__)) + "/install/envs/"
    # copy: avoid mutating the shared config_install.ENV module dict
    data["envs"] = dict(config_install.ENV)
    for key in data["envs"].keys():
        data['envs'][key] = data['envs'][key].replace(config_install.ENV_PATH, env_path)

    with open(run_config, "w") as conf:
        json.dump(data, conf, indent=4)

    return run_id
def make_run_config(args, sample_name, ref_name, full_command, current_directory):
    """Write the snakemake run-config JSON for this run.

    Builds the config dict (run arguments, git commit, input paths,
    intermediate paths, conda env paths), substitutes the placeholder tokens
    from ``config`` / ``config_install`` with run-specific values, and dumps
    it to ``<out>/snakemake/config/config_<run_id>.json``.

    Returns:
        the randomly generated integer run_id
    """
    run_id = random.randint(1000000, 9999999)
    mccutils.mkdir(args.out + "/snakemake")
    mccutils.mkdir(args.out + "/snakemake/config")
    run_config = args.out + "/snakemake/config/config_" + str(run_id) + ".json"
    input_dir = args.out + "/method_input/"
    results_dir = args.out + "/results/"
    mcc_path = os.path.dirname(os.path.abspath(__file__))

    # get git commit hash to provide in summary report; best-effort — falls
    # back to "?" if git or the repo metadata is unavailable
    git_commit = "?"
    original_cwd = os.getcwd()
    try:
        os.chdir(mcc_path)
        git_commit_file = args.out + "/git-commit.txt"
        mccutils.run_command_stdout(["git", "rev-parse", "HEAD"], git_commit_file)
        with open(git_commit_file, "r") as inf:
            for line in inf:
                git_commit = line.replace("\n", "")
        mccutils.remove(git_commit_file)
    except Exception:
        track = traceback.format_exc()
        print(track, file=sys.stderr)
        print("Could not locate git commit hash...using '?' ", file=sys.stderr)
        git_commit = "?"
    finally:
        # BUG FIX: the original chdir'd into mcc_path and never restored the
        # caller's working directory, changing process-wide state
        os.chdir(original_cwd)

    mccutils.log("SETUP", "McClintock Version: " + git_commit)

    out_files_to_make = []
    # BUG FIX: copy the dict — the original rewrote config.OUT_PATHS in
    # place, so a second call in the same process would see the placeholder
    # tokens already replaced with the previous run's values
    out_files = dict(config.OUT_PATHS)
    for key in out_files.keys():
        out_files[key] = out_files[key].replace(config.INPUT_DIR, input_dir)
        out_files[key] = out_files[key].replace(config.RESULTS_DIR, results_dir)
        out_files[key] = out_files[key].replace(config.SAMPLE_NAME, sample_name)
    for method in args.methods:
        out_files_to_make.append(out_files[method])

    now = datetime.now()
    now_str = now.strftime("%Y%m%d.%H%M%S")
    log_dir = args.out + "/logs/" + now_str + "." + str(run_id) + "/"
    mccutils.mkdir(log_dir)

    # chromosome names, masked for compatibility with downstream tools
    chromosomes = []
    for record in SeqIO.parse(args.reference, "fasta"):
        chrom = str(record.id)
        chrom = mccutils.replace_special_chars(chrom)
        chromosomes.append(chrom)

    data = {}
    # general run arguments shared by every snakemake rule
    data['args'] = {
        'proc': str(args.proc),
        'out': str(args.out),
        'log_dir': log_dir,
        'augment_fasta': str(args.augment),
        'mcc_path': mcc_path,
        'commit': git_commit,
        'sample_name': sample_name,
        'ref_name': ref_name,
        'run_id': str(run_id),
        'methods': ",".join(args.methods),
        'out_files': ",".join(out_files_to_make),
        'save_comments': str(args.comments),
        'max_threads_per_rule': max(1, calculate_max_threads(args.proc, args.methods, config.MULTI_THREAD_METHODS, slow=args.slow)),
        'full_command': full_command,
        'call_directory': current_directory,
        'time': now.strftime("%Y-%m-%d %H:%M:%S"),
        "chromosomes": ",".join(chromosomes)
    }

    # input paths for files
    data["in"] = {
        'reference': str(args.reference),
        'consensus': str(args.consensus),
        'fq1': str(args.first),
        'fq2': str(args.second),
        'locations': str(args.locations),
        'taxonomy': str(args.taxonomy),
        'coverage_fasta': str(args.coverage_fasta),
    }

    # where mcc copies will be stored (copy: avoid mutating the shared
    # config.INTERMEDIATE_PATHS module dict)
    data["mcc"] = dict(config.INTERMEDIATE_PATHS)
    for key in data["mcc"].keys():
        data["mcc"][key] = data["mcc"][key].replace(config.INPUT_DIR, input_dir)
        data["mcc"][key] = data["mcc"][key].replace(config.REF_NAME, ref_name)
        data["mcc"][key] = data["mcc"][key].replace(config.SAMPLE_NAME, sample_name)

    env_path = os.path.dirname(os.path.abspath(__file__)) + "/install/envs/"
    # copy: avoid mutating the shared config_install.ENV module dict
    data["envs"] = dict(config_install.ENV)
    for key in data["envs"].keys():
        data['envs'][key] = data['envs'][key].replace(config_install.ENV_PATH, env_path)

    with open(run_config, "w") as conf:
        json.dump(data, conf, indent=4)

    return run_id