import multiprocessing
import os
import shutil
import sys
from collections import Counter
from csv import writer, DictReader
from datetime import datetime
from itertools import combinations
from shutil import rmtree
from subprocess import Popen, PIPE, STDOUT

import pandas
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Project-local helpers (section_header, explanation, log, log_process,
# log_process_with_output_file, read_output_mentalist, tool_error_log,
# get_reference_ariba_mlst, mentalist_path_and_version,
# available_species_mentalist, check_mentalist_output,
# add_st_to_output_mentalist, mafft) are assumed to be imported from
# CGST's own modules.


def ariba_detection(r1, r2, database, work_dir_path, name, species_full):
    """
    This function manages the ARIBA detection
    :param r1: The R1 fastq file path
    :param r2: The R2 fastq file path
    :param database: The CGST database path
    :param work_dir_path: The working directory path
    :param name: The sample name used for the output files
    :param species_full: The scientific name of the strain
    """
    exe = shutil.which("ariba")
    # short species name: first letter of the genus + species epithet,
    # e.g. "Escherichia coli" -> "ecoli"
    species_full = species_full.strip()
    species = species_full.split(" ")[0][0].lower() + species_full.split(" ")[1].lower()
    db_path_species = os.path.join(database, "ariba", f"{species}")
    reference_dict = get_reference_ariba_mlst()
    for file in os.listdir(db_path_species):
        file_path = os.path.join(db_path_species, file)
        if os.path.isdir(file_path):
            db_path = os.path.join(file_path, "ref_db")
            ###################################
            # Run ARIBA call
            section_header('Run Ariba Call')
            explanation('Ariba Detection')
            # prepare
            output = os.path.join(work_dir_path, f"{name}_output_ariba_{reference_dict[file]}")
            cmd = f"{exe} run {db_path} {r1} {r2} {output}"
            log_message = f"Command used : \n {cmd}\n"
            # launch
            process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
            log_file_path = os.path.join(work_dir_path, f"logAriba_{reference_dict[file]}.txt")
            log_process_with_output_file(process, log_message, log_file_path)
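# The genus-initial + epithet computation above recurs in most functions of
# this module; a tiny hypothetical helper (not part of CGST) capturing the
# convention, e.g. "Escherichia coli" -> "ecoli":
def _short_species_name(species_full):
    genus, epithet = species_full.strip().split(" ")[:2]
    return genus[0].lower() + epithet.lower()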
def available_species_ariba(species):
    """
    This function displays the ARIBA MLST schemes available
    :param species: The scientific name of the strain
    :return: the ids of the matching schemes
    """
    exe = shutil.which("ariba")
    ###################################
    # Run ARIBA available species
    section_header('Ariba Available MLST Schemas')
    explanation('Display the MLST schemes found by Ariba in the pubMLST database')
    # prepare
    cmd = f"{exe} pubmlstspecies"
    # launch
    process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
    id_list = log_process(process, species)
    return id_list
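# log_process() is defined elsewhere in CGST; a minimal sketch of the behaviour
# available_species_ariba() relies on - stream the subprocess output to the log
# and return the lines matching the species (the names and the matching rule
# here are illustrative assumptions, not the real code):
def _log_process_sketch(process, species):
    id_list = []
    for raw_line in iter(process.stdout.readline, b""):
        line = raw_line.decode("utf-8").rstrip()
        log(line)
        if species and species in line.lower():
            id_list.append(line)
    process.wait()
    return id_list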
def check_for_required_tools():
    """
    This function checks the dependencies
    """
    section_header('Checking requirements')
    explanation('CGST requires MentaLiST to run, so it checks for this tool now.')
    mentalist_path, mentalist_version, mentalist_status = mentalist_path_and_version('mentalist')
    if mentalist_status == 'good':
        log(f'MentaLiST found: {mentalist_path} (v{mentalist_version})')
    elif mentalist_status == 'not found':
        sys.exit('Error: MentaLiST not found - make sure it is in your PATH before running CGST')
    elif mentalist_status == 'bad':
        sys.exit('Error: unable to determine MentaLiST version')
    log()
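# mentalist_path_and_version() lives in CGST's own modules; a minimal sketch of
# the (path, version, status) contract check_for_required_tools() relies on
# (illustrative; assumes `mentalist --version` prints a version string):
def _mentalist_path_and_version_sketch(name):
    import re
    from subprocess import run
    path = shutil.which(name)
    if path is None:
        return None, None, 'not found'
    result = run([path, "--version"], stdout=PIPE, stderr=STDOUT)
    match = re.search(r"\d+\.\d+(\.\d+)?", result.stdout.decode("utf-8"))
    if match is None:
        return path, None, 'bad'
    return path, match.group(0), 'good'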
def build_database_ariba(cgst_database_path, species_full):
    """
    This function builds the ARIBA MLST database
    :param cgst_database_path: The CGST database path
    :param species_full: The scientific name of the strain
    """
    exe = shutil.which("ariba")
    species_full = species_full.strip()
    species = species_full.split(" ")[0][0].lower() + species_full.split(" ")[1].lower()
    id_species_list = available_species_ariba(species_full.lower())
    if not id_species_list:
        log()
        log(f"No ID found for the species : {species_full}")
        log()
        exit()
    else:
        for id_species in id_species_list:
            log(f"ID ({id_species}) found for {species_full}")
    ###################################
    # Run ARIBA build species database
    section_header(f'Ariba Build database for species : {species_full}')
    explanation('Download and install the MLST scheme for a species using Ariba')
    # prepare
    if not os.path.exists(cgst_database_path):
        os.mkdir(cgst_database_path)
    mlst_database_path = os.path.join(cgst_database_path, "ariba")
    if not os.path.exists(mlst_database_path):
        os.mkdir(mlst_database_path)
    mlst_species_database_path = os.path.join(mlst_database_path, species)
    if not os.path.exists(mlst_species_database_path):
        os.mkdir(mlst_species_database_path)
    for id_species in id_species_list:
        id_species_name = id_species.lower().replace(" ", "-").replace("#", "")
        out_pubmlst_get_path = os.path.join(mlst_species_database_path, f"mlst_{id_species_name}")
        if os.path.exists(out_pubmlst_get_path):
            shutil.rmtree(out_pubmlst_get_path)
        cmd = f"{exe} pubmlstget \"{id_species}\" {out_pubmlst_get_path}"
        # launch
        process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT, executable='/bin/bash')
        log_process(process, "")
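# ariba_detection() later walks <database>/ariba/<species>/*/ref_db, the layout
# produced above by `ariba pubmlstget`. A small hypothetical helper (not part
# of CGST) making that on-disk contract explicit:
def _iter_ariba_ref_dbs(database, species):
    """Yield every ref_db directory created by build_database_ariba()."""
    species_dir = os.path.join(database, "ariba", species)
    for entry in os.listdir(species_dir):
        ref_db = os.path.join(species_dir, entry, "ref_db")
        if os.path.isdir(ref_db):
            yield ref_db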
def _merge_groups(result_file, groups_dict, strains_groups_list=None):
    """
    Merge the groups of one R result file into groups_dict
    :param result_file: The groups TSV file produced by the R script
    :param groups_dict: The {class -> [{"group", "strains"}]} dict to fill
    :param strains_groups_list: If given, collect every strain name seen
    """
    with open(result_file, "r") as groups_file:
        reader = DictReader(groups_file, delimiter="\t")
        headers = reader.fieldnames
        for row in reader:
            if strains_groups_list is not None:
                strains_groups_list.append(row[""])
            for head in headers:
                if not head:
                    continue
                if head not in groups_dict:
                    groups_dict[head] = [{"group": row[head], "strains": [row[""]]}]
                    continue
                val_list = groups_dict[head]
                pivot = True
                for group_d in val_list:
                    if row[head] == group_d["group"]:
                        pivot = False
                        if row[""] not in group_d["strains"]:
                            group_d["strains"].append(row[""])
                if pivot:
                    val_list.append({"group": row[head], "strains": [row[""]]})
                groups_dict[head] = val_list


def main_analysis(detection_dir, database, work_dir, species_full, force, threads):
    """
    This function manages the analysis of CGST
    :param detection_dir: The directory containing the MentaLiST outputs
    :param database: The CGST database path
    :param work_dir: The analysis working directory path
    :param species_full: The scientific name of the strain
    :param force: Force overwriting previous output
    :param threads: The number of threads to allocate
    """
    level_gg_dict = {"Diff_5_alleles": {"lvl": 5, "list_sample": []},
                     "Diff_10_alleles": {"lvl": 10, "list_sample": []},
                     "Diff_25_alleles": {"lvl": 25, "list_sample": []},
                     "Diff_50_alleles": {"lvl": 50, "list_sample": []},
                     "Diff_100_alleles": {"lvl": 100, "list_sample": []},
                     "Diff_150_alleles": {"lvl": 150, "list_sample": []},
                     "Diff_200_alleles": {"lvl": 200, "list_sample": []},
                     "Diff_300_alleles": {"lvl": 300, "list_sample": []}}
    species_full = species_full.strip()
    species = species_full.split(" ")[0][0].lower() + species_full.split(" ")[1].lower()
    ###################################
    if not os.path.exists(work_dir):
        os.mkdir(work_dir)
    ###################################
    # Load the detection results
    section_header(f'Load Detection Result {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    detection_result_dict = {}
    detection_result_dict_list = []
    for file in os.listdir(detection_dir):
        if "_output_final" in file:
            strain_name = file.split("_output_final")[0]
            file_path = os.path.join(detection_dir, file)
            detection_result_dict_tmp = read_output_mentalist(file_path, strain_name)
            detection_result_dict_list.append(detection_result_dict_tmp)
    if len(detection_result_dict_list) < 5:
        tool_error_log(f"Need more strains than just {len(detection_result_dict_list)}")
        exit()
    for detect_dict in detection_result_dict_list:
        for key, value_dict in detect_dict.items():
            if key in detection_result_dict:
                value_detect_dict = detection_result_dict[key]
                for sample, value in value_dict.items():
                    value_detect_dict[sample] = value
            else:
                detection_result_dict[key] = value_dict
    ##########
    # Get the differing alleles
    section_header(f'Analysis {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    detection_result_only_diff_dict = {}
    distance_dict = {}
    sample_list = []
    for key, sample_dict in detection_result_dict.items():
        if key == "Sample" or "clonal_complex" in key or "ST" in key:
            if key == "Sample":
                sample_list = [*sample_dict]
            continue
        element_list = []
        pivot = False
        for sample_name in sample_list:
            # skip loci with uncertain calls: low coverage (-), multiple
            # votes (+), absent (0) or undetermined (? / N)
            if "-" in sample_dict[sample_name] or "+" in sample_dict[sample_name] \
                    or "0" == sample_dict[sample_name] or "?" in sample_dict[sample_name] \
                    or "N" in sample_dict[sample_name]:
                pivot = True
                break
            else:
                element_list.append(sample_dict[sample_name])
        if not pivot:
            # test if the elements are all the same or not
            if len(set(element_list)) == 1:
                continue
            detection_result_only_diff_dict[key] = sample_dict
            for sample_name_1 in sample_list:
                for sample_name_2 in sample_list:
                    if sample_dict[sample_name_1] == sample_dict[sample_name_2]:
                        continue
                    value1_dict = distance_dict.setdefault(sample_name_1, {})
                    value1_dict[sample_name_2] = value1_dict.get(sample_name_2, 0) + 1
    explanation(f"Number of differing relevant loci : {len(detection_result_only_diff_dict)}")
    ###################################
    # Get the similarity matrix
    section_header(f'Get Similarity Matrix {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    sample_set_list = list(combinations(sample_list, 2))
    similarity_dict = {}
    for locus, sample_dict in detection_result_only_diff_dict.items():
        for sample_set in sample_set_list:
            if sample_dict[sample_set[0]] != sample_dict[sample_set[1]]:
                similarity_dict[sample_set] = similarity_dict.get(sample_set, 0) + 1
    ###################################
    combination_dir = os.path.join(work_dir, "combination")
    if not os.path.exists(combination_dir):
        os.mkdir(combination_dir)
    cluster_dir = os.path.join(work_dir, "cluster")
    if not os.path.exists(cluster_dir):
        os.mkdir(cluster_dir)
    phylotree_dir = os.path.join(work_dir, "phylotree")
    if not os.path.exists(phylotree_dir):
        os.mkdir(phylotree_dir)
    ###################################
    # Write the similarity matrix
    section_header(f'Write Similarity Matrix {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    similarity_file = os.path.join(combination_dir, "similarity_matrix.tsv")
    with open(similarity_file, "w") as similarity:
        writer_csv = writer(similarity, delimiter='\t')
        writer_csv.writerow([""] + sample_list)
        for sample_1 in sample_list:
            val_list = []
            for sample_2 in sample_list:
                tuple_1 = (sample_1, sample_2)
                tuple_2 = (sample_2, sample_1)
                if tuple_1 in similarity_dict:
                    val_list.append(similarity_dict[tuple_1])
                elif tuple_2 in similarity_dict:
                    val_list.append(similarity_dict[tuple_2])
                else:
                    val_list.append("0")
            writer_csv.writerow([sample_1] + val_list)
    ###################################
    # R script
    section_header(f'Execute RScript {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    ex_r = shutil.which("Rscript")
    r_script_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "r_script.R")
    cmd = f"{ex_r} {r_script_file} --wd {combination_dir}"
    log_message = f"Command used : \n {cmd}\n"
    # launch
    process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
    log_file_path = os.path.join(combination_dir, "logR.txt")
    log_process_with_output_file(process, log_message, log_file_path)
    ###########################
    # Create the group file
    section_header(f'Create Group {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    r_result_file = os.path.join(combination_dir, "groups.tsv")
    groups_dict = {}
    strains_groups_list = []
    _merge_groups(r_result_file, groups_dict, strains_groups_list)
    r_result_gap_file = os.path.join(combination_dir, "groups_gap.tsv")
    if os.path.exists(r_result_gap_file):
        _merge_groups(r_result_gap_file, groups_dict)
    else:
        log()
        log("Gap group file does not exist : ({0})".format(r_result_gap_file))
    strains_groups_list = list(set(strains_groups_list))
    groups_dict["1"] = [{"group": "1", "strains": strains_groups_list}]
    ###################################
    # Add the alleles to each group
    for key_class, list_value in groups_dict.items():
        for group in list_value:
            share_90_allele_list = []
            share_strict_allele_list = []
            for key, sample_dict in detection_result_dict.items():
                if key == "Sample" or "clonal_complex" in key or "ST" in key:
                    if "ST" in key:
                        st_tmp_list = []
                        for strain in group["strains"]:
                            st_tmp_list.append(sample_dict[strain])
                        group["name_group"] = f"{species}-{key}{':'.join(list(set(st_tmp_list)))}"
                    continue
                allele_tmp_list = []
                for strain in group["strains"]:
                    allele_tmp_list.append(sample_dict[strain])
                #######
                unique = list(Counter(allele_tmp_list).keys())
                value = list(Counter(allele_tmp_list).values())
                if len(unique) == 1:
                    # the allele is shared by every strain of the group
                    share_strict_allele_list.append({"locus": key, "allele": unique[0]})
                    share_90_allele_list.append({"locus": key, "allele": unique[0]})
                else:
                    # keep an allele shared by at least 90% of the strains
                    for i, val in enumerate(value):
                        if (val / len(group["strains"])) * 100 >= 90:
                            share_90_allele_list.append({"locus": key, "allele": unique[i]})
                            break
            group["share_strict_allele"] = share_strict_allele_list
            group["share_90_allele"] = share_90_allele_list
            group["count_strict_allele"] = len(share_strict_allele_list)
            group["count_90_allele"] = len(share_90_allele_list)
    ###################################
    # Create the level lists
    section_header(f'Create Lvl list {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    for name_lvl, lvl_dict in level_gg_dict.items():
        for sample_name, count_dict in distance_dict.items():
            group_lvl_list = []
            for sample_name_2 in sample_list:
                if sample_name == sample_name_2:
                    continue
                if sample_name_2 not in count_dict:
                    group_lvl_list.append(sample_name_2)
                    continue
                if count_dict[sample_name_2] <= lvl_dict["lvl"]:
                    group_lvl_list.append(sample_name_2)
            if group_lvl_list:
                group_lvl_list.append(sample_name)
                group_lvl_list.sort()
                # check if the group is already present
                if group_lvl_list not in lvl_dict["list_sample"]:
                    lvl_dict["list_sample"].append(group_lvl_list)
    ###################################
    # Create the multi-fasta files
    section_header(f'Create Multiple Fasta Files {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    cgmlst_database_path = os.path.join(database, "cgMLST", f"{species}")
    log()
    log("CGST MLST path : {0}".format(cgmlst_database_path))
    fasta_db_path_list = []
    for file_1 in os.listdir(cgmlst_database_path):
        if "cgmlst-org" == file_1 or "cnr" == file_1 or "other" == file_1:
            cgmlst_dir_path = os.path.join(cgmlst_database_path, file_1)
            for file_2 in os.listdir(cgmlst_dir_path):
                file_2_path = os.path.join(cgmlst_dir_path, file_2)
                if ".db" not in file_2 and "_fasta" in file_2 and os.path.isdir(file_2_path):
                    fasta_db_path_list.append(file_2_path)
    output_dir_msa = os.path.join(phylotree_dir, "msa")
    if not os.path.exists(output_dir_msa):
        os.mkdir(output_dir_msa)
    elif force and os.path.exists(output_dir_msa):
        rmtree(output_dir_msa)
        os.mkdir(output_dir_msa)
    pivot_first_loop = True
    sequence_dict = {}
    final_resume_aln_dict = {}
    output_aln_file_list = []
    pool = multiprocessing.Pool(processes=int(threads))
    list_jobs = []
    fasta_db_path_used = ""
    for locus_name, sample_dict in detection_result_only_diff_dict.items():
        fasta_file = ""
        for fasta_db_path in fasta_db_path_list:
            for file in os.listdir(fasta_db_path):
                if file == f"{locus_name}.fasta":
                    fasta_db_path_used = fasta_db_path
                    fasta_file = os.path.join(fasta_db_path, f"{locus_name}.fasta")
                    break
        output_fasta_file = os.path.join(output_dir_msa, os.path.basename(fasta_file))
        with open(output_fasta_file, "w") as output_fasta:
            record_dict = SeqIO.index(fasta_file, "fasta")
            for sample_name, number_allele in sample_dict.items():
                seq = record_dict[f"{locus_name}_{number_allele}"]
                seq.id = sample_name
                SeqIO.write(seq, output_fasta, "fasta")
            record_dict.close()
        ###################################
        # MAFFT - MSA
        output_aln_file = os.path.join(output_dir_msa, f"{os.path.basename(fasta_file).split('.')[0]}.aln")
        output_aln_file_list.append(output_aln_file)
        list_jobs.append([output_dir_msa, output_fasta_file, locus_name, output_aln_file])
    count_locus_cg = 0
    for file in os.listdir(fasta_db_path_used):
        if ".fasta" in file:
            count_locus_cg += 1
    pool.starmap(mafft, list_jobs)
    pool.close()
    pool.join()
    stop = 0
    for output_aln_file in output_aln_file_list:
        ###################################
        # Exploit the MAFFT output
        with open(output_aln_file, "r") as handle:
            record_aln_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
        # register the locus coordinates from its longest aligned sequence
        longest_id = max(record_aln_dict, key=lambda id_s: len(record_aln_dict[id_s].seq))
        locus_key = record_aln_dict[longest_id].description.split(" ")[-1]
        if pivot_first_loop:
            start = 1
            stop = len(record_aln_dict[longest_id].seq)
        else:
            start = stop + 1
            stop = start + len(record_aln_dict[longest_id].seq)
        final_resume_aln_dict[locus_key] = {"start": start, "stop": stop, "length": stop - start}
        for sample in sample_list:
            if sample in sequence_dict:
                sequence = sequence_dict[sample]
                if sample not in record_aln_dict:
                    # pad a missing sample with gaps over this locus
                    sequence = sequence + "-" * final_resume_aln_dict[locus_key]["length"]
                else:
                    sequence = sequence + record_aln_dict[sample].seq
                sequence_dict[sample] = sequence
            else:
                if sample not in record_aln_dict:
                    sequence = "-" + "-" * final_resume_aln_dict[locus_key]["length"]
                else:
                    sequence = record_aln_dict[sample].seq
                sequence_dict[sample] = sequence
        pivot_first_loop = False
    ###################################
    # Write the groups
    all_groups_alleles_file = os.path.join(combination_dir, "groups_alleles.tsv")
    with open(all_groups_alleles_file, "w") as all_groups_alleles:
        writer_group = writer(all_groups_alleles, delimiter='\t')
        writer_group.writerow(["Class", "Group", "Strains", "Count Strains", "Name Group",
                               "Alleles strict", "Count Alleles strict", "Strict Coverage CG",
                               "Alleles 90%", "Count Alleles 90%", "90% Coverage CG"])
        for key_class, list_value in groups_dict.items():
            for ele_d in list_value:
                strict_string_list = []
                for ele_strict in ele_d["share_strict_allele"]:
                    strict_string_list.append(f"{ele_strict['locus']}:{ele_strict['allele']}")
                ninety_string_list = []
                for ele_ninety in ele_d["share_90_allele"]:
                    ninety_string_list.append(f"{ele_ninety['locus']}:{ele_ninety['allele']}")
                writer_group.writerow(
                    [key_class, ele_d["group"], ele_d["strains"], len(ele_d["strains"]),
                     ele_d["name_group"], ",".join(strict_string_list),
                     ele_d["count_strict_allele"],
                     (ele_d["count_strict_allele"] / count_locus_cg) * 100,
                     ",".join(ninety_string_list), ele_d["count_90_allele"],
                     (ele_d["count_90_allele"] / count_locus_cg) * 100])
    ###################################
    # Create the MSA
    section_header(f'Create Alignment file {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    output_aln_final_file = os.path.join(phylotree_dir, "core-genome.aln")
    with open(output_aln_final_file, "w") as output_aln_final_handle:
        for sample in sample_list:
            record = SeqRecord(Seq(str(sequence_dict[sample])), id=sample, description="")
            SeqIO.write(record, output_aln_final_handle, "fasta")
    output_resume_final_file = os.path.join(phylotree_dir, "resume_core-genome.tsv")
    with open(output_resume_final_file, "w") as output_resume_final_handle:
        writer_resume = writer(output_resume_final_handle, delimiter="\t")
        writer_resume.writerow(["Gene", "Start", "Stop", "Length"])
        for gene, value_dict in final_resume_aln_dict.items():
            writer_resume.writerow([gene, value_dict["start"], value_dict["stop"], value_dict["length"]])
    ###################################
    # Gubbins
    section_header(f'Delete Recombination with Gubbins {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    ex_gubbins = shutil.which("run_gubbins")
    gubbins_work_dir = os.path.join(phylotree_dir, "gubbins")
    if not os.path.exists(gubbins_work_dir):
        os.mkdir(gubbins_work_dir)
    elif force and os.path.exists(gubbins_work_dir):
        rmtree(gubbins_work_dir)
        os.mkdir(gubbins_work_dir)
    cmd = f"{ex_gubbins} -p {gubbins_work_dir}/gubbins --threads {threads} {output_aln_final_file}"
    log_message = f"Command used : \n {cmd}\n"
    # launch
    process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
    log_file_path = os.path.join(gubbins_work_dir, "logGubbins.txt")
    log_process_with_output_file(process, log_message, log_file_path)
    ###################################
    # RAxML-NG
    section_header(f'Phylotree with RaXML-ng {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    ex_raxml_ng = shutil.which("raxml-ng")
    raxml_work_dir = os.path.join(phylotree_dir, "raxml-ng")
    raxml_prefix = os.path.join(raxml_work_dir, "raxml-ng")
    if not os.path.exists(raxml_work_dir):
        os.mkdir(raxml_work_dir)
    elif force and os.path.exists(raxml_work_dir):
        rmtree(raxml_work_dir)
        os.mkdir(raxml_work_dir)
    gubbins_snp_phylip = os.path.join(gubbins_work_dir, "gubbins.filtered_polymorphic_sites.phylip")
    cmd = f"{ex_raxml_ng} --all --msa {gubbins_snp_phylip} --prefix {raxml_prefix} --model GTR+FO+IO" \
          f" --bs-trees autoMRE --threads 4"
    log_message = f"Command used : \n {cmd}\n"
    # launch
    process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
    log_file_path = os.path.join(raxml_work_dir, "logRaXML-ng.txt")
    log_process_with_output_file(process, log_message, log_file_path)
    ###################################
    # Write the final analysis
    section_header(f'Write Final Analysis {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    all_report_file = os.path.join(work_dir, "all_report.tsv")
    with open(all_report_file, "w") as all_report:
        writer_report_all = writer(all_report, delimiter='\t')
        for key, sample_dict in detection_result_dict.items():
            if key == "Sample":
                writer_report_all.writerow(["Sample"] + [*sample_dict])
            else:
                write_list = [key]
                for sample_name in sample_list:
                    write_list.append(sample_dict[sample_name])
                writer_report_all.writerow(write_list)
    lvl_report_file = os.path.join(cluster_dir, "lvl_report.tsv")
    with open(lvl_report_file, "w") as lvl_report:
        writer_report = writer(lvl_report, delimiter='\t')
        writer_report.writerow(["Name level", "Groups"])
        for name_lvl, lvl_dict in level_gg_dict.items():
            writer_report.writerow([name_lvl, ";".join(str(v) for v in lvl_dict["list_sample"])])
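# mafft() is defined elsewhere in CGST; pool.starmap() in main_analysis() calls
# it with (output_dir_msa, output_fasta_file, locus_name, output_aln_file).
# A minimal sketch of a worker honouring that signature (illustrative only;
# assumes the `mafft` binary is on the PATH and prints the alignment to stdout):
def _mafft_sketch(output_dir_msa, fasta_file, locus_name, output_aln_file):
    exe = shutil.which("mafft")
    log_path = os.path.join(output_dir_msa, f"logMafft_{locus_name}.txt")
    with open(output_aln_file, "w") as aln_handle, open(log_path, "w") as log_handle:
        # `mafft --auto` writes the aligned fasta on stdout, progress on stderr
        process = Popen(f"{exe} --auto {fasta_file}", shell=True,
                        stdout=aln_handle, stderr=log_handle)
        process.wait()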
def build_database_mentalist(cgst_database_path, species_full, threads):
    """
    This function builds the MentaLiST cgMLST database
    :param cgst_database_path: The CGST database path
    :param species_full: The scientific name of the strain
    :param threads: The number of threads to allocate
    """
    exe = shutil.which("mentalist")
    species_full = species_full.strip()
    species = species_full.split(" ")[0][0].lower() + species_full.split(" ")[1].lower()
    if not os.path.exists(cgst_database_path):
        os.mkdir(cgst_database_path)
    cgmlst_database_path = os.path.join(cgst_database_path, "cgMLST")
    if not os.path.exists(cgmlst_database_path):
        os.mkdir(cgmlst_database_path)
    cgmlst_species_base_path = os.path.join(cgmlst_database_path, species)
    if not os.path.exists(cgmlst_species_base_path):
        os.mkdir(cgmlst_species_base_path)
    else:
        log(f"cgMLST for {species_full} already present")
    #####
    # Check if the cgMLST scheme is in the CNR repository
    url = 'https://raw.githubusercontent.com/CNRResistanceAntibiotic/core_genomes/master/reference.csv'
    # note: error_bad_lines was renamed to on_bad_lines in pandas >= 1.3
    df = pandas.read_csv(url, error_bad_lines=False)
    url_folder = ""
    sub_folder = ""
    for i, row in enumerate(df["Species"]):
        if row == species_full:
            log("Species {0} found in CNR GitHub".format(species_full))
            url_folder = "https://github.com/CNRResistanceAntibiotic/core_genomes/trunk/{0}/{1}"\
                .format(df["Name"][i], df["Sub-folder"][i])
            sub_folder = df["Sub-folder"][i]
        else:
            continue
    id_species_list = []
    if url_folder:
        cgmlst_species_database_path = os.path.join(cgmlst_species_base_path, sub_folder)
        cmd = f"svn export {url_folder} {cgmlst_species_database_path} --force"
        # launch (stderr is merged into stdout)
        process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT, executable='/bin/bash')
        while True:
            output = process.stdout.readline().decode("utf-8").rstrip()
            if output == '' and process.poll() is not None:
                break
    else:
        id_species_list = available_species_mentalist(species_full)
        if not id_species_list:
            log()
            log(f"No ID found for the species : {species_full}")
            log()
            exit()
        else:
            for id_species in id_species_list:
                log(f"ID ({id_species.lower()}) found for {species_full}")
        cgmlst_species_database_path = os.path.join(cgmlst_species_base_path, "cgmlst-org")
        if not os.path.exists(cgmlst_species_database_path):
            os.mkdir(cgmlst_species_database_path)
        else:
            log(f"cgMLST for {species_full} already present")
        ###################################
        # Run MentaLiST build species database
        section_header(f'MentaLiST Build database cgMLST for species : {species_full}')
        explanation('Download and install the cgMLST scheme for a species using MentaLiST')
        # prepare
        fasta_database_name = f"{species}_cgmlst_fasta"
        fasta_database_path = os.path.join(cgmlst_species_database_path, fasta_database_name)
        database_name = f"{species}_cgmlst.db"
        database_path = os.path.join(cgmlst_species_database_path, database_name)
        cmd = f"{exe} download_cgmlst -k 31 -o {fasta_database_path} -s {id_species_list[0].lower()}" \
              f" --db {database_path} --threads {threads}"
        log(cmd)
        # launch
        process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT, executable='/bin/bash')
        log_process(process, "")
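# available_species_mentalist() is defined elsewhere in CGST; the contract
# relied on above is "return the cgMLST scheme identifiers matching a species".
# A minimal sketch under the assumption that `mentalist list_cgmlst` prints one
# scheme per line (the subcommand name and output format are assumptions):
def _available_species_mentalist_sketch(species_full):
    exe = shutil.which("mentalist")
    process = Popen(f"{exe} list_cgmlst", shell=True, stdout=PIPE, stderr=STDOUT)
    id_list = []
    for raw_line in iter(process.stdout.readline, b""):
        line = raw_line.decode("utf-8").rstrip()
        if species_full.lower() in line.lower():
            id_list.append(line.split("\t")[0])
    process.wait()
    return id_list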
def mentalist_detection(r1, r2, database, work_dir_path, name, species_full, threads):
    """
    This function manages the MentaLiST detection
    :param r1: The R1 fastq file path
    :param r2: The R2 fastq file path
    :param database: The CGST database path
    :param work_dir_path: The working directory path
    :param name: The sample name used for the output files
    :param species_full: The scientific name of the strain
    :param threads: The number of threads to allocate
    """
    kmer_threshold = 5
    kmer_build = 31
    species_full = species_full.strip()
    species = species_full.split(" ")[0][0].lower() + species_full.split(" ")[1].lower()
    exe = shutil.which("mentalist")
    exe_parse = shutil.which("parse_novel_alleles.py")
    exe_update = shutil.which("update_fasta_db.py")
    cgmlst_database_path = os.path.join(database, "cgMLST", f"{species}")
    fasta_db_path = db_path = ""
    cg_db_dict = {}
    for file_1 in os.listdir(cgmlst_database_path):
        file_1_path = os.path.join(cgmlst_database_path, file_1)
        if os.path.isdir(file_1_path):
            cgmlst_dir_path = file_1_path
            for file_2 in os.listdir(cgmlst_dir_path):
                file_2_path = os.path.join(cgmlst_dir_path, file_2)
                if "_fasta" in file_2 and os.path.isdir(file_2_path):
                    fasta_db_path = file_2_path
                    db_name = file_2.split("_fasta")[0] + ".db"
                    db_path = os.path.join(cgmlst_dir_path, db_name)
                    cg_db_dict[file_1] = {'db_path': db_path, 'fasta_db_path': fasta_db_path}
    # run the detection on each core genome available for the species
    for name_db, value_hash in cg_db_dict.items():
        i = 1
        db_path = value_hash["db_path"]
        fasta_db_path = value_hash["fasta_db_path"]
        section_header('Launch MentaLiST:')
        explanation('Name Database : {0}'.format(name_db))
        explanation('Database Path : {0}'.format(db_path))
        explanation('Fasta Path : {0}'.format(fasta_db_path))
        output_final = os.path.join(work_dir_path, f"{name}_{name_db}_output_final")
        # if the k-mer index database does not exist yet, create it
        if not os.path.exists(db_path):
            ###################################
            # Run MentaLiST build DB
            section_header('Create MentaLiST Database')
            explanation('Before running a detection, MentaLiST needs to build its own k-mer index database')
            # prepare
            cmd = f"{exe} build_db --db {db_path} -k {kmer_build} -d {fasta_db_path} --threads {threads}"
            log_message = f"Command used : \n {cmd}"
            # launch
            log_file_path = os.path.join(work_dir_path, "logBuildDB_{0}.txt".format(name_db))
            process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT, executable='/bin/bash')
            log_process_with_output_file(process, log_message, log_file_path)
        ###################################
        # Run MentaLiST call
        section_header(f'Run MentaLiST Call : Round {i}')
        explanation('MentaLiST Detection')
        # prepare
        output = os.path.join(work_dir_path, f"output_mentalist_{name_db}_{i}")
        cmd = f"{exe} call --db {db_path} --output_votes -o {output} -1 {r1} -2 {r2} --kt {kmer_threshold}"
        log_message = f"Command used : \n {cmd}\n"
        # launch
        process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT, executable='/bin/bash')
        log_file_path = os.path.join(work_dir_path, f"logMentaLiST_{name_db}_{i}.txt")
        log_process_with_output_file(process, log_message, log_file_path)
        fasta_novel_st = os.path.join(work_dir_path, f"output_mentalist_{name_db}_{i}.novel.fa")
        # as long as novel alleles are found, fold them back into the scheme
        # and run the detection again
        while os.path.exists(fasta_novel_st) and os.stat(fasta_novel_st).st_size != 0:
            ###################################
            # Check MentaLiST output for novel and multiple votes
            section_header(f'Check MentaLiST Output : Round {i}')
            explanation('Check MentaLiST output for Novel Votes')
            check_mentalist_output(fasta_novel_st, output, fasta_db_path)
            ###################################
            # Run MentaLiST parse novel fasta
            section_header(f'Run MentaLiST parse novel fasta : Round {i}')
            explanation('MentaLiST parses the new novel variants')
            # prepare
            fasta_novel_st = os.path.join(work_dir_path, f"output_mentalist_{name_db}_{i}.novel.fa")
            result_parse_path = os.path.join(work_dir_path, f"all_novel_alleles_{name_db}_{i}")
            cmd = f"{exe_parse} -f {fasta_novel_st} -o {result_parse_path}"
            log_message = f"Command used : \n {cmd}\n"
            # launch
            process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
            log_file_path = os.path.join(work_dir_path, f"logParseNovelAlleles_{name_db}_{i}.txt")
            log_process_with_output_file(process, log_message, log_file_path)
            ###################################
            # Run MentaLiST update fasta DB
            section_header(f'Run MentaLiST update DB fasta : Round {i}')
            explanation('MentaLiST updates the fasta DB with the new novel variants')
            # prepare
            result_parse_path = os.path.join(work_dir_path, f"all_novel_alleles_{name_db}_{i}")
            cmd = f"{exe_update} -db {fasta_db_path} -n {result_parse_path}.fa"
            log_message = f"Command used : \n {cmd}\n"
            # launch
            process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
            log_file_path = os.path.join(work_dir_path, f"logCreateNewSchemeWithNovel_{name_db}_{i}.txt")
            log_process_with_output_file(process, log_message, log_file_path)
            ###################################
            # Run MentaLiST build DB
            section_header(f'Create MentaLiST Database : Round {i}')
            explanation('Before running a detection, MentaLiST needs to build its own k-mer index database')
            # prepare
            cmd = f"{exe} build_db --db {db_path} -k {kmer_build} -d {fasta_db_path} --threads {threads}"
            log_message = f"Command used : \n {cmd}"
            # remove the previous db
            os.remove(db_path)
            # launch
            log_file_path = os.path.join(work_dir_path, f"logBuildDB_{name_db}_{i}.txt")
            process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT, executable='/bin/bash')
            log_process_with_output_file(process, log_message, log_file_path)
            ###################################
            # update the round counter
            i += 1
            ###################################
            # Run MentaLiST call
            section_header(f'Run MentaLiST Call : Round {i}')
            explanation('MentaLiST Detection')
            # prepare
            output = os.path.join(work_dir_path, f"output_mentalist_{name_db}_{i}")
            cmd = f"{exe} call --db {db_path} --output_votes -o {output} -1 {r1} -2 {r2} --kt {kmer_threshold}"
            log_message = f"Command used : \n {cmd}\n"
            # launch
            process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT, executable='/bin/bash')
            log_file_path = os.path.join(work_dir_path, f"logMentaLiST_{name_db}_{i}.txt")
            log_process_with_output_file(process, log_message, log_file_path)
            fasta_novel_st = os.path.join(work_dir_path, f"output_mentalist_{name_db}_{i}.novel.fa")
        ###################################
        # Check MentaLiST output for novel and multiple votes
        section_header(f'Check MentaLiST Output : Round {i}')
        explanation('Check MentaLiST output for Novel Votes and edit false novels in the previous output')
        check_mentalist_output(fasta_novel_st, output, fasta_db_path)
        ###################################
        # Process the final result
        section_header('Process Final Output')
        # final files
        shutil.move(output, output_final)
        intermediate_file_dir = os.path.join(work_dir_path, "intermediate_files")
        if not os.path.exists(intermediate_file_dir):
            os.mkdir(intermediate_file_dir)
        for file in os.listdir(work_dir_path):
            file_path = os.path.join(work_dir_path, file)
            if not os.path.isdir(file_path):
                if not ("_output_final" in file or "statistics_" in file or "combination_result_" in file):
                    shutil.move(file_path, os.path.join(intermediate_file_dir, file))
        # add the ST to the MentaLiST output
        st_dict = add_st_to_output_mentalist(output_final, name)
        # statistics
        detection_result_dict = read_output_mentalist(output_final, name)
        count_none = count_loc = count_low_cov = count_perfect = count_multi = 0
        for locus, value_dict in detection_result_dict.items():
            if locus == "Sample" or locus == "ST" or locus == "clonal_complex" or locus in st_dict:
                continue
            elif value_dict[name] == "0?":
                count_none += 1
                count_loc += 1
            elif "-" in value_dict[name]:
                count_low_cov += 1
                count_loc += 1
            elif "+" in value_dict[name]:
                count_multi += 1
                count_loc += 1
            else:
                count_perfect += 1
                count_loc += 1
        explanation(f'Total Loci : {count_loc}')
        explanation(f'Perfect Loci : {count_perfect}'
                    f' -> {round((count_perfect / count_loc * 100), 2)}% of total loci')
        explanation(f'Multiple Loci : {count_multi}'
                    f' -> {round((count_multi / count_loc * 100), 2)}% of total loci')
        explanation(f'Low Coverage Loci : {count_low_cov}'
                    f' -> {round((count_low_cov / count_loc * 100), 2)}% of total loci')
        explanation(f'None Loci : {count_none}'
                    f' -> {round((count_none / count_loc * 100), 2)}% of total loci')
        stats_file = os.path.join(work_dir_path, "statistics_{0}.tsv".format(name_db))
        with open(stats_file, "w") as output_file:
            csv_writer = writer(output_file, delimiter="\t")
            csv_writer.writerow(["Total Locus", count_loc])
            csv_writer.writerow(["Resume", "Count", "Percentage on Total locus"])
            csv_writer.writerow(["Perfect Locus", count_perfect, (count_perfect / count_loc * 100)])
            csv_writer.writerow(["Multiple Locus", count_multi, (count_multi / count_loc * 100)])
            csv_writer.writerow(["Low Coverage Locus", count_low_cov, (count_low_cov / count_loc * 100)])
            csv_writer.writerow(["None Locus", count_none, (count_none / count_loc * 100)])
        # load the known combinations
        known_comb_path = os.path.join(os.path.dirname(db_path), "combination_list.tsv")
        known_comb_dict = {}
        if os.path.exists(known_comb_path):
            with open(known_comb_path, "r") as file:
                reader = DictReader(file, delimiter="\t")
                for row in reader:
                    known_comb_dict[row["Name"]] = row["Combination"]
        else:
            explanation(f"Combination file not found for {species_full} at {known_comb_path}")
        # search for the known combinations
        final_file_path = os.path.join(work_dir_path, f"{name}_{name_db}_output_final")
        detection_result_dict = read_output_mentalist(final_file_path, name)
        comb_strain_list = []
        for locus, sample_dict in detection_result_dict.items():
            if locus == "Sample" or locus == "ST" or locus == "clonal_complex":
                continue
            comb_strain_list.append(f"{locus}:{sample_dict[name]}")
        if known_comb_dict:
            # search for each known combination
            combine_result_file = os.path.join(work_dir_path, f"combination_result_{name_db}.tsv")
            with open(combine_result_file, "w") as combine_file:
                csv_writer = writer(combine_file, delimiter="\t")
                csv_writer.writerow(["Combination Name", "Count Reference Locus",
                                     "Count Sample Locus", "Ratio", "Comment"])
                for name_comb, combs in known_comb_dict.items():
                    combination_known = combs.split(",")
                    result = list(set(comb_strain_list).intersection(combination_known))
                    ratio = round((len(result) / len(combination_known)) * 100, 2)
                    if ratio == 100:
                        comment = "Perfect"
                    elif ratio >= 98:
                        comment = "Very Close"
                    elif ratio >= 90:
                        comment = "Close"
                    elif ratio >= 80:
                        comment = "Like"
                    elif ratio >= 70:
                        comment = "Close Like"
                    else:
                        comment = "Not relevant"
                    csv_writer.writerow([name_comb, len(combination_known), len(result), ratio, comment])
        section_header('Finish {0} Coregenome Analysis'.format(name_db))
    section_header('Finish ALL CoreGenome Analysis')
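# read_output_mentalist() is provided by CGST's own modules; every caller above
# relies on it returning {column -> {sample_name -> value}} for a one-sample
# MentaLiST table. An illustrative sketch of that contract (not the real code):
def _read_output_mentalist_sketch(file_path, strain_name):
    result = {}
    with open(file_path, "r") as handle:
        reader = DictReader(handle, delimiter="\t")
        row = next(reader)  # one row per sample is assumed here
        for column, value in row.items():
            result[column] = {strain_name: value}
    return result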