Example No. 1
def ariba_detection(r1, r2, database, work_dir_path, name, species_full):
    """
    This function manages the ARIBA detection
    :param r1: The R1 fastq file path
    :param r2: The R2 fastq file path
    :param database: The CGST database path
    :param work_dir_path: The working directory path
    :param name: The sample name used to prefix the output files
    :param species_full: The scientific name of the strain
    """
    exe = shutil.which("ariba")
    species_full = species_full.strip()
    species = species_full.split(" ")[0][0].lower() + species_full.split(
        " ")[1].lower()
    db_path_species = os.path.join(database, "ariba", f"{species}")
    reference_dict = get_reference_ariba_mlst()

    for file in os.listdir(db_path_species):
        file_path = os.path.join(db_path_species, file)
        if os.path.isdir(file_path):
            db_path = os.path.join(file_path, "ref_db")
            ###################################
            # Run Ariba call
            section_header('Run Ariba Call')
            explanation('Ariba Detection ')
            # prepare
            output = os.path.join(
                work_dir_path, f"{name}_output_ariba_{reference_dict[file]}")
            cmd = f"{exe} run {db_path} {r1} {r2} {output}"
            log_message = f"Command used : \n {cmd}\n"
            # launch
            process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
            log_file_path = os.path.join(
                work_dir_path, f"logAriba_{reference_dict[file]}.txt")
            log_process_with_output_file(process, log_message, log_file_path)
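

# A minimal, self-contained sketch of the species abbreviation pattern used above
# ("Escherichia coli" -> "ecoli"); the helper name is hypothetical and not part of CGST.
def abbreviate_species(species_full):
    """Join the first letter of the genus to the lower-cased species epithet."""
    genus, epithet = species_full.strip().split(" ")[:2]
    return genus[0].lower() + epithet.lower()


# abbreviate_species("Escherichia coli") == "ecoli"
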
def available_species_ariba(species):
    """
    This function displays the available ARIBA MLST schemas
    :param species: The scientific name of the strain
    :return: the list of matching schema IDs
    """
    exe = shutil.which("ariba")
    ###################################
    # Run Ariba Available Species
    section_header('Ariba Available MLST Schemas')
    explanation('Display the MLST schemas via the Ariba function that searches the PubMLST database')
    # prepare
    cmd = f"{exe} pubmlstspecies"
    # launch
    process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
    id_list = log_process(process, species)
    return id_list
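

# A minimal sketch of the Popen pattern used throughout these functions: launch a
# shell command with stderr merged into stdout and collect the decoded output lines.
# The wrapper name and the example command are hypothetical, for illustration only.
from subprocess import Popen, PIPE, STDOUT


def run_and_capture(cmd):
    process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
    lines = [raw.decode("utf-8").rstrip() for raw in process.stdout]
    return_code = process.wait()
    return return_code, lines


# return_code, lines = run_and_capture("ariba version")
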
Example No. 3
def check_for_required_tools():
    """
    This function checks the dependencies
    """
    section_header('Checking requirements')
    explanation(
        'CGST requires MentaLiST to run, so it checks for this tool now.')
    mentalist_path, mentalist_version, mentalist_status = mentalist_path_and_version(
        'mentalist')
    if mentalist_status == 'good':
        log(f'MentaLiST found: {mentalist_path} (v{mentalist_version})')
    elif mentalist_status == 'not found':
        os.sys.exit(
            'Error: MentaLiST not found - make sure it is in your PATH before running '
            'CGST')
    elif mentalist_status == 'bad':
        os.sys.exit('Error: unable to determine MentaLiST version')
    log()
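

# A minimal sketch of a PATH check with shutil.which, in the spirit of the MentaLiST
# check above; the wrapper name is hypothetical and the error wording is illustrative.
import shutil
import sys


def require_tool(tool_name):
    path = shutil.which(tool_name)
    if path is None:
        sys.exit(f"Error: {tool_name} not found - make sure it is in your PATH")
    return path
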
def build_database_ariba(cgst_database_path, species_full):
    """
    This function builds the ARIBA MLST database
    :param cgst_database_path: The CGST database path
    :param species_full: The scientific name of the strain
    """
    exe = shutil.which("ariba")
    species_full = species_full.strip()
    species = species_full.split(" ")[0][0].lower() + species_full.split(" ")[1].lower()
    id_species_list = available_species_ariba(species_full.lower())
    if not id_species_list:
        log()
        log(f"Any ID found for the species : {species_full}")
        log()
        exit()
    else:
        for id_species in id_species_list:
            log(f"ID ({id_species}) found for {species_full} ")
    ###################################
    # Run Ariba Build Species Database
    section_header(f'Ariba Build database for species : {species_full}')
    explanation('Download and install the MLST schema for a species using the Ariba function')
    # prepare

    if not os.path.exists(cgst_database_path):
        os.mkdir(cgst_database_path)
    mlst_database_path = os.path.join(cgst_database_path, "ariba")
    if not os.path.exists(mlst_database_path):
        os.mkdir(mlst_database_path)
    mlst_species_database_path = os.path.join(mlst_database_path, species)
    if not os.path.exists(mlst_species_database_path):
        os.mkdir(mlst_species_database_path)

    for id_species in id_species_list:
        id_species_name = id_species.lower().replace(" ", "-").replace("#", "")
        out_pubmlst_get_path = os.path.join(mlst_species_database_path, f"mlst_{id_species_name}")

        if os.path.exists(out_pubmlst_get_path):
            shutil.rmtree(out_pubmlst_get_path)

        cmd = f"{exe} pubmlstget \"{id_species}\" {out_pubmlst_get_path}"
        # launch
        process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT, executable='/bin/bash')
        log_process(process, "")
Example No. 5
def main_analysis(detection_dir, database, work_dir, species_full, force, threads):
    """
    This function manages the CGST analysis
    :param detection_dir: The directory containing the MentaLiST output
    :param database: The CGST database path
    :param work_dir: The analysis working directory path
    :param species_full: The scientific name of the strain
    :param force: Force overwriting of existing output directories
    :param threads: The number of threads to allocate
    """
    level_gg_dict = {"Diff_5_alleles": {"lvl": 5, "list_sample": []},
                     "Diff_10_alleles": {"lvl": 10, "list_sample": []},
                     "Diff_25_alleles": {"lvl": 25, "list_sample": []},
                     "Diff_50_alleles": {"lvl": 50, "list_sample": []},
                     "Diff_100_alleles": {"lvl": 100, "list_sample": []},
                     "Diff_150_alleles": {"lvl": 150, "list_sample": []},
                     "Diff_200_alleles": {"lvl": 200, "list_sample": []},
                     "Diff_300_alleles": {"lvl": 300, "list_sample": []}}
    species_full = species_full.strip()
    species = species_full.split(" ")[0][0].lower() + species_full.split(" ")[1].lower()

    ###################################

    if not os.path.exists(work_dir):
        os.mkdir(work_dir)

    ###################################
    # Load Detection result
    section_header(f'Load Detection Result {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    detection_result_dict = {}
    detection_result_dict_list = []
    for file in os.listdir(detection_dir):
        if "_output_final" in file:
            strain_name = file.split("_output_final")[0]
            file_path = os.path.join(detection_dir, file)
            detection_result_dict_tmp = read_output_mentalist(file_path, strain_name)
            detection_result_dict_list.append(detection_result_dict_tmp)

    if len(detection_result_dict_list) < 5:
        tool_error_log(f"Need more strains than just {len(detection_result_dict_list)}")
        exit()

    for detect_dict in detection_result_dict_list:
        for key, value_dict in detect_dict.items():
            if key in detection_result_dict:
                value_detect_dict = detection_result_dict[key]
                for sample, value in value_dict.items():
                    value_detect_dict[sample] = value
            else:
                detection_result_dict[key] = value_dict
    ##########
    # Get Difference alleles
    section_header(f'Analysis {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    detection_result_only_diff_dict = {}
    distance_dict = {}
    sample_list = []
    for key, sample_dict in detection_result_dict.items():
        if key == "Sample" or "clonal_complex" in key or "ST" in key:
            if key == "Sample":
                sample_list = [*sample_dict]
            continue
        else:
            element_list = []
            pivot = False
            for sample_name in sample_list:
                if "-" in sample_dict[sample_name] or "+" in sample_dict[sample_name] \
                        or "0" == sample_dict[sample_name] or "?" in sample_dict[sample_name]\
                        or "N" in sample_dict[sample_name]:
                    pivot = True
                    break
                else:
                    element_list.append(sample_dict[sample_name])
            if not pivot:
                # test whether all elements are the same
                if len(set(element_list)) == 1:
                    continue
                else:
                    detection_result_only_diff_dict[key] = sample_dict
                    for sample_name_1 in sample_list:
                        for sample_name_2 in sample_list:

                            if sample_dict[sample_name_1] == sample_dict[sample_name_2]:
                                continue
                            else:
                                if sample_name_1 in distance_dict:
                                    value1_dict = distance_dict[sample_name_1]
                                    if sample_name_2 in value1_dict:
                                        value1_count = value1_dict[sample_name_2]
                                        value1_dict[sample_name_2] = value1_count + 1
                                    else:
                                        value1_dict[sample_name_2] = 1

                                else:
                                    distance_dict[sample_name_1] = {sample_name_2: 1}
    explanation(f"Number of different relevant locus : {len(detection_result_only_diff_dict)}")
    ###################################
    # Get Similarity Matrix
    section_header(f'Get Similarity Matrix {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    sample_set_list = list(combinations(sample_list, 2))
    similarity_dict = {}
    for locus, sample_dict in detection_result_only_diff_dict.items():
        for sample_set in sample_set_list:
            if sample_dict[sample_set[0]] == sample_dict[sample_set[1]]:
                continue
            else:
                if sample_set in similarity_dict:
                    value_similarity = similarity_dict[sample_set]
                    value_similarity += 1
                    similarity_dict[sample_set] = value_similarity
                else:
                    similarity_dict[sample_set] = 1

    ###################################

    combination_dir = os.path.join(work_dir, "combination")
    if not os.path.exists(combination_dir):
        os.mkdir(combination_dir)

    cluster_dir = os.path.join(work_dir, "cluster")
    if not os.path.exists(cluster_dir):
        os.mkdir(cluster_dir)

    phylotree_dir = os.path.join(work_dir, "phylotree")
    if not os.path.exists(phylotree_dir):
        os.mkdir(phylotree_dir)

    ###################################
    # Write Similarity Matrix
    section_header(f'Write Similarity Matrix {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    similarity_file = os.path.join(combination_dir, "similarity_matrix.tsv")
    with open(similarity_file, "w") as similarity:
        writer_csv = writer(similarity, delimiter='\t')
        writer_csv.writerow([""] + sample_list)
        for sample_1 in sample_list:
            val_list = []
            for sample_2 in sample_list:
                tuple_1 = (sample_1, sample_2)
                tuple_2 = (sample_2, sample_1)
                if tuple_1 in similarity_dict:
                    val_list.append(similarity_dict[tuple_1])
                    continue
                if tuple_2 in similarity_dict:
                    val_list.append(similarity_dict[tuple_2])
                    continue
                val_list.append("0")
            writer_csv.writerow([sample_1] + val_list)
    ###################################
    # R Script
    section_header(f'Execute RScript {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    ex_r = shutil.which("Rscript")
    r_script_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), "r_script.R")
    cmd = f"{ex_r} {r_script_file} --wd {combination_dir}"
    log_message = f"Command used : \n {cmd}\n"
    # launch
    process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
    log_file_path = os.path.join(combination_dir, "logR.txt")
    log_process_with_output_file(process, log_message, log_file_path)
    ###########################
    # Create Group File
    section_header(f'Create Group {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    r_result_file = os.path.join(combination_dir, "groups.tsv")

    groups_dict = {}
    strains_groups_list = []
    with open(r_result_file, "r") as groups_file:
        reader = DictReader(groups_file, delimiter="\t")
        headers = reader.fieldnames
        for row in reader:
            strains_groups_list.append(row[""])
            for head in headers:
                if head:
                    if head in groups_dict:
                        val_list = groups_dict[head]
                        pivot = True
                        for i, group_d in enumerate(val_list):
                            if row[head] == group_d["group"]:
                                pivot = False
                                if row[""] not in group_d["strains"]:
                                    str_list = group_d["strains"]
                                    str_list.append(row[""])
                                    group_d["strains"] = str_list
                                    continue
                                else:
                                    continue
                        if pivot:
                            val_list.append({"group": row[head], "strains": [row[""]]})
                            continue
                        groups_dict[head] = val_list
                        continue
                    else:
                        groups_dict[head] = [{"group": row[head], "strains": [row[""]]}]
                        continue

    r_result_gap_file = os.path.join(combination_dir, "groups_gap.tsv")
    if os.path.exists(r_result_gap_file):
        with open(r_result_gap_file, "r") as groups_file:
            reader = DictReader(groups_file, delimiter="\t")
            headers = reader.fieldnames
            for row in reader:
                for head in headers:
                    if head:
                        if head in groups_dict:
                            val_list = groups_dict[head]
                            pivot = True
                            for i, group_d in enumerate(val_list):
                                if row[head] == group_d["group"]:
                                    pivot = False
                                    if row[""] not in group_d["strains"]:
                                        str_list = group_d["strains"]
                                        str_list.append(row[""])
                                        group_d["strains"] = str_list
                                        continue
                                    else:
                                        continue
                            if pivot:
                                val_list.append({"group": row[head], "strains": [row[""]]})
                                continue
                            groups_dict[head] = val_list
                            continue
                        else:
                            groups_dict[head] = [{"group": row[head], "strains": [row[""]]}]
                            continue
    else:
        log()
        log("File of gap group not exist : ({0})".format(r_result_gap_file))

    strains_groups_list = list(set(strains_groups_list))
    groups_dict["1"] = [{"group": "1", "strains": strains_groups_list}]

    ###################################
    # Add allele to group
    for key_class, list_value in groups_dict.items():
        for group in list_value:
            share_90_allele_list = []
            share_strict_allele_list = []
            for key, sample_dict in detection_result_dict.items():
                if key == "Sample" or "clonal_complex" in key or "ST" in key:
                    if "ST" in key:
                        st_tmp_list = []
                        for strain in group["strains"]:
                            st_tmp_list.append(sample_dict[strain])
                        group["name_group"] = f"{species}-{key}{':'.join(list(set(st_tmp_list)))}"
                        continue
                else:
                    allele_tmp_list = []
                    for strain in group["strains"]:
                        allele_tmp_list.append(sample_dict[strain])
                    #######
                    unique = list(Counter(allele_tmp_list).keys())  # like list(set(allele_tmp_list)) but keeps first-occurrence order

                    value = list(Counter(allele_tmp_list).values())
                    if len(unique) == 1:
                        share_strict_allele_list.append({"locus": key, "allele": unique[0]})
                        share_90_allele_list.append({"locus": key, "allele": unique[0]})
                    else:
                        for i, val in enumerate(value):
                            if (val / len(group["strains"])) * 100 >= 90:
                                share_90_allele_list.append({"locus": key, "allele": unique[i]})
                                break
                            else:
                                continue
                        continue
            group["share_strict_allele"] = share_strict_allele_list
            group["share_90_allele"] = share_90_allele_list
            group["count_strict_allele"] = len(share_strict_allele_list)
            group["count_90_allele"] = len(share_90_allele_list)
    ###################################
    # Create level lists
    section_header(f'Create Lvl list {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    for name_lvl, lvl_dict in level_gg_dict.items():
        for sample_name, count_dict in distance_dict.items():
            group_lvl_list = []
            for sample_name_2 in sample_list:
                if sample_name == sample_name_2:
                    continue
                if sample_name_2 not in count_dict:
                    group_lvl_list.append(sample_name_2)
                    continue
                if count_dict[sample_name_2] <= lvl_dict["lvl"]:
                    group_lvl_list.append(sample_name_2)
                    continue
            if group_lvl_list:
                group_lvl_list.append(sample_name)
                group_lvl_list.sort()
                if "list_sample" in lvl_dict:
                    list_tmp = lvl_dict["list_sample"]
                    # check if the list is already present
                    if group_lvl_list in list_tmp:
                        continue
                    list_tmp.append(group_lvl_list)
                    lvl_dict["list_sample"] = list_tmp
                else:
                    lvl_dict["list_sample"] = [group_lvl_list]
    ###################################
    # Create Multi fasta
    section_header(f'Create Multiple Fasta Files {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

    cgmlst_database_path = os.path.join(database, "cgMLST", f"{species}")

    log()
    log("CGST MLST path : {0}".format(cgmlst_database_path))

    fasta_db_path_list = []

    for file_1 in os.listdir(cgmlst_database_path):
        if "cgmlst-org" == file_1 or "cnr" == file_1 or "other" == file_1:
            cgmlst_dir_path = os.path.join(cgmlst_database_path, file_1)
            for file_2 in os.listdir(cgmlst_dir_path):
                file_2_path = os.path.join(cgmlst_dir_path, file_2)
                if ".db" not in file_2 and "_fasta" in file_2 and os.path.isdir(file_2_path):
                    fasta_db_path_list.append(file_2_path)

    output_dir_msa = os.path.join(phylotree_dir, "msa")
    if not os.path.exists(output_dir_msa):
        os.mkdir(output_dir_msa)
    elif force and os.path.exists(output_dir_msa):
        rmtree(output_dir_msa)
        os.mkdir(output_dir_msa)

    pivot_first_loop = True
    sequence_dict = {}
    final_resume_aln_dict = {}

    output_aln_file_list = []
    pool = multiprocessing.Pool(processes=int(threads))
    list_jobs = []
    fasta_db_path_used = ""
    for locus_name, sample_dict in detection_result_only_diff_dict.items():
        fasta_file = ""
        for fasta_db_path in fasta_db_path_list:
            for file in os.listdir(fasta_db_path):
                if file == f"{locus_name}.fasta":
                    fasta_db_path_used = fasta_db_path
                    fasta_file = os.path.join(fasta_db_path, f"{locus_name}.fasta")
                    break
        output_fasta_file = os.path.join(output_dir_msa, os.path.basename(fasta_file))
        with open(output_fasta_file, "w") as output_fasta:
            record_dict = SeqIO.index(fasta_file, "fasta")
            for sample_name, number_allele in sample_dict.items():
                seq = record_dict[f"{locus_name}_{number_allele}"]
                seq.id = sample_name
                SeqIO.write(seq, output_fasta, "fasta")
            record_dict.close()
        ###################################
        # MAFFT - MSA
        output_aln_file = os.path.join(output_dir_msa, f"{os.path.basename(fasta_file).split('.')[0]}.aln")
        output_aln_file_list.append(output_aln_file)
        list_jobs.append([output_dir_msa, output_fasta_file, locus_name, output_aln_file])

    count_locus_cg = 0
    for file in os.listdir(fasta_db_path_used):
        if ".fasta" in file:
            count_locus_cg += 1

    pool.starmap(mafft, list_jobs)
    stop = 0
    for output_aln_file in output_aln_file_list:
        ###################################
        # EXPLOIT OUTPUT MAFFT
        with open(output_aln_file, "r") as handle:
            record_aln_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
        for id_seq in sorted(record_aln_dict, key=lambda id_s: len(record_aln_dict[id_s].seq), reverse=True):
            if pivot_first_loop:
                start = 1
                stop = len(record_aln_dict[id_seq].seq)
            else:
                start = stop + 1
                stop = start + len(record_aln_dict[id_seq].seq)
            current_locus = record_aln_dict[id_seq].description.split(" ")[-1]
            final_resume_aln_dict[current_locus] = {"start": start, "stop": stop,
                                                    "length": stop - start}
            break
        for sample in sample_list:
            if sample in sequence_dict:
                sequence = sequence_dict[sample]
                if sample not in record_aln_dict:
                    sequence = sequence + "-" * final_resume_aln_dict[record_aln_dict[sample].description.split(" ")[-1]]["length"]
                else:
                    sequence = sequence + record_aln_dict[sample].seq
                sequence_dict[sample] = sequence
                continue
            else:
                if sample not in record_aln_dict:
                    sequence = "-" + "-" * final_resume_aln_dict[record_aln_dict[sample].description.split(" ")[-1]]["length"]
                else:
                    sequence = record_aln_dict[sample].seq
                sequence_dict[sample] = sequence
                continue
        pivot_first_loop = False
    ###################################
    # Write Groups
    all_groups_alleles_file = os.path.join(combination_dir, "groups_alleles.tsv")
    with open(all_groups_alleles_file, "w") as all_groups_alleles:
        writer_group = writer(all_groups_alleles, delimiter='\t')
        writer_group.writerow(
            ["Class", "Group", "Strains", "Count Strains", "Name Group", "Alleles strict", "Count Alleles strict",
             "Strict Coverage CG", "Alleles 90%", "Count Alleles 90%", "90% Coverage CG"])
        for key_class, list_value in groups_dict.items():
            for ele_d in list_value:
                strict_string_list = []
                for ele_strict in ele_d["share_strict_allele"]:
                    strict_string_list.append(f"{ele_strict['locus']}:{ele_strict['allele']}")
                ninety_string_list = []
                for ele_ninety in ele_d["share_90_allele"]:
                    ninety_string_list.append(f"{ele_ninety['locus']}:{ele_ninety['allele']}")
                writer_group.writerow(
                    [key_class, ele_d["group"], ele_d["strains"], len(ele_d["strains"]), ele_d["name_group"],
                     ",".join(strict_string_list), ele_d["count_strict_allele"],
                     (ele_d["count_strict_allele"] / count_locus_cg) * 100, ",".join(ninety_string_list),
                     ele_d["count_90_allele"], (ele_d["count_90_allele"] / count_locus_cg) * 100])

    ###################################
    # Create MSA
    section_header(f'Create Alignment file {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    output_aln_final_file = os.path.join(phylotree_dir, "core-genome.aln")
    with open(output_aln_final_file, "w") as output_aln_final_handle:
        for sample in sample_list:
            record = SeqRecord(sequence_dict[sample], id=sample)
            SeqIO.write(record, output_aln_final_handle, "fasta")

    output_resume_final_file = os.path.join(phylotree_dir, "resume_core-genome.tsv")
    with open(output_resume_final_file, "w") as output_resume_final_handle:
        writer_resume = writer(output_resume_final_handle, delimiter="\t")
        writer_resume.writerow(["Gene", "Start", "Stop", "Length"])
        for gene, value_dict in final_resume_aln_dict.items():
            writer_resume.writerow([gene, value_dict["start"], value_dict["stop"], value_dict["length"]])
    ###################################
    # Gubbins
    section_header(f'Delete Recombination with Gubbins {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

    ex_gubbins = shutil.which("run_gubbins")

    gubbins_work_dir = os.path.join(phylotree_dir, "gubbins")

    if not os.path.exists(gubbins_work_dir):
        os.mkdir(gubbins_work_dir)
    elif force and os.path.exists(gubbins_work_dir):
        rmtree(gubbins_work_dir)
        os.mkdir(gubbins_work_dir)
    cmd = f"{ex_gubbins} -p {gubbins_work_dir}/gubbins --threads {threads} {output_aln_final_file}"
    log_message = f"Command used : \n {cmd}\n"
    # launch
    process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
    log_file_path = os.path.join(gubbins_work_dir, "logGubbins.txt")
    log_process_with_output_file(process, log_message, log_file_path)
    ###################################
    # RAXML-ng
    section_header(f'Phylotree with RaXML-ng {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    ex_raxml_ng = shutil.which("raxml-ng")
    raxml_work_dir = os.path.join(phylotree_dir, "raxml-ng")
    raxml_prefix = os.path.join(raxml_work_dir, "raxml-ng")

    if not os.path.exists(raxml_work_dir):
        os.mkdir(raxml_work_dir)
    elif force and os.path.exists(raxml_work_dir):
        rmtree(raxml_work_dir)
        os.mkdir(raxml_work_dir)

    gubbins_snp_phylip = os.path.join(gubbins_work_dir, "gubbins.filtered_polymorphic_sites.phylip")
    cmd = f"{ex_raxml_ng} --all --msa {gubbins_snp_phylip} --prefix {raxml_prefix} --model GTR+FO+IO" \
          f" --bs-trees autoMRE --threads 4"
    log_message = f"Command used : \n {cmd}\n"
    # launch
    process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
    log_file_path = os.path.join(raxml_work_dir, "logRaXML-ng.txt")
    log_process_with_output_file(process, log_message, log_file_path)

    ###################################
    # Write Analysis
    section_header(f'Write Final Analysis {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
    all_report_file = os.path.join(work_dir, "all_report.tsv")
    with open(all_report_file, "w") as all_report:
        writer_report_all = writer(all_report, delimiter='\t')
        for key, sample_dict in detection_result_dict.items():
            if key == "Sample":
                writer_report_all.writerow(["Sample"] + [*sample_dict])
                pass
            else:
                write_list = [key]
                for sample_name in sample_list:
                    write_list.append(sample_dict[sample_name])
                writer_report_all.writerow(write_list)
    lvl_report_file = os.path.join(cluster_dir, "lvl_report.tsv")
    with open(lvl_report_file, "w") as lvl_report:
        writer_report = writer(lvl_report, delimiter='\t')
        writer_report.writerow(["Name level", "Groups"])
        for name_lvl, lvl_dict in level_gg_dict.items():
            writer_report.writerow([name_lvl, ";".join(str(v) for v in lvl_dict["list_sample"])])
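

# A self-contained sketch of the pairwise allele-difference count computed in
# main_analysis: for every pair of samples, count the loci whose alleles differ.
# The profiles below are illustrative only.
from itertools import combinations

profiles = {
    "sampleA": {"locus1": "1", "locus2": "3", "locus3": "7"},
    "sampleB": {"locus1": "1", "locus2": "4", "locus3": "7"},
    "sampleC": {"locus1": "2", "locus2": "4", "locus3": "7"},
}

differences = {}
for sample_1, sample_2 in combinations(profiles, 2):
    diff = sum(1 for locus in profiles[sample_1]
               if profiles[sample_1][locus] != profiles[sample_2][locus])
    differences[(sample_1, sample_2)] = diff

# differences == {("sampleA", "sampleB"): 1, ("sampleA", "sampleC"): 2, ("sampleB", "sampleC"): 1}
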
def build_database_mentalist(cgst_database_path, species_full, threads):
    """
    This function builds the MentaLiST cgMLST database
    :param cgst_database_path: The CGST database path
    :param species_full: The scientific name of the strain
    :param threads: The number of threads to allocate
    """
    exe = shutil.which("mentalist")
    species_full = species_full.strip()
    species = species_full.split(" ")[0][0].lower() + species_full.split(" ")[1].lower()

    if not os.path.exists(cgst_database_path):
        os.mkdir(cgst_database_path)
    cgmlst_database_path = os.path.join(cgst_database_path, "cgMLST")
    if not os.path.exists(cgmlst_database_path):
        os.mkdir(cgmlst_database_path)

    cgmlst_species_base_path = os.path.join(cgmlst_database_path, species)
    if not os.path.exists(cgmlst_species_base_path):
        os.mkdir(cgmlst_species_base_path)
    else:
        log(f"cgMLST for {species_full} already present")

    #####
    # Check if cgmlst in CNR repo
    url = 'https://raw.githubusercontent.com/CNRResistanceAntibiotic/core_genomes/master/reference.csv'
    df = pandas.read_csv(url, error_bad_lines=False)

    url_folder = ""
    sub_folder = ""
    for i, row in enumerate(df["Species"]):
        if row == species_full:
            log("Species {0} found in CNR GitHub".format(species_full))
            url_folder = "https://github.com/CNRResistanceAntibiotic/core_genomes/trunk/{0}/{1}"\
                .format(df["Name"][i], df["Sub-folder"][i])
            sub_folder = df["Sub-folder"][i]
        else:
            continue
    id_species_list = []
    if url_folder:
        cgmlst_species_database_path = os.path.join(cgmlst_species_base_path, sub_folder)
        cmd = f"svn export {url_folder} {cgmlst_species_database_path} --force"

        # launch
        process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT, executable='/bin/bash')
        while True:
            if process.stdout is not None:
                output = process.stdout.readline().decode("utf-8").rstrip()
            if process.stderr is not None:
                error = process.stderr.readline().decode("utf-8").rstrip()
            if output == '' and process.poll() is not None:
                break

    else:
        id_species_list = available_species_mentalist(species_full)
        if not id_species_list:
            log()
            log(f"Any ID found for the species : {species_full}")
            log()
            exit()
        else:
            for id_species in id_species_list:
                log(f"ID ({id_species.lower()}) found for {species_full} ")

        cgmlst_species_database_path = os.path.join(cgmlst_species_base_path, "cgmlst-org")

        if not os.path.exists(cgmlst_species_database_path):
            os.mkdir(cgmlst_species_database_path)
        else:
            log(f"cgMLST for {species_full} already present")

        ###################################
        # Run MentaLiST Build Species Database
        section_header(f'MentaLiST Build database cgMLST for species : {species_full}')
        explanation('Download and install the cgMLST schema for a species using the MentaLiST function')
        # prepare

        fasta_database_name = f"{species}_cgmlst_fasta"
        fasta_database_path = os.path.join(cgmlst_species_database_path, fasta_database_name)

        database_name = f"{species}_cgmlst.db"
        database_path = os.path.join(cgmlst_species_database_path, database_name)

        cmd = f"{exe} download_cgmlst -k 31 -o {fasta_database_path} -s {id_species_list[0].lower()} --db {database_path}" \
              f" --threads {threads}"

        print(cmd)

        # launch
        process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT, executable='/bin/bash')
        log_process(process, "")
Example No. 7
def mentalist_detection(r1, r2, database, work_dir_path, name, species_full,
                        threads):
    """
    This function manages the MentaLiST detection
    :param r1: The R1 fastq file path
    :param r2: The R2 fastq file path
    :param database: The CGST database path
    :param work_dir_path: The working directory path
    :param name: The sample name used to prefix the output files
    :param species_full: The scientific name of the strain
    :param threads: The number of threads to allocate
    """
    kmer_threshold = 5
    kmer_build = 31

    species_full = species_full.strip()
    species = species_full.split(" ")[0][0].lower() + species_full.split(
        " ")[1].lower()

    exe = shutil.which("mentalist")
    exe_parse = shutil.which("parse_novel_alleles.py")
    exe_update = shutil.which("update_fasta_db.py")

    cgmlst_database_path = os.path.join(database, "cgMLST", f"{species}")
    fasta_db_path = db_path = ""
    cg_db_dict = {}

    for file_1 in os.listdir(cgmlst_database_path):
        file_1_path = os.path.join(cgmlst_database_path, file_1)
        if os.path.isdir(file_1_path):
            cgmlst_dir_path = file_1_path
            for file_2 in os.listdir(cgmlst_dir_path):
                file_2_path = os.path.join(cgmlst_dir_path, file_2)
                if "_fasta" in file_2 and os.path.isdir(file_2_path):
                    fasta_db_path = file_2_path
                    db_name = file_2.split("_fasta")[0] + ".db"
                    db_path = os.path.join(cgmlst_dir_path, db_name)
        cg_db_dict[file_1] = {
            'db_path': db_path,
            'fasta_db_path': fasta_db_path
        }

    # run detection on each core genome available for the species
    for name_db, value_hash in cg_db_dict.items():
        i = 1
        db_path = value_hash["db_path"]
        fasta_db_path = value_hash["fasta_db_path"]
        section_header('Launch MentaLiST:')
        explanation('Name Database  : {0}'.format(name_db))
        explanation('Database Path : {0}'.format(db_path))
        explanation('Fasta Path : {0}'.format(fasta_db_path))
        output_final = os.path.join(work_dir_path,
                                    f"{name}_{name_db}_output_final")

        # If the kmer-index database does not exist -> create it
        if not os.path.exists(db_path):
            ###################################
            # Run MentaLiST Build DB
            section_header('Create MentaLiST Database')
            explanation(
                'Before running a detection, MentaLiST needs to construct its own kmer-index database'
            )
            # prepare
            cmd = f"{exe} build_db --db {db_path} -k {kmer_build} -d {fasta_db_path} --threads {threads}"
            log_message = f"Command used : \n {cmd}"
            # launch
            log_file_path = os.path.join(work_dir_path,
                                         "logBuildDB_{0}.txt".format(name_db))
            process = Popen(cmd,
                            shell=True,
                            stdout=PIPE,
                            stderr=STDOUT,
                            executable='/bin/bash')
            log_process_with_output_file(process, log_message, log_file_path)

        ###################################
        # Run MentaLiST call
        section_header(f'Run MentaLiST Call : Round {i}')
        explanation('MentaLiST Detection ')
        # prepare
        output = os.path.join(work_dir_path, f"output_mentalist_{name_db}_{i}")
        cmd = f"{exe} call --db {db_path} --output_votes -o {output} -1 {r1} -2 {r2} --kt {kmer_threshold}"
        log_message = f"Command used : \n {cmd}\n"
        # launch
        process = Popen(cmd,
                        shell=True,
                        stdout=PIPE,
                        stderr=STDOUT,
                        executable='/bin/bash')
        log_file_path = os.path.join(work_dir_path,
                                     f"logMentaLiST_{name_db}_{i}.txt")
        log_process_with_output_file(process, log_message, log_file_path)
        fasta_novel_st = os.path.join(
            work_dir_path, f"output_mentalist_{name_db}_{i}.novel.fa")

        while os.path.exists(
                fasta_novel_st) and os.stat(fasta_novel_st).st_size != 0:
            ###################################
            # check MentaLiST output for Novel and Multiple Votes
            section_header(f'Check MentaLiST Output : Round {i}')
            explanation('Check MentaLiST output for Novel Votes')
            check_mentalist_output(fasta_novel_st, output, fasta_db_path)

            ###################################
            # Run MentaLiST Parse Novel Fasta
            section_header(f'Run MentaLiST parse novel fasta : Round {i}')
            explanation('MentaLiST parses new novel variants')
            # prepare
            fasta_novel_st = os.path.join(
                work_dir_path, f"output_mentalist_{name_db}_{i}.novel.fa")
            result_parse_path = os.path.join(
                work_dir_path, f"all_novel_alleles_{name_db}_{i}")
            cmd = f"{exe_parse} -f {fasta_novel_st} -o {result_parse_path}"
            log_message = f"Command used : \n {cmd}\n"
            # launch
            process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
            log_file_path = os.path.join(
                work_dir_path, f"logParseNovelAlleles_{name_db}_{i}.txt")
            log_process_with_output_file(process, log_message, log_file_path)

            ###################################
            # Run MentaLiST Update Fasta DB
            section_header(f'Run MentaLiST update DB fasta : Round {i}')
            explanation('MentaLiST updates the DB fasta with new novel variants')
            # prepare
            result_parse_path = os.path.join(
                work_dir_path, f"all_novel_alleles_{name_db}_{i}")
            cmd = f"{exe_update} -db {fasta_db_path} -n {result_parse_path}.fa"
            log_message = f"Command used : \n {cmd}\n"
            # launch
            process = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
            log_file_path = os.path.join(
                work_dir_path,
                f"logCreateNewSchemeWithNovel_{name_db}_{i}.txt")
            log_process_with_output_file(process, log_message, log_file_path)

            ###################################
            # Run MentaLiST Build DB
            section_header(f'Create MentaLiST Database : Round {i}')
            explanation(
                'Before running a detection, MentaLiST needs to construct its own kmer-index database'
            )
            # prepare
            cmd = f"{exe} build_db --db {db_path} -k {kmer_build} -d {fasta_db_path} --threads {threads}"
            log_message = f"Command used : \n {cmd}"
            # remove previous db
            os.remove(db_path)
            # launch
            log_file_path = os.path.join(work_dir_path,
                                         f"logBuildDB_{name_db}_{i}.txt")
            process = Popen(cmd,
                            shell=True,
                            stdout=PIPE,
                            stderr=STDOUT,
                            executable='/bin/bash')
            log_process_with_output_file(process, log_message, log_file_path)

            ###################################
            # Update counter
            i += 1

            ###################################
            # Run MentaLiST call
            section_header(f'Run MentaLiST Call : Round {i}')
            explanation('MentaLiST Detection ')
            # prepare
            output = os.path.join(work_dir_path,
                                  f"output_mentalist_{name_db}_{i}")
            cmd = f"{exe} call --db {db_path} --output_votes -o {output} -1 {r1} -2 {r2} --kt {kmer_threshold}"
            log_message = f"Command used : \n {cmd}\n"
            # launch
            process = Popen(cmd,
                            shell=True,
                            stdout=PIPE,
                            stderr=STDOUT,
                            executable='/bin/bash')
            log_file_path = os.path.join(work_dir_path,
                                         f"logMentaLiST_{name_db}_{i}.txt")
            log_process_with_output_file(process, log_message, log_file_path)
            fasta_novel_st = os.path.join(
                work_dir_path, f"output_mentalist_{name_db}_{i}.novel.fa")

            ###################################
            # check MentaLiST output for Novel and Multiple Votes
            section_header(f'Check MentaLiST Output : Round {i}')
            explanation(
                'Check MentaLiST output for Novel Votes and correct false novels in the previous output'
            )
            check_mentalist_output(fasta_novel_st, output, fasta_db_path)

        ###################################
        # Process final result
        section_header('Process Final Output')

        # final files
        shutil.move(output, output_final)
        intermediate_file_dir = os.path.join(work_dir_path,
                                             "intermediate_files")

        if not os.path.exists(intermediate_file_dir):
            os.mkdir(intermediate_file_dir)

        for file in os.listdir(work_dir_path):
            file_path = os.path.join(work_dir_path, file)
            if not os.path.isdir(file_path):
                if not ("_output_final" in file or "statistics_" in file
                        or "combination_result_" in file):
                    shutil.move(file_path,
                                os.path.join(intermediate_file_dir, file))

        # add ST in modify output_mentalist
        st_dict = add_st_to_output_mentalist(output_final, name)

        # Statistic
        detection_result_dict = read_output_mentalist(output_final, name)

        count_none = count_loc = count_low_cov = count_perfect = count_multi = 0

        for locus, value_dict in detection_result_dict.items():
            if locus == "Sample" or locus == "ST" or locus == "clonal_complex" or locus in st_dict:
                continue
            elif value_dict[name] == "0?":
                count_none += 1
                count_loc += 1
            elif "-" in value_dict[name]:
                count_low_cov += 1
                count_loc += 1
            elif "+" in value_dict[name]:
                count_multi += 1
                count_loc += 1
            else:
                count_perfect += 1
                count_loc += 1

        explanation(f'Total Locus' f' : {count_loc}')
        explanation(
            f'Perfect Locus'
            f' : {count_perfect} -> {round((count_perfect / count_loc * 100), 2)}% of total locus'
        )
        explanation(
            f'Multiple Locus'
            f' : {count_multi} -> {round((count_multi / count_loc * 100), 2)}% of total locus'
        )
        explanation(
            f'Low Coverage Locus'
            f' : {count_low_cov} -> {round((count_low_cov / count_loc * 100), 2)}% of total locus'
        )
        explanation(
            f'None Locus'
            f' : {count_none} -> {round((count_none / count_loc * 100), 2)}% of total locus'
        )
        stats_file = os.path.join(work_dir_path,
                                  "statistics_{0}.tsv".format(name_db))

        with open(stats_file, "w") as output_file:
            csv_writer = writer(output_file, delimiter="\t")
            csv_writer.writerow(["Total Locus", count_loc])
            csv_writer.writerow(
                ["Resume", "Count", "Percentage on Total locus"])
            csv_writer.writerow([
                "Perfect Locus", count_perfect,
                (count_perfect / count_loc * 100)
            ])
            csv_writer.writerow([
                "Multiple Locus", count_multi, (count_multi / count_loc * 100)
            ])
            csv_writer.writerow([
                "Low Coverage Locus", count_low_cov,
                (count_low_cov / count_loc * 100)
            ])
            csv_writer.writerow(
                ["None Locus", count_none, (count_none / count_loc * 100)])

        # Load Known Combination
        known_comb_path = os.path.join(os.path.dirname(db_path),
                                       "combination_list.tsv")

        known_comb_dict = {}

        if os.path.exists(known_comb_path):
            with open(known_comb_path, "r") as file:
                reader = DictReader(file, delimiter="\t")
                for row in reader:
                    known_comb_dict[row["Name"]] = row["Combination"]
        else:
            explanation(
                f"combination file not found for {species_full} at {known_comb_path}"
            )

        # Search For Known Combination
        final_file_path = os.path.join(work_dir_path,
                                       f"{name}_{name_db}_output_final")

        detection_result_dict = read_output_mentalist(final_file_path, name)

        comb_strain_list = []

        for locus, sample_dict in detection_result_dict.items():
            if locus == "Sample" or locus == "ST" or locus == "clonal_complex":
                continue
            else:
                comb_strain_list.append(f"{locus}:{sample_dict[name]}")
        if known_comb_dict:
            # search for each known combination
            combine_result_file = os.path.join(
                work_dir_path, f"combination_result_{name_db}.tsv")
            with open(combine_result_file, "w") as combine_file:
                csv_writer = writer(combine_file, delimiter="\t")
                csv_writer.writerow([
                    "Combination Name", "Count Reference Locus",
                    "Count Sample Locus", "Ratio", "Comment"
                ])
                for name_comb, combs in known_comb_dict.items():
                    combination_known = combs.split(",")
                    result = list(
                        set(comb_strain_list).intersection(combination_known))
                    ratio = round((len(result) / len(combination_known)) * 100,
                                  2)
                    if ratio == 100:
                        comment = "Perfect"
                    elif ratio >= 98:
                        comment = "Very Close"
                    elif ratio >= 90:
                        comment = "Close"
                    elif ratio >= 80:
                        comment = "Like"
                    elif ratio >= 70:
                        comment = "Close Like"
                    else:
                        comment = "No relevant"
                    csv_writer.writerow([
                        name_comb,
                        len(combination_known),
                        len(result), ratio, comment
                    ])
        section_header('Finish {0} Coregenome Analysis'.format(name_db))
    section_header('Finish ALL CoreGenome Analysis')
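

# A self-contained sketch of the known-combination scoring done at the end of
# mentalist_detection: the share of a reference locus:allele combination that is
# found among the sample's calls. The lists below are illustrative only.
sample_calls = ["locusA:1", "locusB:2", "locusC:5", "locusD:3"]
reference_combination = ["locusA:1", "locusB:2", "locusC:4"]

shared = set(sample_calls).intersection(reference_combination)
ratio = round(len(shared) / len(reference_combination) * 100, 2)
print(ratio)  # 66.67 -> falls below 70, i.e. "No relevant" in the scoring above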