def mmseqs_to_pangenome(mmseqdb, mmseqclust, logmmseq, outfile):
    """
    Convert mmseqs clustering to a pangenome file:

    - convert mmseqs results to tsv file
    - convert tsv file to pangenome

    Parameters
    ----------
    mmseqdb : str
        path to base filename of output of mmseqs createdb
    mmseqclust : str
        path to base filename of output of mmseqs cluster
    logmmseq : str
        path to file where logs must be written
    outfile : str
        pangenome filename

    Returns
    -------
    dict
        families, as returned by ``mmseqs_tsv_to_pangenome``:
        {fam_num: [all members]}
    """
    cmd = f"mmseqs createtsv {mmseqdb} {mmseqdb} {mmseqclust} {mmseqclust}.tsv"
    msg = "Problem while trying to convert mmseq result file to tsv file"
    logger.details(f"MMseqs command: {cmd}")
    # Append, so that the createtsv output is added to what the log already contains
    with open(logmmseq, "a") as logf:
        utils.run_cmd(cmd, msg, eof=True, stdout=logf, stderr=logf)
    # Convert the tsv file to a 'pangenome' file: one line per family
    return mmseqs_tsv_to_pangenome(mmseqclust, logmmseq, outfile)
def run_mmseqs_clust(args):
    """
    Run mmseqs clustering

    Parameters
    ----------
    args : tuple
        (mmseqdb, mmseqclust, tmpdir, logmmseq, min_id, threads, clust_mode), with:

            * mmseqdb: path to base filename (output created by mmseq db)
            * mmseqclust: path to base filename for output of mmseq clustering
            * tmpdir : path to folder which will contain mmseq temporary files
            * logmmseq : path to file where logs must be written
            * min_id : min percentage of identity to be considered in the same family
              (between 0 and 1)
            * threads : max number of threads to use
            * clust_mode : [0, 1, 2], 0 for 'set cover', 1 for 'single-linkage', 2 for 'CD-Hit'
    """
    mmseqdb, mmseqclust, tmpdir, logmmseq, min_id, threads, clust_mode = args
    cmd = (
        f"mmseqs cluster {mmseqdb} {mmseqclust} {tmpdir} --min-seq-id {min_id} --threads {threads} --cluster-mode "
        f"{clust_mode}")
    logger.details(f"MMseqs command: {cmd}")
    msg = f"Problem while clustering proteins with mmseqs. See log in {logmmseq}"
    # eof=False is passed through to run_cmd unchanged; output is appended to the log
    with open(logmmseq, "a") as logm:
        utils.run_cmd(cmd, msg, eof=False, stdout=logm, stderr=logm)
def run_fastme(alignfile, boot, write_boot, threads, model, outdir, quiet):
    """
    Run fastME on the given alignment.

    Parameters
    ----------
    alignfile: str
        Path to file containing alignments of persistent families grouped by genome
    boot: int or None
        Number of bootstraps to compute. None if no bootstrap asked
    write_boot: bool
        True if all bootstrap pseudo-trees must be saved into a file, False otherwise
    threads: int
        Maximum number of threads to use
    model: str or None
        DNA substitution model chosen by user. None if default one
    outdir: str
        output directory to save all results
    quiet: bool
        True if nothing must be printed to stderr/stdout, False otherwise
    """
    logger.info("Running FastME...")
    # Optional parts of the command line stay empty when the option is not asked
    bootinfo = f"-b {boot}" if boot else ""
    threadinfo = f"-T {threads}" if threads else ""
    # Output filenames are all derived from the alignment file name
    align_name = os.path.basename(alignfile)
    logfile = os.path.join(outdir, align_name + ".fastme.log")
    treefile = os.path.join(outdir, align_name + ".fastme_tree.nwk")
    # If bootstrap pseudo-trees must be written, define the filename here
    outboot = ""
    if write_boot:
        outboot = "-B " + os.path.join(outdir,
                                       align_name + ".fastme_bootstraps.nwk")
    # Put default model if not given
    if not model:
        model = "T"
    cmd = (f"fastme -i {alignfile} -d{model} -nB -s {threadinfo} {bootinfo} "
           f"-o {treefile} -I {logfile} {outboot}")
    error = ("Problem while running FastME. See log file ({}) for "
             "more information.").format(logfile)
    # In quiet mode, the output of the tool is discarded
    fnull = open(os.devnull, 'w') if quiet else None
    try:
        # NOTE(review): which stream receives fnull is reconstructed (stdout assumed) - confirm
        utils.run_cmd(cmd, error, stdout=fnull, eof=True, logger=logger)
    finally:
        if fnull is not None:
            fnull.close()
def create_mmseqs_db(mmseqdb, prt_path, logmmseq):
    """
    Create ffindex of protein bank (prt_path) if not already done. If done, just write a message
    to tell the user that the current existing file will be used.

    Parameters
    ----------
    mmseqdb : str
        path to base filename for output of mmseqs createdb
    prt_path : str
        path to the file containing all proteins to cluster
    logmmseq : str
        path to file where logs must be written

    Returns
    -------
    bool
        True if mmseqs db just created, False if already existed
    """
    # Suffixes of all files expected for a complete database
    outext = [
        "", ".index", ".dbtype", ".lookup", "_h", "_h.index", "_h.dbtype"
    ]
    files_existing = []
    if os.path.isfile(mmseqdb):
        files_existing = [mmseqdb + ext for ext in outext
                          if os.path.isfile(mmseqdb + ext)]
        if len(files_existing) == len(outext):
            logger.warning(
                f"mmseqs database {mmseqdb} already exists. The program will "
                "use it.")
            return False
        # Incomplete database: remove what is left and start again from scratch
        logger.warning(
            f"mmseqs database {mmseqdb} already exists, but at least 1 associated "
            "file (.dbtype, .index etc). is missing. The program will "
            "remove existing files and recreate the database.")
        for file in files_existing:
            os.remove(file)  # Delete file
            logger.details(f"Removing '{file}'.")
        files_existing = []
    logger.debug("Existing files: {}".format(len(files_existing)))
    logger.debug("Expected extensions: {}".format(len(outext)))
    cmd = f"mmseqs createdb {prt_path} {mmseqdb}"
    msg = (f"Problem while trying to convert database {prt_path} to mmseqs "
           "database format.")
    logger.details(f"MMseqs command: {cmd}")
    # "w": the log file is (re)started when the database is created
    with open(logmmseq, "w") as logf:
        utils.run_cmd(cmd, msg, eof=True, stdout=logf, stderr=logf)
    return True
def compare_all(out_msh, matrix, npz_matrix, mash_log, threads):
    """
    Compare all pairs of genomes that have already been sketched in the given file.

    Parameters
    ----------
    out_msh : str
        output of mash ('.msh' is appended to get the sketch file)
    matrix : str
        File to put generated matrix of pairwise distances between all genomes
    npz_matrix : str
        matrix of pairwise distances saved in a binary file
    mash_log : str
        mash logfile
    threads : int
        max number of threads to use

    Returns
    -------
    int
        return code, 0 in every case where the function returns
    """
    # A matrix (txt or npz) already exists: it is used as is, nothing is computed
    for existing in (matrix, npz_matrix):
        if os.path.isfile(existing):
            logger.warning(
                "Matrix file {} already exists. The program will use this distance matrix "
                "to filter all genomes according to their distances.".format(existing))
            return 0
    logger.info("Computing pairwise distances between all genomes")
    cmd_dist = f"mash dist -p {threads} {out_msh}.msh {out_msh}.msh"
    error_dist = (
        "Error while trying to estimate pairwise distances between all genomes. "
        f"See {mash_log}.")
    # The matrix is what mash writes to stdout; its stderr is appended to the mash
    # log, after the log of 'mash sketch'
    with open(matrix, "w") as matfile, open(mash_log, "a") as outf:
        utils.run_cmd(cmd_dist, error_dist, eof=True, stdout=matfile, stderr=outf)
    return 0
def run_fasttree(alignfile, boot, outdir, model, quiet):
    """
    Run FastTree on given alignment

    Parameters
    ----------
    alignfile: str
        Path to file containing all families aligned, grouped by genome
    boot: int or None
        Number of bootstraps to calculate (None if no bootstrap asked)
    outdir: str
        Directory where the tree file and the log file are created
    model: str
        DNA substitution model
    quiet: bool
        True if nothing must be printed to stderr/stdout, False otherwise
    """
    logger.info("Running FasttreeMP...")
    bootinfo = "-boot {}".format(boot) if boot else "-nosupport"
    align_name = os.path.basename(alignfile)
    logfile = os.path.join(outdir, align_name + ".fasttree.log")
    treefile = os.path.join(outdir, align_name + ".fasttree_tree.nwk")
    cmd = f"FastTreeMP -nt {model} -noml -nocat {bootinfo} -log {logfile} {alignfile}"
    logger.details("Fasttree command: " + cmd)
    error = ("Problem while running Fasttree. See log file ({}) for "
             "more information.").format(logfile)
    fnull = open(os.devnull, 'w') if quiet else None
    try:
        # The tree is taken from the stdout of the command
        # NOTE(review): stderr=fnull is reconstructed - confirm
        with open(treefile, "w") as stdout:
            utils.run_cmd(cmd, error, stdout=stdout, eof=True, logger=logger, stderr=fnull)
    finally:
        if fnull is not None:
            fnull.close()
def run_quicktree(alignfile, boot, outdir):
    """
    Run quicktree on the given alignment.

    Parameters
    ----------
    alignfile: str
        Path to file containing alignments of persistent families grouped by genome,
        in Stockholm format
    boot: int or None
        Number of bootstraps to compute. None if no bootstrap asked
    outdir: str
        Directory where the tree file and the log file are created
    """
    logger.info("Running Quicktree...")
    # Get bootstrap information
    bootinfo = f"-boot {boot}" if boot else ""
    # Get output filename and logfile name
    align_name = os.path.basename(alignfile)
    logfile = os.path.join(outdir, align_name + ".quicktree.log")
    treefile = os.path.join(outdir, align_name + ".quicktree_tree.nwk")
    cmd = f"quicktree -in a -out t {bootinfo} {alignfile}"
    error = (f"Problem while running quicktree. See log file ({logfile}) for "
             "more information.")
    # NOTE(review): stream routing is reconstructed (stdout to the tree file, stderr
    # to the log file) - confirm
    with open(treefile, "w") as outfile, open(logfile, "w") as logfilef:
        utils.run_cmd(cmd, error, stdout=outfile, eof=True, logger=logger, stderr=logfilef)
def back_translate(num_fam, mafft_file, gen_file, btr_file, nbfal, logger):
    """
    Backtranslate protein alignment to nucleotides

    Parameters
    ----------
    num_fam : int
        current family number. Used for log messages
    mafft_file : str
        path to file containing protein alignments by mafft
    gen_file : str
        path to file containing all sequences, not aligned, in nucleotides. It is used to
        convert the alignment in proteins into a nucleotide alignment
    btr_file : str
        path to the file that will contain the nucleotide alignment
    nbfal : int
        number of sequences aligned for the family by mafft
    logger : logging.Logger
        logger with queueHandler to give logs to main logger

    Returns
    -------
    bool or int
        - False if problem (back-translation, different number of families...)
        - number of sequences in btr file if everything went well
    """
    logger.log(utils.detail_lvl(), f"Back-translating family {num_fam}")
    # The awk script is expected next to this module
    curpath = os.path.dirname(os.path.abspath(__file__))
    awk_script = os.path.join(curpath, "prt2codon.awk")
    cmd = f"awk -f {awk_script} {mafft_file} {gen_file}"
    error = f"Problem while trying to backtranslate {mafft_file} to a nucleotide alignment"
    # Close the output before counting its sequences, so that everything is flushed
    with open(btr_file, "w") as stdout:
        ret = utils.run_cmd(cmd, error, stdout=stdout, logger=logger)
    # run_cmd result is either an int or an object with a 'returncode' attribute
    if not isinstance(ret, int):
        ret = ret.returncode
    if ret != 0:
        return False
    message = (f"fam {num_fam}: different number of proteins aligned in {mafft_file} ({nbfal}) and genes "
               f"back-translated in {btr_file}")
    # Check number of sequences in btr file, and return True/False according to it
    # It should contain the same number of sequences as the mafft file.
    return check_nb_seqs(btr_file, nbfal, logger, message)
def prodigal_train(gpath, annot_folder):
    """
    Use prodigal training mode.
    First, train prodigal on the first genome ('gpath'), and write it to 'genome'.trn,
    file which will be used for the annotation of all next sequences.

    Parameters
    ----------
    gpath : str
        path to genome to train on
    annot_folder : str
        path to folder where the log files and train file will be saved

    Returns
    -------
    str
        path and name of train file (will be used to annotate all next genomes)
        If problem, returns empty string
    """
    logger.info(f"Prodigal will train using {gpath}")
    gname = os.path.basename(gpath)             # path/to/original/genome.fasta -> genome.fasta
    gpath_train = os.path.join(annot_folder, gname + ".trn") # path/to/prodiRes/genome.fasta.trn
    # An existing training file is reused as is
    if os.path.isfile(gpath_train):
        logger.info(f"A training file already exists ({gpath_train}). "
                     "It will be used to annotate all genomes.")
        return gpath_train
    prodigal_logfile = gpath_train + "-prodigal-train.log"
    prodigal_logfile_err = gpath_train + "-prodigal-train.log.err"
    cmd = (f"prodigal -i {gpath} -t {gpath_train}")
    error = (f"Error while trying to train prodigal on {gname}. See {prodigal_logfile_err}.")
    logger.log(utils.detail_lvl(), "prodigal command: " + cmd)
    with open(prodigal_logfile, "w") as prodigalf, \
            open(prodigal_logfile_err, "w") as prodigalferr:
        ret = utils.run_cmd(cmd, error, eof=False, stderr=prodigalferr, stdout=prodigalf,
                            logger=logger)
    if ret.returncode != 0:
        return ""
    logger.log(utils.detail_lvl(), f"End training on {gpath}")
    return gpath_train
def mafft_align(num_fam, prt_file, mafft_file, nbfprt, logger):
    """
    Align all proteins of the given family with mafft

    Parameters
    ----------
    num_fam : int
        current family number
    prt_file : str
        path to file containing all proteins extracted
    mafft_file : str
        path to file which will contain proteins alignment
    nbfprt : int
        number of proteins extracted in prt file
    logger : logging.Logger
        logger with queueHandler to give logs to main logger

    Returns
    -------
    bool or int
        Result of ``check_nb_seqs`` if the alignment ran (documented as True when the same
        number of proteins was extracted and aligned), False otherwise
    """
    logger.log(utils.detail_lvl(), f"Aligning family {num_fam}")
    cmd = f"mafft --auto {prt_file}"
    error = f"Problem while trying to align fam {num_fam}"
    logger.log(utils.detail_lvl(), f"Mafft command: {cmd}")
    # The alignment is read from stdout; stderr goes to '<mafft_file>.log'.
    # Both are closed before the alignment file is read again.
    with open(mafft_file, "w") as stdout, open(mafft_file + ".log", "w") as stderr:
        ret = utils.run_cmd(cmd, error, stdout=stdout, stderr=stderr, logger=logger)
    # run_cmd result is either an int or an object with a 'returncode' attribute
    if not isinstance(ret, int):
        ret = ret.returncode
    if ret != 0:
        return False
    message = (f"fam {num_fam}: different number of proteins extracted in {prt_file} ({nbfprt}) and proteins "
               f"aligned in {mafft_file}")
    return check_nb_seqs(mafft_file, nbfprt, logger, message)
def run_tree(alignfile, boot, outdir, quiet, threads, **kwargs):
    """
    Run IQtree for the given alignment file and options

    Parameters
    ----------
    alignfile: str
        path to file containing all persistent families aligned, and grouped by genome
    boot: int or None
        number of bootstraps to calculate, None if no bootstrap asked
    outdir: str
        Directory used to build the output prefix and the log file name
    quiet: bool
        True if nothing must be printed to stderr/stdout, False otherwise
    threads: int
        Maximum number of threads to use
    kwargs["model"]: str
        DNA substitution model chosen by user
    kwargs["wb"]: bool
        True if all bootstrap pseudo-trees must be saved into a file, False otherwise
    kwargs["mem"]: str
        Maximal RAM usage in GB | MB | % - Only for iqtree
    kwargs["s"]: str
        soft to use (iqtree or iqtree2)
    kwargs["f"]: bool
        If true, "-fast" is added to the command line
    """
    # Get optional arguments (all of them are required keys of kwargs)
    model = kwargs["model"]
    write_boot = kwargs["wb"]
    memory = kwargs["mem"]
    soft = kwargs["s"]
    fast = "-fast" if kwargs["f"] else ""

    logger.info("Running IQtree...")

    # Syntax changes according to IQtree version: 1.x ('iqtree') or 2.x (anything else)
    v1 = soft == "iqtree"

    # Non mandatory arguments stay empty when not asked
    bootinfo = ""
    wb_info = ""
    mem_info = ""
    threadinfo = ""
    if boot:
        bootinfo = f"-bb {boot}" if v1 else f"-B {boot}"
    if write_boot:
        wb_info = "-wbt" if v1 else "--boot-trees"
    if memory:
        mem_info = f"-mem {memory}" if v1 else f"--mem {memory}"
    if threads:
        threadinfo = f"-nt {threads}" if v1 else f"-T {threads}"
    # IQtree is always run quietly
    qu = "-quiet" if v1 else "--quiet"
    seqtype = "-st DNA" if v1 else "--seqtype DNA"

    # Output names are derived from the alignment file name
    align_name = os.path.basename(alignfile)
    logfile = os.path.join(outdir, align_name + ".iqtree.log")
    treefile = os.path.join(outdir, align_name + ".iqtree_tree")
    prefix = f"-pre {treefile}" if v1 else f"--prefix {treefile}"
    cmd = (
        f"{soft} -s {alignfile} {threadinfo} -m {model} {mem_info} {bootinfo} {wb_info} "
        f"{seqtype} {prefix} {qu} {fast}")
    logger.details("IQtree command: " + cmd)
    error = (f"Problem while running IQtree. See log file ({logfile}) for "
             "more information.")
    # In quiet mode, stderr of the tool is discarded
    fnull = open(os.devnull, 'w') if quiet else None
    try:
        utils.run_cmd(cmd, error, eof=True, logger=logger, stderr=fnull)
    finally:
        if fnull is not None:
            fnull.close()
def to_database(outdir, section):
    """
    Copy .fna.gz files to 'Database_init' folder, and uncompress them.

    Parameters
    ----------
    outdir : str
        directory where all results are (for now, refseq/genbank folders, assembly summary
        and log)
    section : str
        refseq (default) or genbank

    Returns
    -------
    tuple
        nb_gen : number of genomes copied and uncompressed
        db_dir : directory where are all fna files downloaded from refseq/genbank
    """
    import sys  # local import: the import block of the file is not visible from here

    # Copy .gz files in a new folder, and Unzip them in this new folder
    logger.info("Uncompressing genome files.")
    # Folder where are .gz files
    download_dir = os.path.join(outdir, section, "bacteria")
    # If no folder output/refseq/bacteria: error, no genome found
    # (or output/genbank/bacteria)
    if not os.path.exists(download_dir):
        logger.error(f"The folder containing genomes downloaded from NCBI {section} "
                     f"({download_dir}) does not exist. Check that you really downloaded "
                     "sequences (fna.gz) and that they are in this folder.")
        sys.exit(1)
    # If folder output/<refseq or genbank>/bacteria empty: error, no genome found
    list_downloads = os.listdir(download_dir)
    if not list_downloads:
        logger.error(f"The folder supposed to contain genomes downloaded from NCBI {section} "
                     f"({download_dir}) exists but is empty. Check that you really downloaded "
                     "sequences (fna.gz).")
        sys.exit(1)
    # Create directory to put uncompressed genomes
    db_dir = os.path.join(outdir, "Database_init")
    os.makedirs(db_dir, exist_ok=True)
    nb_gen = 0
    # For each subfolder of download dir, copy the .gz file it contains (if possible)
    # to the new database folder
    for g_folder in list_downloads:
        fasta = glob.glob(os.path.join(download_dir, g_folder, "*.fna.gz"))
        # No .gz file in folder
        if len(fasta) == 0:
            logger.warning("Problem with genome in {}: no compressed fasta file downloaded. "
                           "This genome will be ignored.".format(g_folder))
            continue
        # Several gz files in folder
        if len(fasta) > 1:
            logger.warning("Problem with genome in {}: several compressed fasta files found. "
                           "This genome will be ignored.".format(g_folder))
            continue
        # Copy gz file to new folder
        fasta_out = os.path.join(db_dir, os.path.basename(fasta[0]))
        shutil.copy(fasta[0], fasta_out)
        # Uncompress file copied
        cmd = f"gunzip {fasta_out} -f"
        error = f"Error while trying to uncompress {fasta_out}. This genome will be ignored."
        call = utils.run_cmd(cmd, error)
        # Problem with uncompressing: genome ignored (remove gz file from new folder)
        if call.returncode != 0:
            if os.path.isfile(fasta_out):
                os.remove(fasta_out)
            continue
        nb_gen += 1
    return nb_gen, db_dir
def sketch_all(genomes, sorted_genomes, outdir, list_reps, out_msh, mash_log,
               threads):
    """
    Sketch all genomes to a combined archive.

    Parameters
    ----------
    genomes : dict
        {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}
    sorted_genomes: list
        list of 'genome_file' for all genomes kept (L90 and nbcont ok), ordered by
        decreasing quality
    outdir : str
        path to directory where all results are saved
    list_reps : str
        file with list of genomes to sketch. File will be emptied if it contains something, and
        filled with the information from 'genomes'.
    out_msh : str
        output of mash ('.msh' is appended to get the sketch file)
    mash_log : str
        mash logfile
    threads : int
        max number of threads to use

    Returns
    -------
    int
        0 when the sketch file already exists or has just been created
    """
    import sys  # local import: the import block of the file is not visible from here

    # If given outdir does not exist, stop here
    if not os.path.isdir(outdir):
        logger.error(f"Your output directory '{outdir}' does not exist.")
        sys.exit(1)
    # Empty list_reps file
    open(list_reps, "w").close()
    # Complete paths to genomes to compare: 'path_to_seq_to_annotate' = genome_file[2]
    file_paths = [genomes[g][2] for g in sorted_genomes]
    # Write list of genomes to compare to a file
    utils.write_list(file_paths, list_reps)
    # Sketch all genome sequences if not already done
    if os.path.isfile(out_msh + ".msh"):
        logger.warning(
            f"Mash sketch file {out_msh}.msh already exists. PanACoTA will "
            "use it for next step.")
        return 0
    logger.info("Sketching all genomes...")
    cmd_sketch = f"mash sketch -o {out_msh} -p {threads} -l {list_reps} -s 1e4"
    error_sketch = (
        f"Error while trying to sketch {len(sorted_genomes)} genomes to combined "
        "archive. Maybe some genome sequences in "
        "'tmp_files' are missing! Check logfile: "
        f"{mash_log}")
    # NOTE(review): the call below is reconstructed (both streams sent to the mash log)
    # - confirm
    with open(mash_log, "w") as outf:
        utils.run_cmd(cmd_sketch, error_sketch, eof=True, stdout=outf, stderr=outf)
    return 0
def run_prodigal(arguments):
    """
    Run prodigal for the given genome.

    Parameters
    ----------
    arguments : tuple
        (gpath, prodigal_folder, cores_annot, name, force, nbcont, gpath_train, q) with:

        * gpath: path and filename of genome to annotate
        * prodigal_folder: path to folder where all prodigal folders for all genomes are saved
        * cores_annot: how many cores can use prodigal (not used in the command line)
        * name: output name of annotated genome
        * force: True if force run (override existing files), False otherwise
        * nbcont: number of contigs in the input genome (not used here)
        * gpath_train: path to the prodigal training file, or "small option" to
          annotate with '-p meta' instead of a training file
        * q : queue where logs are put

    Returns
    -------
    bool
        True if everything went well (results already there and valid, or prodigal
        ended with return code 0). False otherwise.
    """
    import shutil  # local import: the import block of the file is not visible from here

    gpath, prodigal_folder, threads, name, force, nbcont, gpath_train, q = arguments
    # Set logger for this process, which will be given to all subprocess
    # NOTE(review): setLevel/addHandler are reconstructed; without them 'qh' is never
    # attached and nothing reaches the queue - confirm
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    logger = logging.getLogger('annotate.run_prodigal')
    # Define prodigal directory and logfile, and check their existence
    # By default, prodigal is in tmp_folder -> resdir/tmp_files/genome-prodigalRes
    g_ori_name = os.path.basename(gpath)
    prodigal_dir = os.path.join(prodigal_folder, g_ori_name + "-prodigalRes")
    prodigal_logfile = os.path.join(prodigal_folder, g_ori_name + "-prodigal.log")
    prodigal_logfile_err = os.path.join(prodigal_folder, g_ori_name + "-prodigal.log.err")

    # If result dir exists but user wants to force, remove this result dir
    if os.path.isdir(prodigal_dir) and force:
        shutil.rmtree(prodigal_dir)
        logger.warning("Prodigal results folder already exists, but is removed because "
                       "--force option was used.")

    # Training file can be "small option", meaning that we did not use the training mode.
    # If not "small option", we used the training mode. If training file does not exist
    # and prodigal result directory neither, return False
    # We cannot annotate using nothing.
    # Happens if there was a problem while training
    if (gpath_train != "small option" and not os.path.isfile(gpath_train)
            and not os.path.isdir(prodigal_dir)):
        return False

    logger.log(utils.detail_lvl(), f"Start annotating {name} (from {gpath} sequence) "
                                     "with Prodigal")
    # If prodigal results dir already exists (meaning user did not want to force,
    # otherwise it would have been deleted just before),
    # can we use it for next step ? -> check content.
    if os.path.isdir(prodigal_dir):
        logger.warning(f"Prodigal results folder {prodigal_dir} already exists.")
        ok = check_prodigal(gpath, name, prodigal_dir, logger)
        # If everything ok in the result dir, do not rerun prodigal,
        # use those results for next step (formatting)
        if ok:
            # NOTE(review): the end of this message ("option.") is reconstructed - confirm
            logger.log(utils.detail_lvl(), "Prodigal did not run again. "
                                           "Formatting step will use already generated results of "
                                           "Prodigal in {}. If you want to re-run Prodigal, first "
                                           "remove this result folder, or use '-F' or '--force' "
                                           "option.".format(prodigal_dir))
            logger.log(utils.detail_lvl(), f"End annotating {name} (from {gpath})")
        # If missing files, or other problems in result dir, error message,
        # ask user to force or remove this folder.
        else:
            # NOTE(review): the end of this message ("re-annotate.") is reconstructed - confirm
            logger.warning("Problems in the files contained in your already existing output dir "
                           f"({prodigal_dir}). Please check it, or remove it to "
                           "re-annotate.")
        # If everything was ok -> everything is ready for next step -> return True
        # If something is wrong -> cannot use those results, genome won't be annotated
        # -> return False
        return ok

    # We are sure prodigal result dir does not exist yet, because either:
    #     - never existed
    #     - removed because user asked to force
    #     - exists but left function, so does not go until this line
    #        -> either if files inside are ok or not
    # So make prodigal_dir (not automatically created by prodigal)
    os.makedirs(prodigal_dir)

    # Prodigal_directory is empty and ready to get prodigal results
    basic_outname = os.path.join(prodigal_dir, name)
    # Define cmd, stderr and stdout files, and error to write if problem.
    error = (f"Error while trying to run prodigal. See {prodigal_logfile_err}.")
    if gpath_train == "small option":
        training = "-p meta"
    else:
        training = f"-t {gpath_train}"
    cmd = (f"prodigal -i {gpath} -d {basic_outname + '.ffn'} -a {basic_outname + '.faa'} "
           f"-f gff -o {basic_outname + '.gff'} {training} -q")
    logger.log(utils.detail_lvl(), "Prodigal command: " + cmd)

    with open(prodigal_logfile, "w") as prodigalf, \
            open(prodigal_logfile_err, "w") as prodigalferr:
        ret = utils.run_cmd(cmd, error, eof=False, stderr=prodigalferr, stdout=prodigalf,
                            logger=logger)
    if ret.returncode != 0:
        return False
    logger.log(utils.detail_lvl(), f"End annotating {name} (from {gpath})")
    return True
def run_prokka(arguments):
    """
    Run prokka for the given genome.

    Parameters
    ----------
    arguments : tuple
        (gpath, prok_folder, cores_annot, name, force, nbcont, small, q) with:

        * gpath: path and filename of genome to annotate
        * prok_folder: path to folder where all prokka folders for all genomes are saved
        * cores_annot: how many cores can use prokka
        * name: output name of annotated genome
        * force: True if force run (override existing files), False otherwise
        * nbcont: number of contigs in the input genome, to check prokka results
        * small: used for prodigal, if sequences to annotate are small. Not used here
        * q : queue where logs are put

    Returns
    -------
    bool
        False if prokka ended with a non-zero return code; otherwise the result of
        ``check_prokka`` (documented as True if everything went well: all needed
        output files present, corresponding numbers of proteins, genes etc.)
    """
    import shutil  # local import: the import block of the file is not visible from here

    gpath, prok_folder, threads, name, force, nbcont, _, q = arguments
    # Set logger for this process
    # NOTE(review): setLevel/addHandler are reconstructed; without them 'qh' is never
    # attached and nothing reaches the queue - confirm
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    logger = logging.getLogger('annotate.run_prokka')
    logger.log(utils.detail_lvl(), f"Start annotating {name} from {gpath} with Prokka")

    # Define prokka directory and logfile, and check their existence
    prok_dir = os.path.join(prok_folder, os.path.basename(gpath) + "-prokkaRes")
    prok_logfile = os.path.join(prok_folder, os.path.basename(gpath) + "-prokka.log")
    if os.path.isdir(prok_dir):
        # Result dir already exists and no force: check if we can use it for next step
        if not force:
            logger.warning(f"Prokka results folder {prok_dir} already exists.")
            ok = check_prokka(prok_dir, prok_logfile, name, gpath, nbcont, logger)
            # If everything ok in the result dir, do not rerun prokka,
            # use those results for next step (formatting)
            if ok:
                logger.log(utils.detail_lvl(), "Prokka did not run again, "
                           "formatting step used already generated results of "
                           f"Prokka in {prok_dir}. If you want to re-run prokka, first "
                           "remove this result folder, or use '-F' or '--force' "
                           "option if you want to rerun prokka for all genomes.")
                logger.log(utils.detail_lvl(), f"End annotating {name} {gpath}")
            # If missing files, or other problems in result dir, error message,
            # ask user to force or remove this folder.
            else:
                # NOTE(review): the end of this message ("re-annotate.") is
                # reconstructed - confirm
                logger.warning("Problems in the files contained in your already existing output dir "
                               "({}). Please check it, or remove it to "
                               "re-annotate.".format(prok_dir))
            # If everything was ok -> everything is ready for next step -> return True
            # If something is wrong -> cannot use those results, genome won't be annotated
            # -> return False
            return ok
        # Result dir exists but user wants to force: remove this result dir
        shutil.rmtree(prok_dir)
        # NOTE(review): the end of this message ("used") is reconstructed - confirm
        logger.warning("Prokka results folder already exists, but removed because --force option "
                       "used")
    # Now that we checked and solved those cases:
    #     - outdir exists (problems or not, we returned appropriate boolean)
    #     - if outdir exists but force, remove this outdir.
    # So, outdir does not exist -> run prokka
    cmd = (f"prokka --outdir {prok_dir} --cpus {threads} "
           f"--prefix {name} --centre prokka {gpath}")
    error = (f"Error while trying to run prokka on {name} from {gpath}")
    logger.log(utils.detail_lvl(), "Prokka command: " + cmd)
    # Close the log before check_prokka receives its path
    with open(prok_logfile, "w") as prokf:
        ret = utils.run_cmd(cmd, error, eof=False, stderr=prokf, logger=logger)
    if ret.returncode != 0:
        return False
    ok = check_prokka(prok_dir, prok_logfile, name, gpath, nbcont, logger)
    logger.log(utils.detail_lvl(), f"End annotating {name} from {gpath}.")
    return ok