Example #1
def test_log_listen(capsys):
    """
    Check that when we log to a queue via a QueueHandler, and then handle the logs
    via logger_thread, the logs appear.
    """
    import multiprocessing
    import threading

    # Create Queue, QueueHandler, and log messages to it
    m = multiprocessing.Manager()
    q = m.Queue()
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    logger = logging.getLogger('process')
    logger.debug("debug message")
    logger.log(utils.detail_lvl(), "detail message")
    logger.info("info message")
    logger.warning("warning message")
    logger.error("error message")
    logger.critical("critical message")
    q.put(None)

    # Initialize real logger
    logfile = os.path.join(GENEPATH, "logfile_test.txt")
    utils.init_logger(logfile, 0, '')

    # Listen to QueueHandler and handle messages to stdout/stderr/files
    lp = threading.Thread(target=utils.logger_thread, args=(q, ))
    lp.start()
    lp.join()

    out, err = capsys.readouterr()
    assert "info message" in out
    assert "error message" in err
    assert "critical message" in err
    with open(logfile + ".log", "r") as logf:
        assert logf.readline().endswith(" :: INFO :: info message\n")
        assert logf.readline().endswith(" :: WARNING :: warning message\n")
        assert logf.readline().endswith(" :: ERROR :: error message\n")
        assert logf.readline().endswith(" :: CRITICAL :: critical message\n")
    with open(logfile + ".log.details") as logf:
        assert logf.readline().endswith(" :: DETAIL :: detail message\n")
        assert logf.readline().endswith(" :: INFO :: info message\n")
        assert logf.readline().endswith(" :: WARNING :: warning message\n")
        assert logf.readline().endswith(" :: ERROR :: error message\n")
        assert logf.readline().endswith(" :: CRITICAL :: critical message\n")
    with open(logfile + ".log.err", "r") as logf:
        assert logf.readline().endswith(" :: WARNING :: warning message\n")
        assert logf.readline().endswith(" :: ERROR :: error message\n")
        assert logf.readline().endswith(" :: CRITICAL :: critical message\n")
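utils.logger_thread itself is not shown in this listing. Assuming it follows the standard-library logging-cookbook pattern the test above relies on, a minimal sketch of such a consumer could look like this (the None sentinel put on the queue tells it to stop):

import logging

def logger_thread(q):
    """Drain log records from q and dispatch them to the current
    handlers, until the sentinel None is received (a sketch; the
    project's real utils.logger_thread may differ)."""
    while True:
        record = q.get()
        if record is None:  # sentinel: producers are done
            break
        # Re-emit through the logger named in the record so the real
        # stream/file handlers installed by init_logger format and write it
        logging.getLogger(record.name).handle(record)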
Example #2
def prodigal_train(gpath, annot_folder):
    """
    Use prodigal training mode.
    First, train prodigal on the first genome ('gpath'), and write the result to
    '<genome>.trn', a file which will then be used for the annotation of all subsequent sequences.
    Parameters
    ----------
    gpath : str
        path to genome to train on
    annot_folder : str
        path to folder where the log files and train file will be saved

    Returns
    -------
    str
        path and name of the training file (will be used to annotate all subsequent genomes).
        Empty string if a problem occurred.
    """
    logger.info(f"Prodigal will train using {gpath}")
    gname = os.path.basename(gpath)             # path/to/original/genome.fasta -> genome.fasta
    gpath_train = os.path.join(annot_folder, gname + ".trn") # path/to/prodiRes/genome.fasta.trn
    if os.path.isfile(gpath_train):
        logger.info(f"A training file already exists ({gpath_train}). "
                     "It will be used to annotate all genomes.")
        return gpath_train
    prodigal_logfile = gpath_train + "-prodigal-train.log"  # path/to/genome.fasta.trn-prodigal-train.log
    prodigal_logfile_err = gpath_train + "-prodigal-train.log.err"
    cmd = (f"prodigal -i {gpath} -t {gpath_train}")
    error = (f"Error while trying to train prodigal on {gname}. See {prodigal_logfile_err}.")
    logger.log(utils.detail_lvl(), "prodigal command: " + cmd)
    prodigalf = open(prodigal_logfile, "w")
    prodigalferr = open(prodigal_logfile_err, "w")
    ret = utils.run_cmd(cmd, error, eof=False, stderr=prodigalferr, stdout=prodigalf,
                        logger=logger)
    prodigalf.close()
    prodigalferr.close()
    if ret.returncode == 0:
        logger.log(utils.detail_lvl(), f"End training on {gpath}")
        return gpath_train
    else:
        return ""
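A hedged usage sketch: train prodigal once on the first genome, then reuse the returned training file for every following genome. The empty-string return signals failure (paths below are hypothetical):

# Hypothetical paths; prodigal_train returns "" on failure
gpath_train = prodigal_train("genomes/genome1.fasta", "tmp_files")
if not gpath_train:
    logger.error("Prodigal training failed, cannot annotate genomes.")
else:
    # gpath_train is then given to each run_prodigal call (see Example #14)
    logger.info(f"Training file ready: {gpath_train}")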
Example #3
def check_extractions(num_fam, miss_file, prt_file, gen_file, ngenomes, logger):
    """
    Check that extractions went well for the given family:

    - check number of proteins and genes extracted compared to the
      number of genomes

    Parameters
    ----------
    num_fam : int
        current family number
    miss_file : str
        path to file containing the list of genomes missing for the current family
    prt_file : str
        path to file containing all proteins extracted
    gen_file : str
        path to file containing all genes extracted
    ngenomes : int
        total number of genomes in dataset
    logger : logging.Logger
        logger with queueHandler to give logs to main logger

    Returns
    -------
    bool or int
        False if any problem (nbmiss+prt != nbgenomes or nbmiss+gen != nbgenomes). If no
        problem, returns the number of proteins/genes extracted
    """
    logger.log(utils.detail_lvl(), f"Checking extractions for family {num_fam}")

    # Check that extractions went well
    nbmiss = utils.count(miss_file)
    # If files with proteins extracted do not even exist, close with error
    # (they should have been created at the previous step)
    if not os.path.isfile(gen_file):
        logger.error(f"fam {num_fam}: no file with genes extracted "
                     f"('{gen_file}'). Cannot align.")
        sys.exit(1)
    if not os.path.isfile(prt_file):
        logger.error(f"fam {num_fam}: no file with proteins extracted "
                     f"('{prt_file}'). Cannot align.")
        sys.exit(1)
    nbfprt = utils.grep(prt_file, "^>", counts=True)
    nbfgen = utils.grep(gen_file, "^>", counts=True)
    if nbmiss + nbfprt != ngenomes:
        logger.error(("fam {}: wrong sum of missing genomes ({}) and prt extracted ({}) for {} "
                      "genomes in the dataset.").format(num_fam, nbmiss, nbfprt, ngenomes))
        return False
    if nbmiss + nbfgen != ngenomes:
        logger.error(("fam {}: wrong sum of missing genomes ({}) and gen extracted ({}) for {} "
                      "genomes in the dataset.").format(num_fam, nbmiss, nbfgen, ngenomes))
        return False
    return nbfprt
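utils.count and utils.grep are not shown here. Assuming grep(filein, pattern, counts=True) returns the number of lines matching the regex (as its use above suggests), a sketch could be:

import re

def grep(filein, pattern, counts=False):
    """Return the lines of filein matching the regex pattern, or only
    their number when counts=True (a guess at the real helper)."""
    regex = re.compile(pattern)
    with open(filein) as inf:
        matched = [line for line in inf if regex.search(line)]
    return len(matched) if counts else matched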
Example #4
def handle_genome(args):
    """
    For a given genome, check if it has been annotated (in results), if annotation
    (by prokka or prodigal) ran without problems (result = True). In that case,
    format the genome and get the output to see if everything went ok.

    Parameters
    ----------
    args : tuple
        (genome, name, gpath, annot_path, lst_dir, prot_dir,\
         gene_dir, rep_dir, gff_dir, prodigal_only, q)\
         with:

         * genome : original genome name
         * name : gembase name of the genome
         * gpath : path to the genome sequence which was given to prokka/prodigal for annotation
         * annot_path : directory where prokka/prodigal folders are saved
         * lst_dir : path to 'LSTINFO' folder
         * prot_dir : path to 'Proteins' folder
         * gene_dir : path to 'Genes' folder
         * rep_dir : path to 'Replicons' folder
         * gff_dir : path to 'gff3' folder
         * prodigal_only : True if annotated by prodigal, False if annotated by prokka
         * q : multiprocessing.managers.AutoProxy[Queue] queue to put logs during subprocess

    Returns
    -------
    (bool, str) :

        * True if genome was annotated as expected, False otherwise
        * genome name (used to get info from the pool.map_async)
    """
    (genome, name, gpath, annot_path, lst_dir, prot_dir, gene_dir, rep_dir,
     gff_dir, prodigal_only, q) = args

    # Define which formatting must be used, given the annotation software
    if prodigal_only:
        format_one_genome = fprodigal.format_one_genome
    else:
        format_one_genome = fprokka.format_one_genome
    # Set logger for this process
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    logger = logging.getLogger('format.handle_genome')
    # Handle genome
    ok_format = format_one_genome(gpath, name, annot_path, lst_dir, prot_dir,
                                  gene_dir, rep_dir, gff_dir)
    return ok_format, genome
Example #5
def my_logger(name):
    """
    Return a queue and a logger, to give to functions called by a subprocess
    """
    m = multiprocessing.Manager()
    q = m.Queue()
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    return q, logging.getLogger(name)
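Worker-side usage of my_logger, as a minimal sketch (the logger name is hypothetical). The queue it returns is meant to be drained by a listener such as the one sketched in Example #1:

q, logger = my_logger("annotate.worker")
logger.info("starting job")  # the record goes into the queue, not to a file yet
q.put(None)                  # sentinel so a logger_thread consumer can stop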
Example #6
def mafft_align(num_fam, prt_file, mafft_file, nbfprt, logger):
    """
    Align all proteins of the given family with mafft

    Parameters
    ----------
    num_fam : int
        current family number
    prt_file : str
        path to file containing all proteins extracted
    mafft_file : str
        path to file which will contain proteins alignment
    nbfprt : int
        number of proteins extracted in prt file
    logger : logging.Logger
        logger with queueHandler to give logs to main logger

    Returns
    -------
    bool
        True if no problem (alignment ok, same number of proteins extracted and aligned),
        False otherwise
    """
    logger.log(utils.detail_lvl(), f"Aligning family {num_fam}")
    cmd = f"mafft --auto {prt_file}"
    error = f"Problem while trying to align fam {num_fam}"
    stdout = open(mafft_file, "w")
    stderr = open(mafft_file + ".log", "w")
    logger.log(utils.detail_lvl(), f"Mafft command: {cmd}")
    ret = utils.run_cmd(cmd, error, stdout=stdout, stderr=stderr, logger=logger)
    stdout.close()
    stderr.close()
    if not isinstance(ret, int):
        ret = ret.returncode
    if ret != 0:
        os.remove(mafft_file)
        return False
    message = (f"fam {num_fam}: different number of proteins extracted in {prt_file} ({nbfprt}) and proteins "
               f"aligned in {mafft_file}")
    return check_nb_seqs(mafft_file, nbfprt, logger, message)
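check_nb_seqs is referenced here (and in back_translate further down) but not shown. Assuming it simply compares the number of FASTA headers in the file to the expected count, a sketch could be:

def check_nb_seqs(alnfile, expected, logger, message):
    """Return True if alnfile contains 'expected' sequences, counted
    as '>' header lines; otherwise log the given message and return
    False (a guess at the real helper's contract)."""
    with open(alnfile) as alnf:
        nbseqs = sum(1 for line in alnf if line.startswith(">"))
    if nbseqs != expected:
        logger.error(message)
        return False
    return True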
Example #7
def test_log_no_listen(capsys):
    """
    Check that when we log to a queue via a QueueHandler, but never listen to the queue,
    there is nothing in stderr/stdout/files
    """
    import multiprocessing

    # Create Queue, QueueHandler, and log messages to it
    m = multiprocessing.Manager()
    q = m.Queue()
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    logger = logging.getLogger('process')
    logger.debug("debug message")
    logger.log(utils.detail_lvl(), "detail message")
    logger.info("info message")
    logger.warning("warning message")
    logger.error("error message")
    logger.critical("critical message")
    q.put(None)

    # Initialize real logger
    logfile = os.path.join(GENEPATH, "test_log_no_listen")
    utils.init_logger(logfile, 0, '')

    assert q.qsize() == 7
    out, err = capsys.readouterr()
    assert out == ""
    assert err == ""
    with open(logfile + ".log", "r") as logf:
        assert logf.readlines() == []
    with open(logfile + ".log.details") as logf:
        assert logf.readlines() == []
    with open(logfile + ".log.err", "r") as logf:
        assert logf.readlines() == []
Example #8
def read_alignments(all_alns, all_genomes):
    """
    Read alignment file, and assign each sequence to a genome

    Parameters
    ----------
    all_alns : str
        path to file containing all alignments concatenated
    all_genomes : list
        list of all genomes

    Returns
    -------
    dict or None
        - {genome_name: [list of sequences for this genome]}
        - None if problem with a protein for which we don't find the genome
    """
    sequences = {}  # name: [ordered list of sequences]
    genome = None
    seq = ""
    with open(all_alns, 'r') as alnf:
        for line in alnf:
            if line.startswith(">"):
                # If new header, write previous protein name/sequence to 'sequences'
                if genome and seq:
                    sequences[genome].append(seq)
                    seq = ""
                # Get new genome header
                genome = get_genome(line, all_genomes)
                if not genome:
                    return None
                if genome not in sequences:
                    sequences[genome] = []
            else:
                seq += line.strip()
    if genome and seq:
        sequences[genome].append(seq)
    per_genome = [len(seqs) for seqs in sequences.values()]
    if len(set(per_genome)) != 1:
        logger.error(
            "Problems occurred while grouping alignments by genome: all genomes "
            "do not have the same number of sequences. Check that each protein "
            "name contains the name of the genome from which it comes.")
        return None
    logger.log(utils.detail_lvl(),
               f"{per_genome[0]} sequences found per genome")
    return sequences
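get_genome is not shown. Given that each sequence header is expected to contain the name of the genome it comes from (see the error message above), a plausible sketch is:

def get_genome(header, all_genomes):
    """Return the name of the genome found in the FASTA header, or
    None if no known genome matches (an assumption about the real
    helper's behavior)."""
    for genome in all_genomes:
        if genome in header:
            return genome
    logger.error(f"Unable to find the genome name in {header.strip()}.")
    return None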
Example #9
def write_groups(outfile, sequences):
    """
    Writing alignments per genome to output file.

    Parameters
    ----------
    outfile : str
        path to file that will contain alignments grouped by genome
    sequences : dict
        {genome_name: [list of sequences (DNA, prot...) for this genome]}
    """
    logger.log(utils.detail_lvl(), "Writing alignments per genome")
    with open(outfile, "w") as outf:
        for genome in sorted(sequences, key=utils.sort_genomes_by_name):
            # write header for genome
            outf.write(">" + genome + "\n")
            # Write all sequences
            outf.write("".join(sequences[genome]) + "\n")
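A small usage sketch for write_groups, with hypothetical data; all sequences of a genome end up concatenated under a single header:

sequences = {"genome1": ["ATGAAA", "ATGCCC"], "genome2": ["ATGTTT", "ATGGGG"]}
write_groups("grouped.aln", sequences)
# grouped.aln now contains one entry per genome, e.g.:
# >genome1
# ATGAAAATGCCC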
Example #10
def back_translate(num_fam, mafft_file, gen_file, btr_file, nbfal, logger):
    """
    Backtranslate protein alignment to nucleotides

    Parameters
    ----------
    num_fam : int
        current family number. Used for log messages
    mafft_file : str
        path to file containing protein alignments by mafft
    gen_file : str
        path to file containing all sequences, not aligned, in nucleotides. It is used to
        convert the alignment in proteins into a nucleotide alignment
    btr_file : str
        path to the file that will contain the nucleotide alignment
    nbfal : int
        number of sequences aligned for the family by mafft
    logger : logging.Logger
        logger with queueHandler to give logs to main logger

    Returns
    -------
    bool or int
        - False if problem (back-translation, different number of sequences...)
        - number of sequences in btr file if everything went well
    """
    logger.log(utils.detail_lvl(), f"Back-translating family {num_fam}")
    curpath = os.path.dirname(os.path.abspath(__file__))
    awk_script = os.path.join(curpath, "prt2codon.awk")
    cmd = f"awk -f {awk_script} {mafft_file} {gen_file}"
    stdout = open(btr_file, "w")
    error = f"Problem while trying to backtranslate {mafft_file} to a nucleotide alignment"
    ret = utils.run_cmd(cmd, error, stdout=stdout, logger=logger)
    stdout.close()
    if not isinstance(ret, int):
        ret = ret.returncode
    if ret != 0:
        os.remove(btr_file)
        return False
    message = (f"fam {num_fam}: different number of proteins aligned in {mafft_file} ({nbfal}) and genes "
               f"back-translated in {btr_file}")
    # Check number of sequences in btr file, and return True/False according to it
    # It should contain the same number of sequences as the mafft file.
    return check_nb_seqs(mafft_file, nbfal, logger, message)
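The prt2codon.awk script is not shown. The idea behind this kind of back-translation is to replace each aligned amino acid with the corresponding codon taken from the unaligned nucleotide sequence, and each gap with '---'. A rough Python illustration for a single pair of sequences (not the awk script's actual code):

def backtranslate_seq(prot_aln, nucl_seq):
    """Map one aligned protein sequence onto its unaligned nucleotide
    sequence: each residue consumes one codon, each '-' becomes '---'."""
    codons = []
    pos = 0
    for aa in prot_aln:
        if aa == "-":
            codons.append("---")
        else:
            codons.append(nucl_seq[pos:pos + 3])
            pos += 3
    return "".join(codons)

# backtranslate_seq("M-K", "ATGAAA") -> "ATG---AAA"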
Example #11
def handle_family(args):
    """
    For the given family:

    - align its proteins with mafft
    - back-translate to nucleotides
    - add missing genomes

    Parameters
    ----------
    args : tuple
         (prefix, num_fam, ngenomes, q) with:

         - prefix: path to ``aldir/<name of dataset>``
         - num_fam: the current family number
         - ngenomes: the total number of genomes in dataset
         - q: a queue, which will be used by logger to put logs while in other process

    Returns
    -------
    bool or str
        - "OK" if the files were not re-created, and have the expected format. This is used by
          ``align_all_families`` function, to know if something was regenerated, or if everything
          already existed with the expected format. If something was redone and concat/group files
          exist, it removes them.
        - False if any problem (extractions, alignment, btr, add missing genomes...)
        - True if just generated all files, and everything is ok
    """
    prefix, num_fam, ngenomes, q = args
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    logger = logging.getLogger('align.align_family')
    return handle_family_1thread((prefix, num_fam, ngenomes))
Example #12
def main(cmd, ncbi_species_name, ncbi_species_taxid, ncbi_taxid, ncbi_strains,
         levels, ncbi_section, outdir, tmp_dir, threads, norefseq, db_dir,
         only_mash, info_file, l90, nbcont, cutn, min_dist, max_dist, verbose,
         quiet):
    """
    Main method, constructing the draft dataset for the given species

    verbosity:
    - default 0 : stdout contains INFO, stderr contains ERROR, .log contains INFO and more, .log.err contains WARNING and more
    - 1: same as 0 + WARNING in stderr
    - 2: same as 1 + DETAILS in stdout + DETAILS in .log.details
    - >=15: same as 2 + Add DEBUG in stdout + create .log.debug with everything from info to debug


    Parameters
    ----------
    cmd : str
        command line used to launch this program
    ncbi_species_name : str
        name of species to download, as given by NCBI
    ncbi_species_taxid : int
        species taxid given in NCBI
    ncbi_taxid : int
        NCBI taxid (sub-species)
    ncbi_strains : str
        specific strains to download
    levels: str
        Level of assembly to download. Choice between 'all', 'complete', 'chromosome',
        'scaffold', 'contig'. Default is 'all'
    outdir : str
        path to output directory (where created database will be saved).
    tmp_dir : str
        Path to directory where tmp files are saved (sequences split at each row of 5 'N')
    threads : int
        max number of threads to use
    norefseq : bool
        True if user does not want to download again the database
    db_dir : str
        Name of the folder where already downloaded fasta files are saved.
    only_mash : bool
        True if the user already has the database and quality of each genome (L90, #contigs etc.)
    info_file : str
        File containing information on QC if it was already ran before (columns to_annotate,
        gsize, nb_conts and L90).
    l90 : int
        Max L90 allowed to keep a genome
    nbcont : int
        Max number of contigs allowed to keep a genome
    cutn : int
        cut each time there are 'cutn' 'N' in a row. Don't cut if equal to 0
    min_dist : int
        lower limit of distance between 2 genomes to keep them
    max_dist : int
        upper limit of distance between 2 genomes to keep them (default is 0.06)
    verbose : int
        verbosity:
        - default 0 : stdout contains INFO, stderr contains ERROR, .log contains INFO and more,
          .log.err contains WARNING and more
        - 1: same as 0 + WARNING in stderr
        - 2: same as 1 + DETAILS in stdout + DETAILS in .log.details
        - >=15: same as 2 + Add DEBUG in stdout + create .log.debug with everything
          from info to debug
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise
    """

    # get species name in NCBI format
    # -> will be used to name output directory
    # -> will be used to download summary file if given species corresponds to NCBI name
    if ncbi_species_name:
        species_linked = "_".join(ncbi_species_name.split())
        species_linked = "_".join(species_linked.split("/"))

    # if species name not given by user, use species taxID (if given) to name output directory
    elif ncbi_species_taxid:
        species_linked = str(ncbi_species_taxid)
    # if neither species name nor species taxid given by user, use taxID (if given) to name output directory
    elif ncbi_taxid:
        species_linked = str(ncbi_taxid)
    # If no species nor taxID, get specific strain names
    elif ncbi_strains:
        if os.path.isfile(ncbi_strains):
            species_linked = os.path.basename(ncbi_strains)
            species_linked = os.path.splitext(species_linked)[0]
        else:
            species_linked = "_".join(ncbi_strains.split())
            species_linked = "-".join(species_linked.split("/"))
            species_linked = "_and_".join(species_linked.split(","))
    # if neither speName, speID, taxID nor strainName given (--norefseq, mashonly), name is NA
    else:
        species_linked = "NA"
    # Default outdir is species name if given, or species taxID
    if not outdir:
        outdir = species_linked
    # Default tmp_dir is outdir/tmp_files
    if not tmp_dir:
        tmp_dir = os.path.join(outdir, "tmp_files")
    # directory that will be created by ncbi_genome_download
    ncbidir = os.path.join(outdir, ncbi_section, "bacteria")
    os.makedirs(outdir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    # Initialize logger
    # set level of logger: level is the minimum level that will be considered.
    if verbose <= 1:
        level = logging.INFO
    # for verbose = 2, ignore only debug
    if verbose >= 2 and verbose < 15:
        level = utils.detail_lvl()  # int corresponding to detail level
    # for verbose >= 15, write everything
    if verbose >= 15:
        level = logging.DEBUG
    logfile_base = os.path.join(outdir,
                                "PanACoTA_prepare_{}").format(species_linked)
    logfile_base, logger = utils.init_logger(logfile_base,
                                             level,
                                             'prepare',
                                             log_details=True,
                                             verbose=verbose,
                                             quiet=quiet)

    # Message on what will be done (cmd, cores used)
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)
    message = f"'PanACoTA prepare' will run on {threads} "
    message += f"cores" if threads > 1 else "core"
    logger.info(message)

    # Start prepare step
    # Run more than only mash filter (!only_mash):
    # - start from QC and mash (norefseq)
    # - start from genome download (!norefseq))
    if not only_mash:
        # Not only mash, so a new info file will be created. If the user still gave an info
        # file (he will be warned that it will be ignored), rename it with '.back'
        # to avoid erasing it
        if info_file and os.path.isfile(info_file):
            os.rename(info_file, info_file + ".back")

        # 'norefseq = True': Do not download genomes, just do QC and mash filter on given genomes
        # -> if not, error and exit
        if norefseq:
            logger.warning(f'You asked to skip {ncbi_section} downloads.')

            # -> if db_dir given, watch for sequences there. If does not exist, error and exit
            # (user gave a directory (even if it does not exist), so we won't look for
            # the sequences in other folders)
            if db_dir:
                if not os.path.exists(db_dir):
                    logger.error(
                        f"Database folder {db_dir} supposed to contain fasta "
                        "sequences does not "
                        "exist. Please give a valid folder, or leave the default "
                        "directory (no '-d' option).")
                    sys.exit(1)
            # -> If user did not give db_dir, genomes could be in
            # outdir/Database_init/<genome_name>.fna
            else:
                db_dir = os.path.join(outdir, "Database_init")
                # If it does not exist, check if default compressed files folder exists.
                if not os.path.exists(db_dir):
                    logger.warning(
                        f"Database folder {db_dir} supposed to contain fasta "
                        "sequences does not "
                        "exist. We will check if the download folder (with compressed "
                        "sequences) exists.")
                    # -> if not in database_init, genomes must be in
                    # outdir/refseq/bacteria/<genome_name>.fna.gz. In that case,
                    # uncompress and add them to Database_init
                    if not os.path.exists(ncbidir):
                        logger.error(
                            f"Folder {ncbidir} does not exist. You do not have any "
                            "genome to analyse. Possible reasons:\n"
                            "- if you want to rerun analysis in the same folder as "
                            "sequences were downloaded (my_outdir/Database_init or "
                            f"my_outdir/{ncbi_section}), make sure you have '-o my_outdir' "
                            "option\n"
                            "- if you want to rerun analysis and save them in a new "
                            "output folder called 'new_outdir', make sure you have "
                            "'-o new_outdir' option, "
                            "and you specified where the uncompressed sequences to "
                            "use are ('-d sequence_database_path'). ")
                        sys.exit(1)
                    # add genomes from refseq/bacteria folder to Database_init
                    nb_gen, _ = dgf.to_database(outdir, ncbi_section)
        # No sequence: Do all steps -> download, QC, mash filter
        else:
            # Download all genomes of the given taxID
            db_dir, nb_gen = dgf.download_from_ncbi(species_linked,
                                                    ncbi_section,
                                                    ncbi_species_name,
                                                    ncbi_species_taxid,
                                                    ncbi_taxid, ncbi_strains,
                                                    levels, outdir, threads)
            logger.info(f"{nb_gen} {ncbi_section} genome(s) downloaded")

        # Now that genomes are downloaded and uncompressed, check their quality to remove bad ones
        genomes = fg.check_quality(species_linked, db_dir, tmp_dir, l90,
                                   nbcont, cutn)

    # Do only mash filter. Genomes must be already downloaded, and there must be a file with
    # all information on these genomes (L90 etc.)
    else:
        logger.warning('You asked to run only mash steps.')
        if not os.path.exists(
                info_file):  # info-file missing -> error and exit
            logger.error(
                f"Your info file {info_file} does not exist. Please provide the "
                "right name/path, or remove the '--mash-only' option to rerun "
                "quality control.")
            sys.exit(1)
        logger.info(("You want to run only mash steps. Getting information "
                     "from {}").format(info_file))
        genomes = utils.read_genomes_info(
            info_file,
            species_linked,
        )

    # Run Mash
    # genomes : {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}
    # sorted_genome : [genome_file] ordered by L90/nbcont (keys of genomes)
    sorted_genomes = fg.sort_genomes_minhash(genomes, l90, nbcont)

    # Write discarded genomes to a file -> orig_name, to_annotate, gsize, nb_conts, L90
    discQC = f"by-L90_nbcont-{species_linked}.txt"
    utils.write_genomes_info(genomes, sorted_genomes, discQC, outdir)

    # Remove genomes not corresponding to mash filters
    removed = fg.iterative_mash(sorted_genomes, genomes, outdir,
                                species_linked, min_dist, max_dist, threads,
                                quiet)
    # Write list of genomes kept, and list of genomes discarded by mash step
    info_file = fg.write_outputfiles(genomes, sorted_genomes, removed, outdir,
                                     species_linked, min_dist, max_dist)
    logger.info("End")
    return info_file
Example #13
def create_gen(ffnseq, lstfile, genseq):
    """
    Generate .gen file, from sequences contained in .ffn, but changing the
    headers using the information in .lst

    Parameters
    ----------
    ffnseq : str
        .ffn file generated by prokka
    lstfile : str
        lstfile converted from prokka tbl file
    genseq : str
        output file, to write in Genes directory

    Returns
    -------
    bool :
        True if conversion went well, False otherwise
    """
    problem = False
    write = True  # Write next sequence
    with open(ffnseq) as ffn, open(lstfile) as lst, open(genseq, "w") as gen:
        for line_ffn in ffn:
            # Ignore gene that we do not want to write (should be a crispr)
            # If line of sequence, write it as is, and go to next line
            if not line_ffn.startswith(">"):
                # We just read a seq line. If we can write (write is True), do it and go
                # to next line
                # Otherwise, just go to next line
                if write:
                    gen.write(line_ffn)
                continue
            # Try to get the gene ID. If it does not work, ignore this gene (it may be a
            # CRISPR, and we ignore those).
            test_gen_id = line_ffn.split()[0].split("_")[-1]
            if not test_gen_id.isdigit():
                # Maybe a CRISPR? Or wrong gene name? -> ignore
                logger.log(
                    utils.detail_lvl(),
                    f"Unknown header format for {line_ffn.strip()}. "
                    "This gene will be ignored in .gen output file.")
                write = False
                continue
            # If ffn contains a gene header, find its information in lst file
            else:
                write = True
                lstline = lst.readline().strip()
                gen_id = int(test_gen_id)
                # genID exists, ffn header is for a gene. Check that it corresponds to
                # information in lst file.
                id_lst = lstline.split("\t")[4].split("_")[-1]
                gen_id_lst = int(id_lst)
                # in lst, find the same gene ID as in ffn (some gene IDs in lst can be absent
                # from ffn, if prokka does not give their sequence).
                # As they are ordered by increasing number, go to next lstline until
                # corresponding gene ID is found. However, if ffn ID > lst ID: ID does not
                # exist in .lst -> problem.
                while gen_id > gen_id_lst:
                    lstline = lst.readline().strip()
                    if not lstline:
                        gen_id_lst = -1  # sentinel: no matching gene ID left in lst
                        break
                    id_lst = lstline.split("\t")[4].split("_")[-1]
                    gen_id_lst = int(id_lst)
                # If it found the same gene ID, write info in gene file
                if gen_id == gen_id_lst:
                    general.write_header(lstline.strip(), gen)
                # If gene ID of ffn not found, write error message and stop
                else:
                    logger.error(
                        f"Missing info for gene {line_ffn.strip()} "
                        f"(from {ffnseq}) in {lstfile}. If it is actually present "
                        "in the lst file, check that genes are ordered by increasing number in both lst and ffn files."
                    )
                    return False
    return True
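general.write_header is not shown. From its use above, it presumably builds a '>' FASTA header from a lst line and writes it to the output file; a minimal sketch under that assumption (the real helper likely writes more metadata):

def write_header(lstline, outfile):
    """Write a FASTA header built from a tab-separated lst line, whose
    5th field holds the gene name (an assumption about the real helper)."""
    name = lstline.split("\t")[4]
    outfile.write(f">{name}\n")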
Example #14
def run_prodigal(arguments):
    """
    Run prodigal for the given genome.

    Parameters
    ----------
    arguments : tuple
        (gpath, prodigal_folder, cores_annot, name, force, nbcont, gpath_train, q) with:

        * gpath: path and filename of genome to annotate
        * prodigal_folder: path to folder where all prodigal folders for all genomes are saved
        * cores_annot: how many cores can use prodigal
        * name: output name of annotated genome
        * force: True if force run (override existing files), False otherwise
        * nbcont: number of contigs in the input genome, to check prodigal results
        * gpath_train: path to the training file, or 'small option' if contigs are too small (<20000bp), in which case prodigal is run with the '-p meta' option
        * q : queue where logs are put

    Returns
    -------
    bool
        True if everything went well (all needed output files present,
        corresponding numbers of proteins, genes etc.). False otherwise.
    """
    gpath, prodigal_folder, threads, name, force, nbcont, gpath_train, q = arguments
    # Set logger for this process, which will be given to all subprocess
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    logger = logging.getLogger('annotate.run_prodigal')
    # Define prodigal directory and logfile, and check their existence
    # By default, prodigal is in tmp_folder -> resdir/tmp_files/genome-prodigalRes
    g_ori_name = os.path.basename(gpath)
    prodigal_dir = os.path.join(prodigal_folder, g_ori_name + "-prodigalRes")
    prodigal_logfile = os.path.join(prodigal_folder, g_ori_name + "-prodigal.log")
    prodigal_logfile_err = os.path.join(prodigal_folder, g_ori_name + "-prodigal.log.err")

    # If result dir exists but user wants to force, remove this result dir
    if os.path.isdir(prodigal_dir) and force:
        shutil.rmtree(prodigal_dir)
        logger.warning("Prodigal results folder already exists, but is removed because "
                       "--force option was used.")

    # gpath_train can be "small option", meaning that the training mode was not used.
    # Otherwise, it is the path to the training file. If the training file does not exist
    # and the prodigal result directory does not exist either, return False:
    # we cannot annotate using nothing.
    # This happens if there was a problem while training.
    if (gpath_train != "small option" and not os.path.isfile(gpath_train) 
        and not os.path.isdir(prodigal_dir)):
        return False

    logger.log(utils.detail_lvl(), f"Start annotating {name} (from {gpath} sequence) "
                                     "with Prodigal")
    # If prodigal results dir already exists (meaning user did not want to force,
    # otherwise it would have been deleted just before),
    # can we use it for next step ? -> check content.
    if os.path.isdir(prodigal_dir):
        logger.warning(f"Prodigal results folder {prodigal_dir} already exists.")
        ok = check_prodigal(gpath, name, prodigal_dir, logger)
        # If everything ok in the result dir, do not rerun prodigal,
        # use those results for next step (formatting)
        if ok:
            logger.log(utils.detail_lvl(), "Prodigal did not run again. "
                                           "Formatting step will use already generated results of "
                                           "Prodigal in {}. If you want to re-run Prodigal, first "
                                           "remove this result folder, or use '-F' or '--force' "
                                           "option.".format(prodigal_dir))

            logger.log(utils.detail_lvl(), f"End annotating {name} (from {gpath})")
        # If missing files, or other problems in result dir, error message,
        # ask user to force or remove this folder.
        else:
            logger.warning("Problems in the files contained in your already existing output dir "
                           f"({prodigal_dir}). Please check it, or remove it to "
                           "re-annotate.")
        # If everything was ok -> everything is ready for next step -> return True
        # If something is wrong -> cannot use those results, genome won't be annotated
        # -> return False
        return ok
    else:
        # We are sure prodigal result dir does not exist yet, because either:
        #     - never existed
        #     - removed because user asked to force
        #     - exists but left function, so does not go until this line
        #        -> either if files inside are ok or not
        # So make prodigal_dir (not automatically created by prodigal)
        os.makedirs(prodigal_dir)

    # Prodigal_directory is empty and ready to get prodigal results
    basic_outname = os.path.join(prodigal_dir, name)
    # Define cmd, stderr and stdout files, and error to write if problem.
    error = (f"Error while trying to run prodigal. See {prodigal_logfile_err}.")
    prodigalf = open(prodigal_logfile, "w")
    prodigalferr = open(prodigal_logfile_err, "w")
    if gpath_train == "small option":
        training = "-p meta"
    else:
        training = f"-t {gpath_train}"
    cmd = (f"prodigal -i {gpath} -d {basic_outname + '.ffn'} -a {basic_outname + '.faa'} "
           f"-f gff -o {basic_outname + '.gff'} {training} -q")
    logger.log(utils.detail_lvl(), "Prodigal command: " + cmd)

    ret = utils.run_cmd(cmd, error, eof=False, stderr=prodigalferr, stdout=prodigalf,
                        logger=logger)
    prodigalf.close()
    prodigalferr.close()
    if ret.returncode == 0:
        logger.log(utils.detail_lvl(), f"End annotating {name} (from {gpath})")
        return True
    else:
        return False
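A hedged sketch of how run_prodigal might be dispatched over several genomes with a multiprocessing pool, reusing the Manager queue pattern seen throughout these examples (all input values below are hypothetical):

import multiprocessing

# Hypothetical inputs: (sequence path, output name, expected contig count)
genomes = [("genomes/genome1.fasta", "ESCO.0421.00001", 3)]
gpath_train = "tmp_files/genome1.fasta.trn"  # from prodigal_train (Example #2)

m = multiprocessing.Manager()
q = m.Queue()
arguments = [(gpath, "tmp_files", 1, name, False, nbcont, gpath_train, q)
             for (gpath, name, nbcont) in genomes]
with multiprocessing.Pool(processes=4) as pool:
    results = pool.map(run_prodigal, arguments)
q.put(None)  # sentinel for the log-listener thread (see Example #1)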
Example #15
def add_missing_genomes(align_file, ali_type, miss_file, num_fam, ngenomes, status1, logger):
    """
    Once all family proteins are aligned, and back-translated to nucleotides,
    add missing genomes for the family to the alignment with '-'.
    (Add missing genomes to both mafft alignment and back-translated alignment)

    Parameters
    ----------
    align_file : str
        path to file containing alignments (proteins if from mafft output, 
        or nucleic sequences if after backtranslating them)
    ali_type : str
        protein or backtranslated
    miss_file : str
        path to file containing the list of missing genomes in this family
    num_fam : int
        family number
    ngenomes : int
        total number of genomes in dataset 
    status1 : bool or str
        - "OK" if we did not redo the alignments as they already were as expected. In that case,
          if missing genomes are already present, just add a warning message saying that we
          used the already existing btr file.
        - True if we just did the alignments and backtranslate. So no warning message needed.
        - False if problem with extraction, alignment or backtranslation (will never happen as
          this function is not called if status1 == False)
    logger : logging.Logger
        the logger, having a queue Handler, to give logs to the main logger

    Returns
    -------
    bool or str
        - "OK" if btr file was not recreated, and already has the right number of sequences,
          and all with the same length.
        - False if problem in btr file alignment, so missing genomes not added 
        - True if alignment + adding missing genomes is ok. Can happen if there is no missing
          genome for this family (in that case, btr generated already has the right number of
          sequences), or if we just added the missing genomes.

    """
    # btr_file should always exist.
    # Sometimes it comes from previous step ('missing genomes' are missing)
    # Sometimes it comes from a previous run (all genomes should be here)
    status = check_add_missing(align_file, num_fam, ngenomes, logger, prev=True)
    # If btr_file has the correct number of sequences, all the same length, return True
    if status is True:
        if status1 == "OK":
            logger.warning(f"{ali_type} alignment already done for family {num_fam}. The program will use "
                            "it for next steps")
            return "OK"
        else:
            return True
    # If btr_files has problem in alignment (not all sequences with same size)
    elif status is False:
        return False
    # All sequences have same length but some genomes are missing -> Add missing genomes
    # status is length of sequence (if it was True or False, it already ended this function)
    logger.log(utils.detail_lvl(), f"Adding missing genomes for family {num_fam} in {ali_type} alignment.")
    len_aln = status
    with open(miss_file, "r") as missf, open(align_file, "a") as alif:
        for genome in missf:
            genome = genome.strip()
            toadd = ">" + genome + "\n" + "-" * len_aln + "\n"
            alif.write(toadd)
    # check_add_missing called with prev=False :
    # output is True if all ok, or False if problems. Cannot be sequence length (as it can be with prev=True)
    ret = check_add_missing(align_file, num_fam, ngenomes, logger, prev=False)
    return ret
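check_add_missing is not shown. From its use here, with prev=True it apparently returns True (complete, consistent alignment), False (sequences of different lengths), or the common sequence length (consistent alignment but genomes missing); with prev=False it returns only True or False. A sketch under those assumptions:

def check_add_missing(align_file, num_fam, ngenomes, logger, prev=False):
    """Check that all sequences in align_file have the same length and
    compare their number to ngenomes (a guess at the real helper)."""
    lengths = set()
    nbseqs = 0
    seq = ""
    with open(align_file) as alnf:
        for line in alnf:
            if line.startswith(">"):
                if seq:
                    lengths.add(len(seq))
                    seq = ""
                nbseqs += 1
            else:
                seq += line.strip()
    if seq:
        lengths.add(len(seq))
    if len(lengths) != 1:
        logger.error(f"fam {num_fam}: sequences do not all have the same length.")
        return False
    if nbseqs == ngenomes:
        return True
    # Consistent alignment but genomes missing: report the length only
    # when checking the file coming from a previous step (prev=True)
    return lengths.pop() if prev else False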
Example #16
def run_prokka(arguments):
    """
    Run prokka for the given genome.

    Parameters
    ----------
    arguments : tuple
        (gpath, prok_folder, cores_annot, name, force, nbcont, small, q) with:

        * gpath: path and filename of genome to annotate
        * prok_folder: path to folder where all prokka folders for all genomes are saved
        * cores_annot: how many cores can use prokka
        * name: output name of annotated genome
        * force: True if force run (override existing files), False otherwise
        * nbcont: number of contigs in the input genome, to check prokka results
        * small: used for prodigal, if sequences to annotate are small. Not used here
        * q : queue where logs are put

    Returns
    -------
    bool
        True if everything went well (all needed output files present,
        corresponding numbers of proteins, genes etc.). False otherwise.
    """
    gpath, prok_folder, threads, name, force, nbcont, _, q = arguments
    # Set logger for this process
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    logger = logging.getLogger('annotate.run_prokka')
    logger.log(utils.detail_lvl(), f"Start annotating {name} from {gpath} with Prokka")

    # Define prokka directory and logfile, and check their existence
    prok_dir = os.path.join(prok_folder, os.path.basename(gpath) + "-prokkaRes")
    fnull = open(os.devnull, 'w')
    prok_logfile = os.path.join(prok_folder, os.path.basename(gpath) + "-prokka.log")
    # If result dir already exists, check if we can use it for the next step or not
    if os.path.isdir(prok_dir) and not force:
        logger.warning(f"Prokka results folder {prok_dir} already exists.")
        ok = check_prokka(prok_dir, prok_logfile, name, gpath, nbcont, logger)
        # If everything ok in the result dir, do not rerun prokka,
        # use those results for next step (formatting)
        if ok:
            logger.log(utils.detail_lvl(), "Prokka did not run again, "
                       "formatting step used already generated results of "
                       f"Prokka in {prok_dir}. If you want to re-run prokka, first "
                       "remove this result folder, or use '-F' or '--force' "
                       "option if you want to rerun prokka for all genomes.")
            logger.log(utils.detail_lvl(), f"End annotating {name} (from {gpath})")
        # If missing files, or other problems in result dir, error message,
        # ask user to force or remove this folder.
        else:
            logger.warning("Problems in the files contained in your already existing output dir "
                           "({}). Please check it, or remove it to "
                           "re-annotate.".format(prok_dir))
        # If everything was ok -> everything is ready for next step -> return True
        # If something is wrong -> cannot use those results, genome won't be annotated
        # -> return False
        return ok
    # If result dir exists but user wants to force, remove this result dir
    elif os.path.isdir(prok_dir) and force:
        shutil.rmtree(prok_dir)
        logger.warning("Prokka results folder already exists, but removed because --force option "
                       "used")
    # Now that we checked and solved those cases:
    #     - outdir exists (problems or not, we returned appropriate boolean)
    #     - if outdir existed but force was used, it was removed
    # So, outdir does not exist -> run prokka
    cmd = (f"prokka --outdir {prok_dir} --cpus {threads} "
           f"--prefix {name} --centre prokka {gpath}")
    error = (f"Error while trying to run prokka on {name} from {gpath}")
    logger.log(utils.detail_lvl(), "Prokka command: " + cmd)
    prokf = open(prok_logfile, "w")
    ret = utils.run_cmd(cmd, error, eof=False, stderr=prokf, logger=logger)
    prokf.close()
    if ret.returncode != 0:
        return False
    ok = check_prokka(prok_dir, prok_logfile, name, gpath, nbcont, logger)
    logger.log(utils.detail_lvl(), f"End annotating {name} from {gpath}.")
    return ok
Example #17
def main(cmd, lstinfo, name, dbpath, min_id, outdir, clust_mode, spe_dir, threads, outfile=None,
         verbose=0, quiet=False):
    """
    Main method, doing all steps:

    - concatenate all protein files
    - create database as ffindex
    - cluster all proteins
    - convert to pangenome file
    - creating summary and matrix of pangenome

    Parameters
    ----------
    lstinfo : str
        file with the names of the genomes to consider for the pangenome in the first column,
        without extension. Other columns are ignored. The first column header must be 'gembase_name'
    name : str
        name given to the dataset. For example, ESCO44 for 44 *Escherichia coli* genomes.
    dbpath : str
        path to the folder containing all protein files (files named after the genomes
        given in lstinfo, with a ".prt" extension)
    min_id : float
        Minimum percentage of identity between 2 proteins to put them in the same family
    outdir : str
        path to folder which will contain pangenome results and tmp files
    clust_mode : [0, 1, 2]
        0 for 'set cover', 1 for 'single-linkage', 2 for 'CD-Hit'
    spe_dir : str or None
        path to the folder where concatenated bank of proteins must be saved.
        None to use the same folder as protein files
    threads : int
        Max number of threads to use
    outfile : str or None
        Name of the pangenome. None to use the default name
    verbose : int
        verbosity:
        - default 0 : stdout contains INFO, stderr contains ERROR.
        - 1: stdout contains INFO, stderr contains WARNING and ERROR
        - 2: stdout contains (DEBUG), DETAIL and INFO, stderr contains WARNING and ERROR
        - >=15: Add DEBUG in stdout
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise
    """
    # import needed packages
    import logging
    from PanACoTA import utils
    from PanACoTA.pangenome_module import protein_seq_functions as protf
    from PanACoTA.pangenome_module import mmseqs_functions as mmf
    from PanACoTA.pangenome_module import post_treatment as pt
    from PanACoTA import __version__ as version

    # test if mmseqs is installed and in the path
    if not utils.check_installed("mmseqs"):  # pragma: no cover
        print("mmseqs is not installed. 'PanACoTA pangenome' cannot run.")
        sys.exit(1)

    os.makedirs(outdir, exist_ok=True)
    # level is the minimum level that will be considered.
    # for verbose = 0 or 1, ignore details and debug, start from info
    if verbose <= 1:
        level = logging.INFO
    # for verbose = 2, ignore only debug
    if verbose >= 2 and verbose < 15:
        level = utils.detail_lvl() # int corresponding to detail level
    # for verbose >= 15, write everything
    if verbose >= 15:
        level = logging.DEBUG
    # name logfile, add timestamp if already existing
    logfile_base = os.path.join(outdir, "PanACoTA-pangenome_" + name)
    utils.init_logger(logfile_base, level, '', verbose=verbose, quiet=quiet, log_details=True)
    logger = logging.getLogger("pangenome")
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)

    # Build bank with all proteins to include in the pangenome
    prt_path = protf.build_prt_bank(lstinfo, dbpath, name, spe_dir, quiet)
    # Do pangenome
    families, panfile = mmf.run_all_pangenome(min_id, clust_mode, outdir,
                                              prt_path, threads, outfile, quiet)
    # Create matrix pan_quali, pan_quanti and summary file
    pt.post_treat(families, panfile)
    logger.info("DONE")
    return panfile
Example #18
def main(cmd, args_all, args_prepare, args_annot, args_pan, args_corepers,
         args_align, args_tree):
    """
    Call all modules, one by one, using output of one as input for the next one

    Parameters
    ----------
    cmd : str
        command line used to launch the program
    args_all : tuple
        arguments common to all modules: output directory (str),
        threads (int), verbose (int), quiet (bool)
    args_prepare : tuple
        arguments for prepare module (see subcommands.prepare.py): NCBI_species_name (str),
        NCBI_species_taxid (int), NCBI_taxid (int), NCBI_strains (str), levels (str),
        NCBI_section (str), tmp_dir (str), norefseq (bool), db_dir (str),
        only_mash (bool), info_file (str), l90 (int), nbcont (int), cutn (int),
        min_dist (float), max_dist (float)
    args_annot : tuple
        arguments for annotate module (see subcommands/annotate.py): name (str), qc_only (bool),
        date (str), prodigal_only (bool), small (bool)
    args_pan : tuple
        arguments for pangenome module (see subcommands/pangenome.py): min_id (float),
        clust_mode (int), spe_dir (str), outfile (str)
    args_corepers : tuple
        arguments for corepers module (see subcommands.corepers.py): tol (float), mixed (bool),
        multi (bool), floor (bool)
    args_align : tuple
        arguments for align module (see subcommands.align.py): prot_ali (bool)
    args_tree : tuple
        arguments for tree module (see subcommands.tree.py): soft (str), model (str), boot (bool),
        write_boot (bool), memory (str), fast (bool)
    """
    outdir, threads, verbose, quiet = args_all
    os.makedirs(outdir, exist_ok=True)
    # Initialize logger
    import logging
    # set level of logger: level is the minimum level that will be considered.
    if verbose <= 1:
        level = logging.INFO
    # for verbose = 2, ignore only debug
    if verbose >= 2 and verbose < 15:
        level = utils.detail_lvl()  # int corresponding to detail level
    # for verbose >= 15, write everything
    if verbose >= 15:
        level = logging.DEBUG
    logfile_base = os.path.join(outdir, "PanACoTA-all_modules")
    logfile_base = utils.init_logger(logfile_base,
                                     level,
                                     name='all_modules',
                                     verbose=verbose,
                                     quiet=quiet)
    logger = logging.getLogger('all_modules')
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)

    # Run prepare module
    outdir_prepare = os.path.join(outdir, "1-prepare_module")
    (NCBI_species_name, NCBI_species_taxid, NCBI_taxid, NCBI_strains, levels,
     NCBI_section, tmp_dir, norefseq, db_dir, only_mash, info_file, l90,
     nbcont, cutn, min_dist, max_dist) = args_prepare
    logger.info("prepare step")
    info_file = prepare.main("PanACoTA prepare", NCBI_species_name,
                             NCBI_species_taxid, NCBI_taxid, NCBI_strains,
                             levels, NCBI_section, outdir_prepare, tmp_dir,
                             threads, norefseq, db_dir, only_mash, info_file,
                             l90, nbcont, cutn, min_dist, max_dist, verbose,
                             quiet)

    # Run annotate module
    list_file = ""
    db_path = ""
    tmp_dir = ""
    force = False
    outdir_annotate = os.path.join(outdir, "2-annotate_module")
    (name, qc_only, date, prodigal_only, small) = args_annot
    res_annot_dir = None

    logger.info("annotate step")
    lstinfo, nbgenomes = annotate.main("PanACoTA annotate",
                                       list_file,
                                       db_path,
                                       outdir_annotate,
                                       name,
                                       date,
                                       l90,
                                       nbcont,
                                       cutn,
                                       threads,
                                       force,
                                       qc_only,
                                       info_file,
                                       tmp_dir,
                                       res_annot_dir,
                                       verbose,
                                       quiet,
                                       prodigal_only=prodigal_only,
                                       small=small)
    if qc_only:
        return "QC_only done"

    # Pangenome step
    name_pan = f"{name}_{nbgenomes}"
    outdir_pan = os.path.join(outdir, "3-pangenome_module")
    dbpath = os.path.join(outdir_annotate, "Proteins")
    (min_id, clust_mode, spe_dir, outfile) = args_pan
    logger.info("pangenome step")
    panfile = pangenome.main("PanACoTA pangenome",
                             lstinfo,
                             name_pan,
                             dbpath,
                             min_id,
                             outdir_pan,
                             clust_mode,
                             spe_dir,
                             threads,
                             outfile,
                             verbose=verbose,
                             quiet=quiet)

    # Coregenome step
    outdir_corpers = os.path.join(outdir, "4-corepers_module")
    logger.info("corepers step")
    (tol, mixed, multi, floor) = args_corepers
    lstinfo_file = ""  # include all genomes in core
    corepers_file = corepers.main("PanACoTA corepers", panfile, tol, multi,
                                  mixed, outdir_corpers, lstinfo_file, floor,
                                  verbose, quiet)
    # Align step
    outdir_align = os.path.join(outdir, "5-align_module")
    force = False
    logger.info("align step")
    (prot_ali,) = args_align
    align_file = align.main("PanACoTA align",
                            corepers_file,
                            lstinfo,
                            name_pan,
                            outdir_annotate,
                            outdir_align,
                            prot_ali,
                            threads,
                            force,
                            verbose=verbose,
                            quiet=quiet)

    # Tree step
    (soft, model, boot, write_boot, memory, fast) = args_tree
    outdir_tree = os.path.join(outdir, "6-tree_module")
    logger.info("tree step")
    tree.main("PanACoTA tree",
              align_file,
              outdir_tree,
              soft,
              model,
              threads,
              boot,
              write_boot,
              memory,
              fast,
              verbose=verbose,
              quiet=quiet)
    logger.info("All modules of PanACoTA are finished.")
    return 0
Example #19
def main(cmd,
         list_file,
         db_path,
         res_dir,
         name,
         date,
         l90=100,
         nbcont=999,
         cutn=5,
         threads=1,
         force=False,
         qc_only=False,
         from_info=None,
         tmp_dir=None,
         res_annot_dir=None,
         verbose=0,
         quiet=False,
         prodigal_only=False,
         small=False):
    """
    Main method, doing all steps:

    1. analyze genomes (nb contigs, L90, rows of N...)
    2. keep only genomes with 'good' (according to user thresholds) L90 and nb_contigs
    3. rename genomes with strain number in decreasing quality
    4. annotate genome with prokka or only prodigal
    5. format annotated genomes

    If option '-Q' is given: ends at step 2.
    If option '--info <genome_info file name>' is given: starts at step 2.

    verbosity:

    - default 0: stdout contains INFO, stderr contains ERROR.
    - 1: stdout contains INFO, stderr contains WARNING and ERROR.
    - 2: stdout contains DETAIL and INFO, stderr contains WARNING and ERROR.
    - >=15: also add DEBUG to stdout.

    Parameters
    ----------
    cmd : str
        command line used to launch this program
    list_file : str
        file containing the list of genome files, one genome per line. If a genome
        is split across several fasta files, the filenames are separated by a space.
        This file can also specify date and/or species information, according to
        the format described in the documentation.
    db_path : str
        Path to the folder containing all the fasta files which will be annotated
    res_dir : str
        Path to the folder which will contain result folders and files
    name : str
        4 alphanumeric characters describing the species (for example ESCO). Used by default
        if no species name is given in list_file line.
    date : str
        4 alphanumeric characters defining the default date, used for strains where no date
        is specified in the list_file
    l90 : int
        Max L90 allowed to keep a genome
    nbcont : int
        Max number of contigs allowed to keep a genome
    cutn : int
        cut each time there are at least cutn 'N's in a row; don't cut at all if equal to 0
    threads : int
        max number of threads to use
    force : bool
        If True, overwrite previous results, if False keep what is already calculated
    qc_only : bool
        If True, do only quality control; if False, also do annotation
    from_info : str
        File containing genomes and their quality information (from the prepare step)
    tmp_dir : str or None
        Path to folder where tmp files must be saved. None to use the default tmp folder
    res_annot_dir : str or None
        Path to folder where are the prokka/prodigal result folders for the genomes. None
        to use the default prokka/prodigal folder
    verbose : int
        verbosity:
        default (0): INFO in stdout, ERROR and above in stderr
        1 = add warnings in stderr
        2 = like 1 + add DETAIL to stdout (by default only INFO)
        >=15: add DEBUG to stdout
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise
    prodigal_only : bool
        True -> run only prodigal. False -> run prokka
    small : bool
        True -> use -p meta option with prodigal

    Returns
    -------
    (outlst, nb_annotated) : tuple
        with:

        - outlst: path to the LSTINFO file listing all genomes kept for annotation
          (gembase name, original and to_annotate paths, size, nbcont, L90);
          empty string if no genome was kept or if only QC was run
        - nb_annotated: number of genomes successfully annotated and formatted
          (0 if no genome was kept or if only QC was run)
    """
    # import needed packages
    import shutil
    import logging
    from PanACoTA.annotate_module import genome_seq_functions as gfunc
    from PanACoTA.annotate_module import annotation_functions as pfunc
    from PanACoTA.annotate_module import general_format_functions as ffunc
    from PanACoTA import utils
    from PanACoTA import __version__ as version
    # Check that the needed annotation software is installed
    prokka = utils.check_installed("prokka")
    prodigal = utils.check_installed("prodigal")
    if prodigal_only:
        soft = "prodigal"
    else:
        soft = "prokka"

    changed = cutn != 0
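    # changed is True iff sequences will be cut (at stretches of cutn 'N'); cutn == 0 disables cutting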
    if not qc_only:  # pragma: no cover
        # If the user chose prokka: check that prokka is installed and in the PATH
        if not prodigal_only and not prokka:
            print(
                "Prokka is not installed. 'PanACoTA annotate' cannot run. Install prokka "
                "to be able to annotate genomes. If you only need syntactic annotation, "
                "check that prodigal is installed, and add the '--prodigal' option."
            )
            sys.exit(1)
        if prodigal_only and not prodigal:
            print(
                "Prodigal is not installed. 'PanACoTA annotate' cannot run. Install "
                "prodigal to be able to annotate genomes. If you also need functional "
                "annotation, check that prokka is installed, and remove '--prodigal' "
                "option.")
            sys.exit(1)

    # By default, all tmp files (split sequences, renamed sequences, prokka/prodigal results) will
    # be saved in the given <res_dir>/tmp_files.
    # Create output (results, tmp...) directories if not already existing
    if not tmp_dir:
        tmp_dir = os.path.join(res_dir, "tmp_files")
    if not res_annot_dir:
        res_annot_dir = tmp_dir
    os.makedirs(res_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)
    os.makedirs(res_annot_dir, exist_ok=True)

    # If force was set, remove result folders (Proteins, Replicons, Genes, LSTINFO, gff3)
    if force:
        shutil.rmtree(os.path.join(res_dir, "LSTINFO"), ignore_errors=True)
        shutil.rmtree(os.path.join(res_dir, "Proteins"), ignore_errors=True)
        shutil.rmtree(os.path.join(res_dir, "Genes"), ignore_errors=True)
        shutil.rmtree(os.path.join(res_dir, "Replicons"), ignore_errors=True)
        shutil.rmtree(os.path.join(res_dir, "gff3"), ignore_errors=True)
    # If not --force, check that result folders do not already contain results
    else:
        utils.check_out_dirs(res_dir)

    # get only the filename of list_file (or of from_info if no list_file), without extension
    if not list_file:
        list_file = from_info
    listfile_base = os.path.basename(os.path.splitext(list_file)[0])

    # Initialize logger
    # Set logger level: the minimum level that will be considered.
    if verbose >= 15:
        # write everything, including DEBUG
        level = logging.DEBUG
    elif verbose >= 2:
        # ignore only DEBUG
        level = utils.detail_lvl()  # int corresponding to the custom DETAIL level
    else:
        level = logging.INFO
    logfile_base = os.path.join(res_dir, "PanACoTA-annotate_" + listfile_base)
    logfile_base = utils.init_logger(logfile_base,
                                     level,
                                     name='annotate',
                                     log_details=True,
                                     verbose=verbose,
                                     quiet=quiet)
    logger = logging.getLogger('annotate')
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)

    # STEP 1. analyze genomes (nb contigs, L90, rows of N...)
    # If already info on genome ('--info <file>' option), skip this step
    # If no info on genomes, read them and get needed information
    if not from_info:
        # Read genome names.
        # genomes = {genome: [spegenus.date]}
        genomes = utils.read_genomes(list_file, name, date, db_path, tmp_dir,
                                     logger)
        if not genomes:
            logger.error(f"We did not find any genome listed in {list_file} "
                         f"in the folder {db_path}. Please check your list to "
                         "give valid genome names.")
            sys.exit(1)
        # Get L90, nbcontig, size for all genomes, and cut at row of cutn 'N' if asked
        # -> genome: [spegenus.date, orig_path, to_annotate_path, size, nbcont, l90]
        gfunc.analyse_all_genomes(genomes,
                                  db_path,
                                  tmp_dir,
                                  cutn,
                                  soft,
                                  logger,
                                  quiet=quiet)
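        # (analyse_all_genomes updates the 'genomes' dict in place with the new fields)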
    # --info <filename> option given: read information (L90, nb contigs...) from this file.
    else:
        # genomes = {genome: [spegenus.date, orig_path, to_annotate_path, size, nbcont, l90]}
        # orig_path is the path to the original sequence
        # and to_annotate_path the path to the sequence to annotate (once split etc.)
        # Here, both are the same, as we take given sequences as is.
        genomes = utils.read_genomes_info(from_info, name, date, logger)

    # STEP 2. keep only genomes with 'good' (according to user thresholds) L90 and nb_contigs
    # genomes = {genome: [spegenus.date, orig_seq, path_to_splitSequence, size, nbcont, l90]}
    # Plot L90 and nb_contigs distributions
    gfunc.plot_distributions(genomes, res_dir, listfile_base, l90, nbcont)
    # Get list of genomes kept (according to L90 and nbcont thresholds)
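    # info[-2] is the number of contigs and info[-1] the L90 (see the dict layout above)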
    kept_genomes = {
        genome: info
        for genome, info in genomes.items()
        if info[-2] <= nbcont and info[-1] <= l90
    }
    # Write discarded genomes to a file -> orig_name, to_annotate, gsize, nb_conts, L90
    utils.write_genomes_info(genomes, list(kept_genomes.keys()), list_file,
                             res_dir)

    if not kept_genomes:
        logger.info("No genome kept for annotation.")
        return "", 0
    # Info on folder containing original sequences
    if not from_info:
        logger.info(
            f"-> Original sequences folder ('orig_name' column): {db_path} ")
        logger.info(
            f"\t-> If original sequence not found in {db_path}, "
            f"look for it in {tmp_dir}, as it must be a concatenation of several "
            "input sequence files.")
        if cutn == 0:
            logger.info(
                "-> Sequences used for annotation ('to_annotate' column) are the "
                "same as the previous ones (original sequences).")
        else:
            logger.info(
                f"-> Folder with sequence files that will be used for annotation "
                f"('to_annotate' column): {tmp_dir}")
    # If only QC, stop here.
    if qc_only:
        # Write information on genomes that would be annotated with the current
        # parameters if not QC_only:
        # orig_name, to_annotate, gsize, nb_conts, L90
        utils.write_genomes_info(genomes, [], list_file, res_dir, qc=True)
        logger.info("QC only done.")
        return "", 0

    # STEP 3. Rename genomes kept, ordered by decreasing quality
    first_gname = gfunc.rename_all_genomes(kept_genomes)
    # kept_genomes = {genome: [gembase_name, path_to_origfile, path_split_gembase,
    #                 gsize, nbcont, L90]}
    # first_gname = name of the first genome
    # Write lstinfo file (list of genomes kept with info on L90 etc.)
    outlst = utils.write_lstinfo(list_file, kept_genomes, res_dir)

    # STEP 4. Annotate all kept genomes
    results = pfunc.run_annotation_all(kept_genomes,
                                       threads,
                                       force,
                                       res_annot_dir,
                                       first_gname,
                                       prodigal_only,
                                       small=small,
                                       quiet=quiet)
    # Information on genomes to format
    # results_ok = {genome: [gembase_name, path_to_origfile, path_split_gembase,
    #               gsize, nbcont, L90]}
    results_ok = {
        genome: info
        for genome, info in kept_genomes.items() if results[genome]
    }
    # If no genome was correctly annotated, there is nothing to format:
    # log the error and end the program.
    if not results_ok:
        logger.error("No genome was correctly annotated: nothing to format.")
        sys.exit(1)
    # list of genomes skipped because annotation had problems: no format step run
    skipped = [genome for (genome, ok) in results.items() if not ok]
    # At least 1 genome was not annotated: warn the user about it
    if skipped:
        utils.write_warning_skipped(skipped,
                                    prodigal_only=prodigal_only,
                                    logfile=logfile_base)

    # STEP 5. Format genomes annotated
    # Here, we have at least 1 genome annotated (otherwise the program would
    # already have stopped because results_ok was empty).
    # Generate database (folders Proteins, Genes, Replicons, LSTINFO, gff3):
    # format_genomes returns the list of genomes skipped because formatting failed.
    skipped_format = ffunc.format_genomes(results_ok,
                                          res_dir,
                                          res_annot_dir,
                                          prodigal_only,
                                          threads,
                                          quiet=quiet)
    # At least one genome could not be formatted -> warn user
    if skipped_format:
        utils.write_warning_skipped(skipped_format,
                                    do_format=True,
                                    prodigal_only=prodigal_only,
                                    logfile=logfile_base)
    logger.info("Annotation step done.")
    return outlst, len(kept_genomes) - len(skipped) - len(skipped_format)
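
The verbose-to-level mapping used in main() above can be factored into a small standalone helper. A minimal sketch for illustration only; it assumes the custom DETAIL level returned by utils.detail_lvl() is an integer strictly between logging.DEBUG (10) and logging.INFO (20):

import logging

DETAIL_LVL = 15  # assumption: custom DETAIL level, between DEBUG (10) and INFO (20)

def verbosity_to_level(verbose):
    """Map the verbosity counter to the minimum logging level to record."""
    if verbose >= 15:
        return logging.DEBUG  # record everything, including DEBUG
    if verbose >= 2:
        return DETAIL_LVL     # record DETAIL and above (ignore only DEBUG)
    return logging.INFO       # default: INFO and above

# quick check of the three regimes
assert verbosity_to_level(0) == logging.INFO
assert verbosity_to_level(2) == DETAIL_LVL
assert verbosity_to_level(15) == logging.DEBUG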