def test_log_listen(capsys):
    """
    Log through a QueueHandler, then let ``utils.logger_thread`` consume the queue,
    and check that the records show up on stdout, stderr and in the log files.
    """
    import multiprocessing
    import threading

    # Send every record emitted in this process to a manager-backed queue
    manager = multiprocessing.Manager()
    log_queue = manager.Queue()
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    root_logger.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root_logger.addHandler(logging.handlers.QueueHandler(log_queue))

    # One record per level, followed by a None entry
    # (presumably the stop marker expected by utils.logger_thread)
    process_logger = logging.getLogger('process')
    process_logger.debug("debug message")
    process_logger.log(utils.detail_lvl(), "detail message")
    process_logger.info("info message")
    process_logger.warning("warning message")
    process_logger.error("error message")
    process_logger.critical("critical message")
    log_queue.put(None)

    # Install the real handlers (stdout/stderr/files)
    logfile = os.path.join(GENEPATH, "logfile_test.txt")
    utils.init_logger(logfile, 0, '')

    # Consume the queue in a dedicated thread and wait until it is done
    listener = threading.Thread(target=utils.logger_thread, args=(log_queue, ))
    listener.start()
    listener.join()

    out, err = capsys.readouterr()
    assert "info message" in out
    assert "error message" in err
    assert "critical message" in err

    # Expected end of each line, in order, for every log file
    detail = " :: DETAIL :: detail message\n"
    info = " :: INFO :: info message\n"
    warning = " :: WARNING :: warning message\n"
    error = " :: ERROR :: error message\n"
    critical = " :: CRITICAL :: critical message\n"
    expected_tails = (
        (".log", (info, warning, error, critical)),
        (".log.details", (detail, info, warning, error, critical)),
        (".log.err", (warning, error, critical)),
    )
    for extension, tails in expected_tails:
        with open(logfile + extension, "r") as logf:
            for tail in tails:
                assert logf.readline().endswith(tail)
def prodigal_train(gpath, annot_folder):
    """
    Use prodigal training mode.
    Train prodigal on the genome 'gpath', and write the result to
    '<annot_folder>/<genome file name>.trn'. If that training file already exists,
    it is reused and prodigal is not run.

    Parameters
    ----------
    gpath : str
        path to genome to train on
    annot_folder : str
        path to folder where the log files and train file will be saved

    Returns
    -------
    str
        path and name of train file (will be used to annotate all next genomes).
        Empty string if the training command did not end with return code 0.
    """
    logger.info(f"Prodigal will train using {gpath}")
    gname = os.path.basename(gpath)  # path/to/original/genome.fasta -> genome.fasta
    gpath_train = os.path.join(annot_folder, gname + ".trn")  # path/to/prodiRes/genome.fasta.trn
    if os.path.isfile(gpath_train):
        logger.info(f"A training file already exists ({gpath_train}). "
                    "It will be used to annotate all genomes.")
        return gpath_train

    # Log files are written next to the training file
    prodigal_logfile = gpath_train + "-prodigal-train.log"
    prodigal_logfile_err = gpath_train + "-prodigal-train.log.err"
    cmd = f"prodigal -i {gpath} -t {gpath_train}"
    error = f"Error while trying to train prodigal on {gname}. See {prodigal_logfile_err}."
    logger.log(utils.detail_lvl(), "prodigal command: " + cmd)

    # Context managers guarantee both log files are closed even if run_cmd raises
    with open(prodigal_logfile, "w") as prodigalf, \
            open(prodigal_logfile_err, "w") as prodigalferr:
        ret = utils.run_cmd(cmd, error, eof=False, stderr=prodigalferr, stdout=prodigalf,
                            logger=logger)

    # Other callers of utils.run_cmd in this file guard against it returning a bare int
    # instead of a process object; do the same here rather than fail on '.returncode'.
    returncode = ret if isinstance(ret, int) else ret.returncode
    if returncode != 0:
        return ""
    logger.log(utils.detail_lvl(), f"End training on {gpath}")
    return gpath_train
def check_extractions(num_fam, miss_file, prt_file, gen_file, ngenomes, logger):
    """
    Check that extractions went well for the given family, by comparing the number
    of proteins and genes extracted (plus the missing genomes) to the number of genomes.

    Parameters
    ----------
    num_fam : int
        current family number
    miss_file : str
        path to file containing the list of genomes missing for the current family
    prt_file : str
        path to file containing all proteins extracted
    gen_file : str
        path to file containing all genes extracted
    ngenomes : int
        total number of genomes in dataset
    logger : logging.Logger
        logger with queueHandler to give logs to main logger

    Returns
    -------
    bool or int
        False if nbmiss + prt != ngenomes or nbmiss + gen != ngenomes.
        Otherwise, the number of proteins extracted.
        The program exits with status 1 if the genes or proteins file does not exist.
    """
    logger.log(utils.detail_lvl(), f"Checking extractions for family {num_fam}")

    nb_missing = utils.count(miss_file)

    # Both files should have been created at the previous step: their absence is fatal.
    # Genes are checked first, then proteins.
    for path, content in ((gen_file, "genes"), (prt_file, "proteins")):
        if os.path.isfile(path):
            continue
        logger.error(f"fam {num_fam}: no file with {content} extracted "
                     f"('{path}'). Cannot align.")
        sys.exit(1)

    # Count fasta headers in each file
    nb_prt = utils.grep(prt_file, "^>", counts=True)
    nb_gen = utils.grep(gen_file, "^>", counts=True)

    # Each genome must be either missing or extracted, for proteins and for genes
    for label, nb_extracted in (("prt", nb_prt), ("gen", nb_gen)):
        if nb_missing + nb_extracted != ngenomes:
            logger.error(f"fam {num_fam}: wrong sum of missing genomes ({nb_missing}) "
                         f"and {label} extracted ({nb_extracted}) for {ngenomes} "
                         "genomes in the dataset.")
            return False
    return nb_prt
def handle_genome(args):
    """
    Format one annotated genome, using the formatter matching the annotation
    software (prodigal or prokka), and report whether formatting went well.

    Parameters
    ----------
    args : tuple
        (genome, name, gpath, annot_path, lst_dir, prot_dir,
        gene_dir, rep_dir, gff_dir, prodigal_only, q)
        with:

        * genome : original genome name
        * name : gembase name of the genome
        * gpath : path to the genome sequence which was given to prokka/prodigal
          for annotation
        * annot_path : directory where prokka/prodigal folders are saved
        * lst_dir : path to 'LSTINFO' folder
        * prot_dir : path to 'Proteins' folder
        * gene_dir : path to 'Genes' folder
        * rep_dir : path to 'Replicons' folder
        * gff_dir : path to 'gff3' folder
        * prodigal_only : True if annotated by prodigal, False if annotated by prokka
        * q : multiprocessing.managers.AutoProxy[Queue] queue to put logs during subprocess

    Returns
    -------
    (bool, str) :

        * value returned by the formatter (True if genome was formatted as expected)
        * genome name (used to get info from the pool.map_async)
    """
    (genome, name, gpath, annot_path, lst_dir, prot_dir,
     gene_dir, rep_dir, gff_dir, prodigal_only, q) = args

    # Pick the formatting function according to the annotation software
    format_one_genome = (fprodigal.format_one_genome if prodigal_only
                         else fprokka.format_one_genome)

    # In this process, the root logger only forwards records to the queue
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    queue_handler = logging.handlers.QueueHandler(q)
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    root_logger.handlers = []
    root_logger.addHandler(queue_handler)
    logger = logging.getLogger('format.handle_genome')

    # Format the genome
    ok_format = format_one_genome(gpath, name, annot_path, lst_dir, prot_dir,
                                  gene_dir, rep_dir, gff_dir)
    return ok_format, genome
def my_logger(name):
    """
    Build the kind of logger given to functions called by a subprocess: the root
    logger is reset so that its only handler pushes records to a new manager queue.

    Parameters
    ----------
    name : str
        name of the logger to return

    Returns
    -------
    (queue, logging.Logger)
        the queue receiving the records, and the logger called 'name'
    """
    manager = multiprocessing.Manager()
    log_queue = manager.Queue()
    queue_handler = logging.handlers.QueueHandler(log_queue)

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    root_logger.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root_logger.addHandler(queue_handler)

    named_logger = logging.getLogger(name)
    return log_queue, named_logger
def mafft_align(num_fam, prt_file, mafft_file, nbfprt, logger):
    """
    Align all proteins of the given family with mafft.

    The alignment (mafft stdout) is written to 'mafft_file', and mafft stderr
    to 'mafft_file' + ".log".

    Parameters
    ----------
    num_fam : int
        current family number
    prt_file : str
        path to file containing all proteins extracted
    mafft_file : str
        path to file which will contain proteins alignment
    nbfprt : int
        number of proteins extracted in prt file
    logger : logging.Logger
        logger with queueHandler to give logs to main logger

    Returns
    -------
    bool
        False if mafft ended with a non-zero return code (in that case, 'mafft_file'
        is removed). Otherwise, the result of ``check_nb_seqs`` comparing the number
        of sequences in 'mafft_file' to 'nbfprt'.
    """
    logger.log(utils.detail_lvl(), f"Aligning family {num_fam}")
    cmd = f"mafft --auto {prt_file}"
    error = f"Problem while trying to align fam {num_fam}"

    # Both output files are closed when leaving the block, even if run_cmd raises
    # (the stderr log file used to stay open).
    with open(mafft_file, "w") as stdout, open(mafft_file + ".log", "w") as stderr:
        logger.log(utils.detail_lvl(), f"Mafft command: {cmd}")
        ret = utils.run_cmd(cmd, error, stdout=stdout, stderr=stderr, logger=logger)

    # run_cmd gives back either a return code, or an object holding it
    if not isinstance(ret, int):
        ret = ret.returncode
    if ret != 0:
        # Do not keep a partial/invalid alignment file
        os.remove(mafft_file)
        return False
    message = (f"fam {num_fam}: different number of proteins extracted in {prt_file} ({nbfprt}) and proteins "
               f"aligned in {mafft_file}")
    return check_nb_seqs(mafft_file, nbfprt, logger, message)
def test_log_no_listen(capsys):
    """
    Log through a QueueHandler but never consume the queue: nothing must reach
    stdout, stderr or the log files, and all records must still be in the queue.
    """
    import multiprocessing

    # Send every record emitted in this process to a manager-backed queue
    manager = multiprocessing.Manager()
    log_queue = manager.Queue()
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    root_logger.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root_logger.addHandler(logging.handlers.QueueHandler(log_queue))

    # Six records (one per level), plus a None entry
    process_logger = logging.getLogger('process')
    process_logger.debug("debug message")
    process_logger.log(utils.detail_lvl(), "detail message")
    process_logger.info("info message")
    process_logger.warning("warning message")
    process_logger.error("error message")
    process_logger.critical("critical message")
    log_queue.put(None)

    # Install the real handlers, but do not start any listener
    logfile = os.path.join(GENEPATH, "test_log_listen")
    utils.init_logger(logfile, 0, '')

    # Everything is still waiting in the queue
    assert log_queue.qsize() == 7
    out, err = capsys.readouterr()
    assert out == ""
    assert err == ""

    # Log files exist, but are all empty
    for extension in (".log", ".log.details", ".log.err"):
        with open(logfile + extension, "r") as logf:
            assert logf.readlines() == []
def read_alignments(all_alns, all_genomes):
    """
    Read the alignment file, and attach each sequence to the genome found
    (by ``get_genome``) in its header.

    Parameters
    ----------
    all_alns : str
        path to file containing all alignments concatenated
    all_genomes : []
        list of all genomes

    Returns
    -------
    dict or None
        - {genome_name: [list of sequences for this genome]}
        - None if no genome is found for a header, or if all genomes do not have
          the same number of sequences
    """
    by_genome = {}  # name: [ordered list of sequences]
    current_genome = None
    current_seq = ""

    with open(all_alns, 'r') as alnf:
        for line in alnf:
            # Sequence line: extend the sequence being read
            if not line.startswith(">"):
                current_seq += line.strip()
                continue
            # Header line: first store the sequence read so far (if any)...
            if current_genome and current_seq:
                by_genome[current_genome].append(current_seq)
                current_seq = ""
            # ... then find to which genome the new header belongs
            current_genome = get_genome(line, all_genomes)
            if not current_genome:
                return None
            by_genome.setdefault(current_genome, [])

    # Store the last sequence of the file
    if current_genome and current_seq:
        by_genome[current_genome].append(current_seq)

    # All genomes must have the same number of sequences
    nb_seqs = {len(seqs) for seqs in by_genome.values()}
    if len(nb_seqs) != 1:
        logger.error(
            "Problems occurred while grouping alignments by genome: all genomes "
            "do not have the same number of sequences. Check that each protein "
            "name contains the name of the genome from which it comes.")
        return None
    (per_genome,) = nb_seqs
    logger.log(utils.detail_lvl(), f"{per_genome} sequences found per genome")
    return by_genome
def write_groups(outfile, sequences):
    """
    Write alignments grouped by genome to the output file: for each genome, a
    '>genome' header line, then all its sequences concatenated on a single line.

    Parameters
    ----------
    outfile : str
        path to file that will contain alignments grouped by genome
    sequences : dict
        {genome_name: [list of sequences (DNA, prot...) for this genome]}
    """
    logger.log(utils.detail_lvl(), "Writing alignments per genome")
    with open(outfile, "w") as outf:
        # Genomes are written in the order defined by utils.sort_genomes_by_name
        for genome in sorted(sequences, key=utils.sort_genomes_by_name):
            header = ">" + genome + "\n"
            concatenated = "".join(sequences[genome]) + "\n"
            outf.writelines((header, concatenated))
def back_translate(num_fam, mafft_file, gen_file, btr_file, nbfal, logger):
    """
    Backtranslate protein alignment to nucleotides, by running the 'prt2codon.awk'
    script (located next to this module) on the mafft alignment and the gene file.

    Parameters
    ----------
    num_fam : int
        current family number. Used for log messages
    mafft_file : str
        path to file containing protein alignments by mafft
    gen_file : str
        path to file containing all sequences, not aligned, in nucleotides. It is used to
        convert the alignment in proteins into a nucleotide alignment
    btr_file : str
        path to the file that will contain the nucleotide alignment
    nbfal : int
        number of sequences aligned for the family by mafft
    logger : logging.Logger
        logger with queueHandler to give logs to main logger

    Returns
    -------
    bool or int
        - False if the back-translation command ended with a non-zero return code
          (in that case, 'btr_file' is removed)
        - otherwise, the result of ``check_nb_seqs`` comparing the number of sequences
          in 'btr_file' to 'nbfal'
    """
    logger.log(utils.detail_lvl(), f"Back-translating family {num_fam}")
    curpath = os.path.dirname(os.path.abspath(__file__))
    awk_script = os.path.join(curpath, "prt2codon.awk")
    cmd = f"awk -f {awk_script} {mafft_file} {gen_file}"
    error = f"Problem while trying to backtranslate {mafft_file} to a nucleotide alignment"

    # Output file is closed when leaving the block, even if run_cmd raises
    with open(btr_file, "w") as stdout:
        ret = utils.run_cmd(cmd, error, stdout=stdout, logger=logger)

    # run_cmd gives back either a return code, or an object holding it
    if not isinstance(ret, int):
        ret = ret.returncode
    if ret != 0:
        # Do not keep a partial/invalid back-translated file
        os.remove(btr_file)
        return False
    message = (f"fam {num_fam}: different number of proteins aligned in {mafft_file} ({nbfal}) and genes "
               f"back-translated in {btr_file}")
    # Check number of sequences in btr file, and return according to it.
    # It should contain the same number of sequences as the mafft file, so the file to
    # count is btr_file: counting mafft_file again would not validate the new output.
    return check_nb_seqs(btr_file, nbfal, logger, message)
def handle_family(args):
    """
    Entry point used when a family is handled in another process: redirect the
    logs of this process to the given queue, then delegate all the work
    (alignment, back-translation, missing genomes) to ``handle_family_1thread``.

    Parameters
    ----------
    args : ()
        (prefix, num_fam, ngenomes, q) with:

        - prefix: path to ``aldir/<name of dataset>``
        - num_fam: the current family number
        - ngenomes: the total number of genomes in dataset
        - q: a queue, which will be used by logger to put logs while in other process

    Returns
    -------
    bool or str
        The value returned by ``handle_family_1thread``. According to the original
        documentation of this function:

        - "OK" if the files were not re-created, and have the expected format
        - False if any problem (extractions, alignment, btr, add missing genomes...)
        - True if just generated all files, and everything is ok
    """
    prefix, num_fam, ngenomes, q = args

    # In this process, the root logger only forwards records to the queue
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    queue_handler = logging.handlers.QueueHandler(q)
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    root_logger.handlers = []
    root_logger.addHandler(queue_handler)
    logger = logging.getLogger('align.align_family')

    family_args = (prefix, num_fam, ngenomes)
    return handle_family_1thread(family_args)
def main(cmd, ncbi_species_name, ncbi_species_taxid, ncbi_taxid, ncbi_strains, levels, ncbi_section,
         outdir, tmp_dir, threads, norefseq, db_dir,
         only_mash, info_file, l90, nbcont, cutn, min_dist, max_dist, verbose, quiet):
    """
    Main method, constructing the draft dataset for the given species.

    Depending on 'only_mash' and 'norefseq', it runs:

    - genome download + quality control + mash filter (default),
    - quality control + mash filter on already available sequences ('norefseq'),
    - mash filter only, reading genome information from 'info_file' ('only_mash').

    verbosity:
    - default 0 : stdout contains INFO, stderr contains ERROR, .log contains INFO and more,
      .log.err contains warning and more
    - 1: same as 0 + WARNING in stderr
    - 2: same as 1 + DETAILS in stdout + DETAILS in .log.details
    - >=15: same as 2 + Add DEBUG in stdout + create .log.debug with everything
      from info to debug

    Parameters
    ----------
    cmd : str
        command line used to launch this program
    ncbi_species_name : str
        name of species to download, as given by NCBI
    ncbi_species_taxid : int
        species taxid given in NCBI
    ncbi_taxid : int
        NCBI taxid (sub-species)
    ncbi_strains : str
        specific strains to download. If it is the path to an existing file, the file
        name (without extension) is used to name the outputs.
    levels : str
        Level of assembly to download. Choice between 'all', 'complete',
        'chromosome', 'scaffold', 'contig'. Default is 'all'
    ncbi_section : str
        name of the download section; used as a sub-folder of 'outdir'
        ('outdir/<ncbi_section>/bacteria') and in log messages
    outdir : str
        path to output directory (where created database will be saved).
        Defaults to the species name/taxid derived from the arguments above.
    tmp_dir : str
        Path to directory where tmp files are saved (sequences split at each row of 5 'N').
        Defaults to 'outdir/tmp_files'.
    threads : int
        max number of threads to use
    norefseq : bool
        True if user does not want to download again the database
    db_dir : str
        Name of the folder where already downloaded fasta files are saved.
    only_mash : bool
        True if user already has the database and quality of each genome
        (L90, #contigs etc.)
    info_file : str
        File containing information on QC if it was already ran before
        (columns to_annotate, gsize, nb_conts and L90).
    l90 : int
        Max L90 allowed to keep a genome
    nbcont : int
        Max number of contigs allowed to keep a genome
    cutn : int
        cut at each time there are 'cutn' N in a row. Don't cut if equal to 0
    min_dist : int
        lower limit of distance between 2 genomes to keep them
    max_dist : int
        upper limit of distance between 2 genomes to keep them (default is 0.06)
    verbose : int
        verbosity level, see description above
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise

    Returns
    -------
    str
        value returned by ``fg.write_outputfiles`` (stored as the new 'info_file').
        The program exits with status 1 if the sequences folder or the info file
        it needs does not exist.
    """

    # get species name in NCBI format
    # -> will be used to name output directory
    # -> will be used to download summary file if given species corresponds to NCBI name
    if ncbi_species_name:
        # Replace whitespace and '/' by '_'
        species_linked = "_".join(ncbi_species_name.split())
        species_linked = "_".join(species_linked.split("/"))

    # if species name not given by user, use species taxID (if given) to name output directory
    elif ncbi_species_taxid:
        species_linked = str(ncbi_species_taxid)
    # if neither species name nor species taxid given by user, use taxID (if given)
    # to name output directory
    elif ncbi_taxid:
        species_linked = str(ncbi_taxid)
    # If no species nor taxID, get specific strain names
    elif ncbi_strains:
        # Strains given in a file: use the file name, without its extension
        if os.path.isfile(ncbi_strains):
            species_linked = os.path.basename(ncbi_strains)
            species_linked = os.path.splitext(species_linked)[0]
        # Strains given as a string: whitespace -> '_', '/' -> '-', ',' -> '_and_'
        else:
            species_linked = "_".join(ncbi_strains.split())
            species_linked = "-".join(species_linked.split("/"))
            species_linked = "_and_".join(species_linked.split(","))
    # if neither speName, speID, taxID nor strainName given (--norefseq, mashonly), name is NA
    else:
        species_linked = "NA"

    # Default outdir is species name if given, or species taxID
    if not outdir:
        outdir = species_linked
    # Default tmp_dir is outdir/tmp_files
    if not tmp_dir:
        tmp_dir = os.path.join(outdir, "tmp_files")
    # directory that will be created by ncbi_genome_download
    ncbidir = os.path.join(outdir, ncbi_section, "bacteria")
    os.makedirs(outdir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    # Initialize logger
    # set level of logger: level is the minimum level that will be considered.
    if verbose <= 1:
        level = logging.INFO
    # for verbose = 2, ignore only debug
    if verbose >= 2 and verbose < 15:
        level = utils.detail_lvl()  # int corresponding to detail level
    # for verbose >= 15, write everything
    if verbose >= 15:
        level = logging.DEBUG
    # NOTE(review): '.format' is applied to the whole joined path, so any '{}' in
    # 'outdir' would also be interpreted as a placeholder.
    logfile_base = os.path.join(outdir, "PanACoTA_prepare_{}").format(species_linked)
    logfile_base, logger = utils.init_logger(logfile_base, level, 'prepare', log_details=True,
                                             verbose=verbose, quiet=quiet)

    # Message on what will be done (cmd, cores used)
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)
    message = f"'PanACoTA prepare' will run on {threads} "
    message += f"cores" if threads > 1 else "core"
    logger.info(message)

    # Start prepare step
    # Run more than only mash filter (!only_mash):
    # - start from QC and mash (norefseq)
    # - start from genome download (!norefseq))
    if not only_mash:
        # Not only mash, so a new info file will be created. If the user still gave an info
        # file (he will be warned that it will be ignored), rename it with '.back'
        # to avoid erasing it
        if info_file and os.path.isfile(info_file):
            os.rename(info_file, info_file + ".back")

        # 'norefseq = True" : Do not download genomes, just do QC and mash filter on given genomes
        # -> if not, error and exit
        if norefseq:
            logger.warning(f'You asked to skip {ncbi_section} downloads.')

            # -> if db_dir given, watch for sequences there. If does not exist, error and exit
            # (user gave a directory (even if it does not exist), so we won't look for
            # the sequences in other folders)
            if db_dir:
                if not os.path.exists(db_dir):
                    logger.error(
                        f"Database folder {db_dir} supposed to contain fasta "
                        "sequences does not "
                        "exist. Please give a valid folder, or leave the default "
                        "directory (no '-d' option).")
                    sys.exit(1)
            # -> If user did not give db_dir, genomes could be in
            # outdir/Database_init/<genome_name>.fna
            else:
                db_dir = os.path.join(outdir, "Database_init")
                # If it does not exist, check if default compressed files folder exists.
                if not os.path.exists(db_dir):
                    logger.warning(
                        f"Database folder {db_dir} supposed to contain fasta "
                        "sequences does not "
                        "exist. We will check if the download folder (with compressed "
                        "sequences) exists.")
                    # -> if not in Database_init, genomes must be in
                    # outdir/<ncbi_section>/bacteria/<genome_name>.fna.gz. In that case,
                    # uncompress and add them to Database_init
                    if not os.path.exists(ncbidir):
                        logger.error(
                            f"Folder {ncbidir} does not exist. You do not have any "
                            "genome to analyse. Possible reasons:\n"
                            "- if you want to rerun analysis in the same folder as "
                            "sequences were downloaded (my_outdir/Database_init or "
                            f"my_outdir/{ncbi_section}), make sure you have '-o my_outdir' "
                            "option\n"
                            "- if you want to rerun analysis and save them in a new "
                            "output folder called 'new_outdir', make sure you have "
                            "'-o new_outdir' option, "
                            "and you specified where the uncompressed sequences to "
                            "use are ('-d sequence_database_path'). ")
                        sys.exit(1)
                    # add genomes from the download folder to Database_init
                    nb_gen, _ = dgf.to_database(outdir, ncbi_section)
        # No sequence: Do all steps -> download, QC, mash filter
        else:
            # Download all genomes of the given taxID
            db_dir, nb_gen = dgf.download_from_ncbi(species_linked, ncbi_section, ncbi_species_name,
                                                    ncbi_species_taxid, ncbi_taxid, ncbi_strains,
                                                    levels, outdir, threads)
            logger.info(f"{nb_gen} {ncbi_section} genome(s) downloaded")

        # Now that genomes are downloaded and uncompressed, check their quality to remove bad ones
        genomes = fg.check_quality(species_linked, db_dir, tmp_dir, l90, nbcont, cutn)

    # Do only mash filter. Genomes must be already downloaded, and there must be a file with
    # all information on these genomes (L90 etc.)
    else:
        logger.warning('You asked to run only mash steps.')
        # NOTE(review): os.path.exists raises TypeError if info_file is None;
        # presumably the caller guarantees a value when only_mash is set -- confirm.
        if not os.path.exists(
                info_file):  # info-file missing -> error and exit
            logger.error(
                f"Your info file {info_file} does not exist. Please provide the "
                "right name/path, or remove the '--mash-only option to rerun "
                "quality control.")
            sys.exit(1)
        logger.info(("You want to run only mash steps. Getting information "
                     "from {}").format(info_file))
        genomes = utils.read_genomes_info(
            info_file,
            species_linked,
        )

    # Run Mash
    # genomes : {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}
    # sorted_genome : [genome_file] ordered by L90/nbcont (keys of genomes)
    sorted_genomes = fg.sort_genomes_minhash(genomes, l90, nbcont)

    # Write discarded genomes to a file -> orig_name, to_annotate, gsize, nb_conts, L90
    discQC = f"by-L90_nbcont-{species_linked}.txt"
    utils.write_genomes_info(genomes, sorted_genomes, discQC, outdir)

    # Remove genomes not corresponding to mash filters
    removed = fg.iterative_mash(sorted_genomes, genomes, outdir, species_linked,
                                min_dist, max_dist, threads, quiet)
    # Write list of genomes kept, and list of genomes discarded by mash step
    info_file = fg.write_outputfiles(genomes, sorted_genomes, removed, outdir, species_linked,
                                     min_dist, max_dist)
    logger.info("End")
    return info_file
def _next_lst_gene(lst):
    """
    Read the next line of the open lst file.

    Returns (stripped line, gene number found in the 5th tab-separated column),
    or ("", None) when the line is empty (end of file or blank line).
    """
    lstline = lst.readline().strip()
    if not lstline:
        return "", None
    return lstline, int(lstline.split("\t")[4].split("_")[-1])


def create_gen(ffnseq, lstfile, genseq):
    """
    Generate .gen file, from sequences contained in .ffn, but changing the
    headers using the information in .lst

    Headers of the ffn file whose last '_'-separated field (of the first word) is not a
    number are skipped together with their sequence (they may be CRISPRs).

    Parameters
    ----------
    ffnseq : str
        .ffn file generated by prokka
    lstfile : str
        lstfile converted from prokka tbl file
    genseq : str
        output file, to write in Genes directory

    Returns
    -------
    bool :
        True if conversion went well, False if a gene of the ffn file is not found
        in the lst file (including when the lst file ends before the ffn file).
    """
    write = True  # Write the sequence lines following the current header
    with open(ffnseq) as ffn, open(lstfile) as lst, open(genseq, "w") as gen:
        for line_ffn in ffn:
            # Sequence line: write it as is if its gene is kept, and go to next line
            if not line_ffn.startswith(">"):
                if write:
                    gen.write(line_ffn)
                continue

            # Header line. Try to get gene ID. If it does not work, ignore this gene
            # (it may be a CRISPR, and we ignore them)
            test_gen_id = line_ffn.split()[0].split("_")[-1]
            if not test_gen_id.isdigit():
                # Maybe a CRISPR? Or wrong gene name? -> ignore
                logger.log(
                    utils.detail_lvl(),
                    f"Unknown header format for {line_ffn.strip()}. "
                    "This gene will be ignored in .gen output file.")
                write = False
                continue
            write = True
            gen_id = int(test_gen_id)

            # In lst, find the same gene ID as in ffn (some gene IDs in lst can be absent
            # from ffn, if prokka does not give their sequence).
            # As they are ordered by increasing number, go to next lst line until the
            # corresponding gene ID is reached. An empty line (end of lst file) stops the
            # search: it used to raise an IndexError when met on the first read.
            lstline, gen_id_lst = _next_lst_gene(lst)
            while gen_id_lst is not None and gen_id > gen_id_lst:
                lstline, gen_id_lst = _next_lst_gene(lst)

            # Gene ID of ffn not found in lst (lst ended, or ffn ID < lst ID):
            # write error message and stop
            if gen_id_lst is None or gen_id != gen_id_lst:
                logger.error(
                    f"Missing info for gene {line_ffn.strip()} "
                    f"(from {ffnseq}) in {lstfile}. If it is actually present "
                    "in the lst file, check that genes are ordered by increasing number in both lst and ffn files."
                )
                return False
            # Same gene ID found: write its header in gene file
            general.write_header(lstline, gen)
    return True
def run_prodigal(arguments):
    """
    Run prodigal for the given genome.

    Parameters
    ----------
    arguments : tuple
        (gpath, prodigal_folder, threads, name, force, nbcont, gpath_train, q) with:

        * gpath: path and filename of genome to annotate
        * prodigal_folder: path to folder where all prodigal folders for all genomes are saved
        * threads: number of cores available (unpacked but not used by this function)
        * name: output name of annotated genome
        * force: True if force run (override existing files), False otherwise
        * nbcont: number of contigs in the input genome (unpacked but not used by
          this function)
        * gpath_train: path to the prodigal training file, or the string "small option",
          in which case prodigal is run with '-p meta' instead of a training file
        * q : queue where logs are put

    Returns
    -------
    boolean
        True if everything went well: either prodigal ended with return code 0, or
        existing results were found and validated by ``check_prodigal``.
        False otherwise.
    """
    gpath, prodigal_folder, threads, name, force, nbcont, gpath_train, q = arguments

    # Set logger for this process, which will be given to all subprocess
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    logger = logging.getLogger('annotate.run_prodigal')

    # Define prodigal directory and logfiles: <prodigal_folder>/<genome file name>-...
    g_ori_name = os.path.basename(gpath)
    prodigal_dir = os.path.join(prodigal_folder, g_ori_name + "-prodigalRes")
    prodigal_logfile = os.path.join(prodigal_folder, g_ori_name + "-prodigal.log")
    prodigal_logfile_err = os.path.join(prodigal_folder, g_ori_name + "-prodigal.log.err")

    # If result dir exists but user wants to force, remove this result dir
    if os.path.isdir(prodigal_dir) and force:
        shutil.rmtree(prodigal_dir)
        logger.warning("Prodigal results folder already exists, but is removed because "
                       "--force option was used.")

    # Training file can be "small option", meaning that we did not use the training mode.
    # If not "small option", we used the training mode. If training file does not exist
    # and prodigal result directory neither, return False: we cannot annotate using nothing.
    # Happens if there was a problem while training
    if (gpath_train != "small option" and not os.path.isfile(gpath_train)
            and not os.path.isdir(prodigal_dir)):
        return False

    logger.log(utils.detail_lvl(), f"Start annotating {name} (from {gpath} sequence) "
                                   "with Prodigal")

    # If prodigal results dir already exists (meaning user did not want to force,
    # otherwise it would have been deleted just before),
    # can we use it for next step ? -> check content.
    if os.path.isdir(prodigal_dir):
        logger.warning(f"Prodigal results folder {prodigal_dir} already exists.")
        ok = check_prodigal(gpath, name, prodigal_dir, logger)
        # If everything ok in the result dir, do not rerun prodigal,
        # use those results for next step (formatting)
        if ok:
            logger.log(utils.detail_lvl(), "Prodigal did not run again. "
                       "Formatting step will use already generated results of "
                       "Prodigal in {}. If you want to re-run Prodigal, first "
                       "remove this result folder, or use '-F' or '--force' "
                       "option.".format(prodigal_dir))
            logger.log(utils.detail_lvl(), f"End annotating {name} (from {gpath})")
        # If missing files, or other problems in result dir, warning message,
        # ask user to check or remove this folder.
        else:
            logger.warning("Problems in the files contained in your already existing output dir "
                           f"({prodigal_dir}). Please check it, or remove it to "
                           "re-annotate.")
        # If everything was ok -> everything is ready for next step -> return True
        # If something is wrong -> cannot use those results, genome won't be annotated
        # -> return False
        return ok

    # Here, prodigal result dir does not exist (never existed, or removed because user
    # asked to force). Create it, as it is not automatically created by prodigal.
    os.makedirs(prodigal_dir)

    # prodigal_dir is empty and ready to get prodigal results
    basic_outname = os.path.join(prodigal_dir, name)
    error = f"Error while trying to run prodigal. See {prodigal_logfile_err}."
    if gpath_train == "small option":
        training = "-p meta"
    else:
        training = f"-t {gpath_train}"
    cmd = (f"prodigal -i {gpath} -d {basic_outname + '.ffn'} -a {basic_outname + '.faa'} "
           f"-f gff -o {basic_outname + '.gff'} {training} -q")

    # Context managers guarantee both log files are closed even if run_cmd raises
    with open(prodigal_logfile, "w") as prodigalf, \
            open(prodigal_logfile_err, "w") as prodigalferr:
        logger.log(utils.detail_lvl(), "Prodigal command: " + cmd)
        ret = utils.run_cmd(cmd, error, eof=False, stderr=prodigalferr, stdout=prodigalf,
                            logger=logger)

    # Other callers of utils.run_cmd in this file guard against it returning a bare int
    # instead of a process object; do the same here rather than fail on '.returncode'.
    returncode = ret if isinstance(ret, int) else ret.returncode
    if returncode != 0:
        return False
    logger.log(utils.detail_lvl(), f"End annotating {name} (from {gpath})")
    return True
def add_missing_genomes(align_file, ali_type, miss_file, num_fam, ngenomes, status1, logger):
    """
    Complete the alignment of a family with gap-only entries for its missing genomes.

    The alignment file is first checked with ``check_add_missing(..., prev=True)``.
    According to the result of this check, the file is either left untouched, or one
    entry only made of '-' is appended for each genome listed in `miss_file`, and the
    completed file is checked again with ``prev=False``.

    Parameters
    ----------
    align_file : str
        path to the file containing the alignment (proteins if it is the mafft output,
        nucleic sequences if it is the back-translated alignment)
    ali_type : str
        protein or backtranslated. Only used in log messages
    miss_file : str
        path to the file listing the genomes missing in this family, one per line
    num_fam : int
        family number
    ngenomes : int
        total number of genomes in dataset
    status1 : bool or str
        status of the previous steps:

        - "OK" if alignments were not redone because they already were as expected.
          If the alignment is already complete, a warning tells that the existing
          file is used.
        - True if alignment and back-translation have just been done (no warning).
        - False is not expected here (this function is not called in that case).
    logger : logging.Logger
        the logger, having a queue Handler, to give logs to the main logger

    Returns
    -------
    bool or str

        - "OK" if the alignment was already complete and `status1` is "OK"
        - True if the alignment was already complete and `status1` is not "OK"
        - False if the first check reported a problem in the alignment
          (missing genomes are then not added)
        - otherwise, the result of the check done after adding missing genomes
    """
    # With prev=True, the check gives True (alignment complete), False (problem in the
    # alignment) or the length of the aligned sequences (some genomes are missing).
    first_check = check_add_missing(align_file, num_fam, ngenomes, logger, prev=True)

    # Problem in the existing alignment: nothing can be added to it
    if first_check is False:
        return False

    # Alignment already contains everything: only tell if it comes from a previous run
    if first_check is True:
        if status1 != "OK":
            return True
        logger.warning(f"{ali_type} alignment already done for family {num_fam}. "
                       "The program will use it for next steps")
        return "OK"

    # Here, first_check is the alignment length: append 1 gap-only entry per missing genome
    logger.log(utils.detail_lvl(),
               f"Adding missing genomes for family {num_fam} in {ali_type} alignment.")
    with open(miss_file, "r") as missing, open(align_file, "a") as alignment:
        alignment.writelines(f">{line.strip()}\n{'-' * first_check}\n" for line in missing)

    # With prev=False, the check can only answer True (all ok) or False (problem)
    return check_add_missing(align_file, num_fam, ngenomes, logger, prev=False)
def run_prokka(arguments):
    """
    Run prokka for the given genome.

    If a prokka result folder already exists for this genome and `force` is False,
    prokka is not run again: the content of the folder is checked, and the result of
    this check is returned. If `force` is True, the existing folder is removed and
    prokka is run.

    Parameters
    ----------
    arguments : tuple
        (gpath, prok_folder, cores_annot, name, force, nbcont, small, q) with:

        * gpath: path and filename of genome to annotate
        * prok_folder: path to folder where all prokka folders for all genomes are saved
        * cores_annot: how many cores can use prokka
        * name: output name of annotated genome
        * force: True if force run (override existing files), False otherwise
        * nbcont: number of contigs in the input genome, to check prokka results
        * small: used for prodigal, if sequences to annotate are small. Not used here
        * q : queue where logs are put

    Returns
    -------
    boolean
        True if everything went well (all needed output files present,
        corresponding numbers of proteins, genes etc.). False otherwise.
    """
    gpath, prok_folder, threads, name, force, nbcont, _, q = arguments

    # Set logger for this process: all records are sent to the queue
    qh = logging.handlers.QueueHandler(q)
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    root.handlers = []
    logging.addLevelName(utils.detail_lvl(), "DETAIL")
    root.addHandler(qh)
    logger = logging.getLogger('annotate.run_prokka')
    logger.log(utils.detail_lvl(), f"Start annotating {name} from {gpath} with Prokka")

    # Define prokka directory and logfile
    prok_dir = os.path.join(prok_folder, os.path.basename(gpath) + "-prokkaRes")
    prok_logfile = os.path.join(prok_folder, os.path.basename(gpath) + "-prokka.log")

    if os.path.isdir(prok_dir):
        # Result dir already exists and user does not want to force:
        # check if we can use it for next step or not
        if not force:
            logger.warning(f"Prokka results folder {prok_dir} already exists.")
            ok = check_prokka(prok_dir, prok_logfile, name, gpath, nbcont, logger)
            if ok:
                # Everything ok in the result dir: do not rerun prokka,
                # use those results for next step (formatting)
                logger.log(utils.detail_lvl(), "Prokka did not run again, "
                           "formatting step used already generated results of "
                           f"Prokka in {prok_dir}. If you want to re-run prokka, first "
                           "remove this result folder, or use '-F' or '--force' "
                           "option if you want to rerun prokka for all genomes.")
                logger.log(utils.detail_lvl(), f"End annotating {name} {gpath}")
            else:
                # Missing files, or other problems in result dir:
                # ask user to force or remove this folder.
                logger.warning("Problems in the files contained in your already existing "
                               f"output dir ({prok_dir}). Please check it, or remove it to "
                               "re-annotate.")
            # ok -> everything is ready for next step
            # not ok -> cannot use those results, genome won't be annotated
            return ok
        # Result dir exists but user wants to force: remove it
        shutil.rmtree(prok_dir)
        logger.warning("Prokka results folder already exists, but removed because --force option "
                       "used")

    # From here, prokka output dir does not exist (never existed, or just removed)
    # -> run prokka
    cmd = (f"prokka --outdir {prok_dir} --cpus {threads} "
           f"--prefix {name} --centre prokka {gpath}")
    error = f"Error while trying to run prokka on {name} from {gpath}"
    logger.log(utils.detail_lvl(), "Prokka command: " + cmd)
    # The log file must be closed (even if run_cmd raises) before check_prokka gets its path
    with open(prok_logfile, "w") as prokf:
        ret = utils.run_cmd(cmd, error, eof=False, stderr=prokf, logger=logger)
    if ret.returncode != 0:
        return False
    ok = check_prokka(prok_dir, prok_logfile, name, gpath, nbcont, logger)
    logger.log(utils.detail_lvl(), f"End annotating {name} from {gpath}.")
    return ok
def main(cmd, lstinfo, name, dbpath, min_id, outdir, clust_mode, spe_dir, threads, outfile=None,
         verbose=0, quiet=False):
    """
    Build the pangenome of a dataset. Steps done, in this order:

    - check that mmseqs is available, create output directory and initialize logger
    - build the bank of all proteins (``protf.build_prt_bank``)
    - compute the families and write the pangenome file (``mmf.run_all_pangenome``)
    - post-treat the pangenome: summary and matrices (``pt.post_treat``)

    Parameters
    ----------
    cmd : str
        command line used to launch the program. Only written to the logs
    lstinfo : str
        file with name of genomes to consider for pan in the first column, without
        extension. Other columns are ignored. The first column header must be
        'gembase_name'
    name : str
        name given to the dataset. For example, ESCO44 for 44 *Escherichia coli* genomes.
    dbpath : str
        path to the folder containing all protein files (files called as the name of
        genome given in lstinfo + ".prt"
    min_id : float
        Minimum percentage of identity between 2 proteins to put them in the same family
    outdir : str
        path to folder which will contain pangenome results and tmp files
    clust_mode : [0, 1, 2]
        0 for 'set cover', 1 for 'single-linkage', 2 for 'CD-Hit'
    spe_dir : str or None
        path to the folder where concatenated bank of proteins must be saved.
        None to use the same folder as protein files
    threads : int
        Max number of threads to use
    outfile : str or None
        Name of the pangenome. None to use the default name
    verbose : int
        verbosity:

        - default 0 : stdout contains INFO, stderr contains ERROR.
        - 1: stdout contains INFO, stderr contains WARNING and ERROR
        - 2: stdout contains (DEBUG), DETAIL and INFO, stderr contains WARNING and ERROR
        - >=15: Add DEBUG in stdout
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise

    Returns
    -------
    panfile
        second value returned by ``mmf.run_all_pangenome``
    """
    # Packages only needed by this function
    import logging
    from PanACoTA import utils
    from PanACoTA.pangenome_module import protein_seq_functions as protf
    from PanACoTA.pangenome_module import mmseqs_functions as mmf
    from PanACoTA.pangenome_module import post_treatment as pt
    from PanACoTA import __version__ as version

    # Nothing can be done without mmseqs in the path
    mmseqs_found = utils.check_installed("mmseqs")
    if not mmseqs_found:  # pragma: no cover
        print("mmseqs is not installed. 'PanACoTA pangenome' cannot run.")
        sys.exit(1)
    os.makedirs(outdir, exist_ok=True)

    # Minimum level of the messages kept by the logger
    if verbose <= 1:
        # verbose = 0 or 1: ignore details and debug, start from info
        log_level = logging.INFO
    elif 2 <= verbose < 15:
        # ignore only debug
        log_level = utils.detail_lvl()  # int corresponding to detail level
    elif verbose >= 15:
        # write everything
        log_level = logging.DEBUG

    # Log files are written in outdir, and named according to the dataset
    log_base = os.path.join(outdir, "PanACoTA-pangenome_" + name)
    utils.init_logger(log_base, log_level, '', verbose=verbose, quiet=quiet, log_details=True)
    pan_logger = logging.getLogger("pangenome")
    pan_logger.info(f'PanACoTA version {version}')
    pan_logger.info("Command used\n \t > " + cmd)

    # Bank with all proteins to include in the pangenome
    bank_path = protf.build_prt_bank(lstinfo, dbpath, name, spe_dir, quiet)
    # Pangenome itself
    fams, pan_path = mmf.run_all_pangenome(min_id, clust_mode, outdir, bank_path,
                                           threads, outfile, quiet)
    # Matrices pan_quali, pan_quanti and summary file
    pt.post_treat(fams, pan_path)
    pan_logger.info("DONE")
    return pan_path
def main(cmd, args_all, args_prepare, args_annot, args_pan, args_corepers, args_align, args_tree):
    """
    Call all modules, one by one, using output of one as input for the next one

    Parameters
    ----------
    cmd : str
        command line used to launch the program
    args_all : tuple
        arguments common to all modules:
        output directory (str), threads (int), verbose (int), quiet (bool)
    args_prepare : tuple
        arguments for prepare module (see subcommands.prepare.py), in this order:
        NCBI_species_name (str), NCBI_species_taxid (int), NCBI_taxid (int),
        NCBI_strains (str), levels (str), NCBI_section (str), tmp_dir (str),
        norefseq (bool), db_dir (str), only_mash (bool), info_file (str), l90 (int),
        nbcont (int), cutn (int), min_dist (float), max_dist (float)
    args_annot : tuple
        arguments for annotate module (see subcommands/annotate.py):
        name (str), qc_only (bool), date (str), prodigal_only (bool), small (bool)
    args_pan : tuple
        arguments for pangenome module (see subcommands/pangenome.py):
        min_id (float), clust_mode (int), spe_dir (str), outfile (str)
    args_corepers : tuple
        arguments for corepers module (see subcommands.corepers.py):
        tol (float), mixed (bool), multi (bool), floor (bool)
    args_align : tuple or bool
        arguments for align module (see subcommands.align.py):
        prot_ali (bool). Accepted either as a 1-element tuple, or as the bare value
    args_tree : tuple
        arguments for tree module (see subcommands.tree.py):
        soft (str), model (str), boot (bool), write_boot (bool), memory (str), fast (bool)

    Returns
    -------
    str or int
        "QC_only done" if only the quality control was asked in annotate arguments
        (following modules are then not run), 0 once all modules have been run.
    """
    outdir, threads, verbose, quiet = args_all
    os.makedirs(outdir, exist_ok=True)

    # Initialize logger
    import logging
    # set level of logger: level is the minimum level that will be considered.
    # for verbose = 0 or 1, ignore details and debug, start from info
    if verbose <= 1:
        level = logging.INFO
    # for verbose = 2, ignore only debug
    elif 2 <= verbose < 15:
        level = utils.detail_lvl()  # int corresponding to detail level
    # for verbose >= 15, write everything
    elif verbose >= 15:
        level = logging.DEBUG
    logfile_base = os.path.join(outdir, "PanACoTA-all_modules")
    utils.init_logger(logfile_base, level, name='all_modules', verbose=verbose, quiet=quiet)
    logger = logging.getLogger('all_modules')
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)

    # Run prepare module
    outdir_prepare = os.path.join(outdir, "1-prepare_module")
    (NCBI_species_name, NCBI_species_taxid, NCBI_taxid, NCBI_strains, levels, NCBI_section,
     tmp_dir, norefseq, db_dir, only_mash, info_file, l90, nbcont, cutn,
     min_dist, max_dist) = args_prepare
    logger.info("prepare step")
    info_file = prepare.main("PanACoTA prepare", NCBI_species_name, NCBI_species_taxid,
                             NCBI_taxid, NCBI_strains, levels, NCBI_section,
                             outdir_prepare, tmp_dir, threads, norefseq, db_dir, only_mash,
                             info_file, l90, nbcont, cutn, min_dist, max_dist,
                             verbose, quiet)

    # Run annotate module, starting from the info file given by prepare module
    list_file = ""
    db_path = ""
    tmp_dir = ""
    force = False
    outdir_annotate = os.path.join(outdir, "2-annotate_module")
    (name, qc_only, date, prodigal_only, small) = args_annot
    res_annot_dir = None
    logger.info("annotate step")
    lstinfo, nbgenomes = annotate.main("PanACoTA annotate", list_file, db_path,
                                       outdir_annotate, name, date, l90, nbcont, cutn,
                                       threads, force, qc_only, info_file, tmp_dir,
                                       res_annot_dir, verbose, quiet,
                                       prodigal_only=prodigal_only, small=small)
    if qc_only:
        return "QC_only done"

    # Pangenome step
    name_pan = f"{name}_{nbgenomes}"
    outdir_pan = os.path.join(outdir, "3-pangenome_module")
    dbpath = os.path.join(outdir_annotate, "Proteins")
    (min_id, clust_mode, spe_dir, outfile) = args_pan
    logger.info("pangenome step")
    panfile = pangenome.main("PanACoTA pangenome", lstinfo, name_pan, dbpath, min_id,
                             outdir_pan, clust_mode, spe_dir, threads, outfile,
                             verbose=verbose, quiet=quiet)

    # Coregenome step
    outdir_corpers = os.path.join(outdir, "4-corepers_module")
    logger.info("corepers step")
    (tol, mixed, multi, floor) = args_corepers
    lstinfo_file = ""  # include all genomes in core
    corepers_file = corepers.main("PanACoTA corepers", panfile, tol, multi, mixed,
                                  outdir_corpers, lstinfo_file, floor, verbose, quiet)

    # Align step
    outdir_align = os.path.join(outdir, "5-align_module")
    force = False
    logger.info("align step")
    # '(prot_ali) = args_align' is a plain assignment, not a tuple unpacking: with a
    # 1-element tuple, prot_ali would be a non-empty tuple, hence always true.
    # Accept both the tuple and the bare value.
    if isinstance(args_align, (tuple, list)):
        (prot_ali,) = args_align
    else:
        prot_ali = args_align
    align_file = align.main("PanACoTA align", corepers_file, lstinfo, name_pan,
                            outdir_annotate, outdir_align, prot_ali, threads, force,
                            verbose=verbose, quiet=quiet)

    # Tree step
    (soft, model, boot, write_boot, memory, fast) = args_tree
    outdir_tree = os.path.join(outdir, "6-tree_module")
    logger.info("tree step")
    tree.main("PanACoTA tree", align_file, outdir_tree, soft, model, threads, boot,
              write_boot, memory, fast, verbose=verbose, quiet=quiet)
    logger.info("All modules of PanACOTA are finished.")
    return 0
def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn=5,
         threads=1, force=False, qc_only=False, from_info=None, tmp_dir=None,
         res_annot_dir=None, verbose=0, quiet=False, prodigal_only=False, small=False):
    """
    Main method, doing all steps:

    1. analyze genomes (nb contigs, L90, rows of N...)
    2. keep only genomes with 'good' (according to user thresholds) L90 and nb_contigs
    3. rename genomes with strain number in decreasing quality
    4. annotate genome with prokka or only prodigal
    5. format annotated genomes

    If option '-Q': ends at step 2.
    If option '--info <genome_info file name>' option: starts at step 2

    Parameters
    ----------
    cmd : str
        command line used to launch this program
    list_file : str
        file containing the list of genome files, 1 genome per line, separated by a space
        if a genome is split in several fasta files. This file can also specify date
        and/or species information, according to the format described in documentation.
        If empty, `from_info` is used instead
    db_path : str
        Path to the folder containing all the fasta files which will be annotated
    res_dir : str
        Path to the folder which will contain result folders and files
    name : str
        4 alpha numeric characters, describing the species (for example ESCO). Used by
        default if no species name is given in list_file line.
    date : str
        4 alpha numeric characters, defining the default date, for strains where it is
        not specified in the list_file
    l90 : int
        Max L90 allowed to keep a genome
    nbcont : int
        Max number of contigs allowed to keep a genome
    cutn : int
        cut each time there are at least cutn 'N' in a row. Don't cut if equal to 0
    threads : int
        max number of threads to use
    force : bool
        If True, overwrite previous results, if False keep what is already calculated
    qc_only : bool
        If True, do only quality control, if False, also do annotation
    from_info : str
        File containing information on genomes and their quality information
        (from prepare step)
    tmp_dir : str or None
        Path to folder where tmp files must be saved. None to use the default tmp folder
        (<res_dir>/tmp_files)
    res_annot_dir : str or None
        Path to folder where are the prokka/prodigal result folders for the genomes.
        None to use `tmp_dir`
    verbose : int
        verbosity:

        - default 0 : stdout contains INFO, stderr contains ERROR.
        - 1: stdout contains INFO, stderr contains WARNING and ERROR
        - 2: stdout contains (DEBUG), DETAIL and INFO, stderr contains WARNING and ERROR
        - >=15: Add DEBUG in stdout
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise
    prodigal_only : bool
        True -> run only prodigal. False -> run prokka
    small : bool
        True -> use -p meta option with prodigal

    Returns
    -------
    (outlst, nb_genomes) : tuple
        with:

        - outlst: value returned by ``utils.write_lstinfo`` (presumably the path to the
          LSTINFO file written -- to confirm in utils). Empty string if no genome was
          kept for annotation, or if only the quality control was done
        - nb_genomes: number of genomes kept, minus the ones skipped during annotation
          or format steps. 0 if no genome was kept, or if only quality control was done

    The program exits with status 1 if the needed annotation software is not installed,
    if no genome of `list_file` is found, or if no genome was correctly annotated.
    """
    # import needed packages
    import shutil
    import logging
    from PanACoTA.annotate_module import genome_seq_functions as gfunc
    from PanACoTA.annotate_module import annotation_functions as pfunc
    from PanACoTA.annotate_module import general_format_functions as ffunc
    from PanACoTA import utils
    from PanACoTA import __version__ as version

    # Check that needed softs are installed
    prokka = utils.check_installed("prokka")
    prodigal = utils.check_installed("prodigal")
    soft = "prodigal" if prodigal_only else "prokka"

    if not qc_only:  # pragma: no cover
        # If user using prokka: check prokka is installed and in the path
        if not prodigal_only and not prokka:
            print(
                "Prokka is not installed. 'PanACoTA annotate' cannot run. Install prokka "
                "to be able to annotate genomes. If you only need syntactical annotation, "
                "check that prodigal is installed, and add '--prodigal' option."
            )
            sys.exit(1)
        if prodigal_only and not prodigal:
            print(
                "Prodigal is not installed. 'PanACoTA annotate' cannot run. Install "
                "prodigal to be able to annotate genomes. If you also need functional "
                "annotation, check that prokka is installed, and remove '--prodigal' "
                "option.")
            sys.exit(1)

    # By default, all tmp files (split sequences, renamed sequences, prokka/prodigal results)
    # will be saved in the given <res_dir>/tmp_files.
    # Create output (results, tmp...) directories if not already existing
    if not tmp_dir:
        tmp_dir = os.path.join(res_dir, "tmp_files")
    if not res_annot_dir:
        res_annot_dir = tmp_dir
    os.makedirs(res_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)
    os.makedirs(res_annot_dir, exist_ok=True)

    # If force was set, remove result folders (Proteins, Replicons, Genes, LSTINFO, gff)
    if force:
        for result_folder in ("LSTINFO", "Proteins", "Genes", "Replicons", "gff3"):
            shutil.rmtree(os.path.join(res_dir, result_folder), ignore_errors=True)
    # If not --force, check that result folders do not already contain results
    else:
        utils.check_out_dirs(res_dir)

    # get only filename of list_file, without extension
    # (if no list_file is given, info file is used instead)
    if not list_file:
        list_file = from_info
    listfile_base = os.path.basename(os.path.splitext(list_file)[0])

    # Initialize logger
    # set level of logger: level is the minimum level that will be considered.
    # for verbose = 0 or 1, ignore details and debug, start from info
    if verbose <= 1:
        level = logging.INFO
    # for verbose = 2, ignore only debug
    elif 2 <= verbose < 15:
        level = utils.detail_lvl()  # int corresponding to detail level
    # for verbose >= 15, write everything
    elif verbose >= 15:
        level = logging.DEBUG
    logfile_base = os.path.join(res_dir, "PanACoTA-annotate_" + listfile_base)
    logfile_base = utils.init_logger(logfile_base, level, name='annotate', log_details=True,
                                     verbose=verbose, quiet=quiet)
    logger = logging.getLogger('annotate')
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)

    # STEP 1. analyze genomes (nb contigs, L90, rows of N...)
    # If already info on genome ('--info <file>' option), skip this step
    # If no info on genomes, read them and get needed information
    if not from_info:
        # Read genome names.
        # genomes = {genome: [spegenus.date]}
        genomes = utils.read_genomes(list_file, name, date, db_path, tmp_dir, logger)
        if not genomes:
            logger.error(
                ("We did not find any genome listed in {} in the folder {}. "
                 "Please check your list to give valid genome "
                 "names.").format(list_file, db_path))
            sys.exit(1)
        # Get L90, nbcontig, size for all genomes, and cut at row of cutn 'N' if asked
        # -> genome: [spegenus.date, orig_path, to_annotate_path, size, nbcont, l90]
        gfunc.analyse_all_genomes(genomes, db_path, tmp_dir, cutn, soft, logger, quiet=quiet)
    # --info <filename> option given: read information (L90, nb contigs...) from this file.
    else:
        # genomes = {genome: [spegenus.date, orig_path, to_annotate_path, size, nbcont, l90]}
        # orig_path is the path to the original sequence
        # and to_annotate_path the path to the sequence to annotate (once split etc.)
        # Here, both are the same, as we take given sequences as is.
        genomes = utils.read_genomes_info(from_info, name, date, logger)

    # STEP 2. keep only genomes with 'good' (according to user thresholds) L90 and nb_contigs
    # genomes = {genome: [spegenus.date, orig_seq, path_to_splitSequence, size, nbcont, l90]}
    # Plot L90 and nb_contigs distributions
    gfunc.plot_distributions(genomes, res_dir, listfile_base, l90, nbcont)
    # Get list of genomes kept (according to L90 and nbcont thresholds)
    kept_genomes = {
        genome: info
        for genome, info in genomes.items()
        if info[-2] <= nbcont and info[-1] <= l90
    }
    # Write discarded genomes to a file -> orig_name, to_annotate, gsize, nb_conts, L90
    utils.write_genomes_info(genomes, list(kept_genomes.keys()), list_file, res_dir)
    if not kept_genomes:
        logger.info("No genome kept for annotation.")
        return "", 0

    # Info on folder containing original sequences
    if not from_info:
        logger.info(
            f"-> Original sequences folder ('orig_name' column): {db_path} ")
        logger.info(
            f"\t-> If original sequence not found in {db_path}, "
            f"look for it in {tmp_dir}, as it must be a concatenation of several "
            "input sequence files.")
        if cutn == 0:
            logger.info(
                "-> Sequences used for annotation ('to_annotate' column) are the "
                "same as the previous ones (original sequences).")
        else:
            logger.info(
                f"-> Folder with sequence files that will be used for annotation "
                f"('to_annotate' column): {tmp_dir}")

    # If only QC, stop here.
    if qc_only:
        # Write information on genomes that would be annotated with the current
        # parameters if not QC_only:
        # orig_name, to_annotate, gsize, nb_conts, L90
        utils.write_genomes_info(genomes, [], list_file, res_dir, qc=True)
        logger.info("QC only done.")
        return "", 0

    # STEP 3. Rename genomes kept, ordered by decreasing quality
    first_gname = gfunc.rename_all_genomes(kept_genomes)
    # kept_genomes = {genome: [gembase_name, path_to_origfile, path_split_gembase,
    #                          gsize, nbcont, L90]}
    # first_gname = name of the first genome
    # Write lstinfo file (list of genomes kept with info on L90 etc.)
    outlst = utils.write_lstinfo(list_file, kept_genomes, res_dir)

    # STEP 4. Annotate all kept genomes
    results = pfunc.run_annotation_all(kept_genomes, threads, force, res_annot_dir,
                                       first_gname, prodigal_only, small=small, quiet=quiet)
    # Information on genomes to format
    # results_ok = {genome: [gembase_name, path_to_origfile, path_split_gembase,
    #                        gsize, nbcont, L90]}
    results_ok = {
        genome: info
        for genome, info in kept_genomes.items()
        if results[genome]
    }
    # If no genome was ok, no need to format them. Just print that no genome was annotated,
    # end program.
    if not results_ok:
        logger.error(
            "Error: No genome was correctly annotated, no need to format them."
        )
        sys.exit(1)
    # list of genomes skipped because annotation had problems: no format step run
    skipped = [genome for (genome, ok) in results.items() if not ok]
    # At least 1 genome was not annotated: write a message to warn on it
    if skipped:
        utils.write_warning_skipped(skipped, prodigal_only=prodigal_only,
                                    logfile=logfile_base)

    # STEP 5. Format genomes annotated
    # Here, we have at least 1 genome annotated (otherwise,
    # it would already have stopped because results_ok is empty)
    # Generate database (folders Proteins, Genes, Replicons, LSTINFO), and get the list
    # of genomes skipped because something went wrong while formatting.
    skipped_format = ffunc.format_genomes(results_ok, res_dir, res_annot_dir,
                                          prodigal_only, threads, quiet=quiet)
    # At least one genome could not be formatted -> warn user
    if skipped_format:
        utils.write_warning_skipped(skipped_format, do_format=True,
                                    prodigal_only=prodigal_only, logfile=logfile_base)
    logger.info("Annotation step done.")
    return outlst, len(kept_genomes) - len(skipped) - len(skipped_format)