def test_run_all_prodigal_error_train():
    """
    Prodigal training fails -> annotation must be reported as failed (False) for
    every genome, and only the training-related logs are emitted.

    The genome chosen for training here is H299_H561.fasta.
    """
    logger = my_logger("test_run_all_parallel_more_threads")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_all_parallel_more_threads')

    # Layout of each entry: {genome: [name, gpath, annot_path, size, nbcont, l90]}
    train_genome = "H299_H561.fasta"
    other_genome = "A_H738.fasta"
    train_path = os.path.join(GEN_PATH, train_genome)
    other_path = os.path.join(GEN_PATH, other_genome)
    genomes = {
        train_genome: ["test_runall_1by1_1", train_path, train_path, 12656, 3, 0],
        other_genome: ["test_runall_1by1_2", other_path, other_path, 456464645, 1, 465],
    }

    final = afunc.run_annotation_all(genomes, 8, False, GENEPATH, train_genome,
                                     prodigal_only=True, quiet=True)
    for gname in (train_genome, other_genome):
        assert not final[gname]

    # Logs are expected in this exact order
    expected_logs = (
        "Annotating all genomes with prodigal",
        ("Prodigal will train using "
         "test/data/annotate/genomes/H299_H561.fasta"),
        ("prodigal command: prodigal -i "
         "test/data/annotate/genomes/H299_H561.fasta -t "
         "test/data/annotate/generated_by_unit-tests/H299_H561.fasta.trn"),
        ("Error while trying to train prodigal on H299_H561.fasta. See "
         "test/data/annotate/generated_by_unit-tests/"
         "H299_H561.fasta.trn-prodigal-train.log.err."),
    )
    records = logger[0]
    assert records.qsize() == 4
    for expected in expected_logs:
        assert records.get().message == expected
def test_run_all_prodigal():
    """
    Prodigal-only annotation of two genomes, training on A_H738.fasta: both genomes
    must be reported as correctly annotated.

    Training logs come first, in a fixed order. The per-genome logs (start, command,
    end) can be interleaved in any order (ex: start1, start2, end2, end1), so they
    are only checked for presence.
    """
    logger = my_logger("test_run_all_parallel_more_threads")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_all_parallel_more_threads')

    # Layout of each entry: {genome: [name, gpath, annot_path, size, nbcont, l90]}
    first_genome = "H299_H561.fasta"
    second_genome = "A_H738.fasta"
    first_path = os.path.join(GEN_PATH, first_genome)
    second_path = os.path.join(GEN_PATH, second_genome)
    genomes = {
        first_genome: ["test_runall_1by1_1", first_path, first_path, 12656, 3, 0],
        second_genome: ["test_runall_1by1_2", second_path, second_path, 456464645, 1, 465],
    }

    final = afunc.run_annotation_all(genomes, 8, False, GENEPATH, second_genome,
                                     prodigal_only=True, quiet=True)
    assert final[first_genome]
    assert final[second_genome]

    records = logger[0]
    assert records.qsize() == 10

    # Training step: strictly ordered logs
    ordered_logs = (
        "Annotating all genomes with prodigal",
        "Prodigal will train using test/data/annotate/genomes/A_H738.fasta",
        ("prodigal command: prodigal -i "
         "test/data/annotate/genomes/A_H738.fasta -t "
         "test/data/annotate/generated_by_unit-tests/A_H738.fasta.trn"),
        "End training on test/data/annotate/genomes/A_H738.fasta",
    )
    for expected in ordered_logs:
        assert records.get().message == expected

    # Annotation step: genomes are processed in parallel, so only check that
    # every expected log is there, whatever its position.
    messages = [records.get().message for _ in range(6)]
    unordered_logs = (
        # Start annotation
        ("Start annotating test_runall_1by1_1 "
         "(from test/data/annotate/genomes/H299_H561.fasta sequence) "
         "with Prodigal"),
        ("Start annotating test_runall_1by1_2 "
         "(from test/data/annotate/genomes/A_H738.fasta sequence) "
         "with Prodigal"),
        # Prodigal command lines
        ("Prodigal command: prodigal -i test/data/annotate/genomes/H299_H561.fasta "
         "-d test/data/annotate/generated_by_unit-tests/H299_H561.fasta-prodigalRes/"
         "test_runall_1by1_1.ffn -a test/data/annotate/generated_by_unit-tests/"
         "H299_H561.fasta-prodigalRes/test_runall_1by1_1.faa -f gff "
         "-o test/data/annotate/generated_by_unit-tests/"
         "H299_H561.fasta-prodigalRes/test_runall_1by1_1.gff -t "
         "test/data/annotate/generated_by_unit-tests/A_H738.fasta.trn -q"),
        ("Prodigal command: prodigal -i test/data/annotate/genomes/A_H738.fasta "
         "-d test/data/annotate/generated_by_unit-tests/A_H738.fasta-prodigalRes/"
         "test_runall_1by1_2.ffn -a test/data/annotate/generated_by_unit-tests/"
         "A_H738.fasta-prodigalRes/test_runall_1by1_2.faa -f gff "
         "-o test/data/annotate/generated_by_unit-tests/A_H738.fasta-prodigalRes/"
         "test_runall_1by1_2.gff -t "
         "test/data/annotate/generated_by_unit-tests/A_H738.fasta.trn -q"),
        # End annotation
        ("End annotating test_runall_1by1_1 (from test/data/annotate/genomes/"
         "H299_H561.fasta)"),
        ("End annotating test_runall_1by1_2 (from test/data/annotate/genomes/"
         "A_H738.fasta)"),
    )
    for expected in unordered_logs:
        assert expected in messages
def test_run_all_parallel_prokka_more_threads():
    """
    Prokka annotation with more threads than genomes (6 threads for 2 genomes:
    the logged prokka commands use '--cpus 3' for each genome).

    H299_H561.fasta must be reported as correctly annotated, genome1.fasta as failed.
    """
    logger = my_logger("test_run_all_parallel_more_threads")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_all_4threads')

    # Layout of each entry: {genome: [name, gpath, annot_path, size, nbcont, l90]}
    good_genome, bad_genome = "H299_H561.fasta", "genome1.fasta"
    good_path = os.path.join(GEN_PATH, good_genome)
    bad_path = os.path.join(GEN_PATH, bad_genome)
    genomes = {
        good_genome: ["test_runall_1by1_1", good_path, good_path, 12656, 3, 1],
        bad_genome: ["test_runall_1by1_2", bad_path, bad_path, 456464645, 4, 1],
    }

    final = afunc.run_annotation_all(genomes, 6, False, GENEPATH, "nofile.trn")
    assert final[good_genome]
    assert not final[bad_genome]

    # Expected number of logs:
    #  - 1 introduction log
    #  - 3 for the genome ok: start annotate, prokka cmd, end annotate
    #  - 4 for the genome not ok: start annotate, prokka cmd, problem, end annotate
    records = logger[0]
    assert records.qsize() == 8
    assert records.get().message == "Annotating all genomes with prokka"

    # Genomes are processed in parallel: the remaining logs can come in any order,
    # so only their presence is checked.
    messages = [records.get().message for _ in range(7)]
    expected_logs = (
        # Start annotation
        ("Start annotating test_runall_1by1_1 "
         "from test/data/annotate/genomes/H299_H561.fasta "
         "with Prokka"),
        ("Start annotating test_runall_1by1_2 "
         "from test/data/annotate/genomes/genome1.fasta "
         "with Prokka"),
        # Prokka command lines
        ("Prokka command: prokka --outdir test/data/annotate/generated_by_unit-tests/"
         "H299_H561.fasta-prokkaRes --cpus 3 --prefix test_runall_1by1_1 "
         "--centre prokka test/data/annotate/genomes/H299_H561.fasta"),
        ("Prokka command: prokka --outdir test/data/annotate/generated_by_unit-tests/"
         "genome1.fasta-prokkaRes --cpus 3 --prefix test_runall_1by1_2 "
         "--centre prokka test/data/annotate/genomes/genome1.fasta"),
        # End annotation
        ("End annotating test_runall_1by1_1 from "
         "test/data/annotate/genomes/H299_H561.fasta."),
        ("End annotating test_runall_1by1_2 from "
         "test/data/annotate/genomes/genome1.fasta."),
        # Error on the failing genome
        "test_runall_1by1_2 genome1.fasta: several .faa files",
    )
    for expected in expected_logs:
        assert expected in messages
def test_run_all_prokka_parallel_less_threads():
    """
    Prokka annotation with fewer threads than genomes (4 threads for 5 genomes:
    the logged prokka commands use '--cpus 2').

    H299_H561 and A_H738 must be reported as correctly annotated, while the three
    genome*.fasta files must be reported as failed (a 'several .faa files' error
    is logged for each of them).
    """
    logger = my_logger("test_run_all_parallel_more_threads")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_all_4threads')

    # Layout of each entry: {genome: [name, gpath, annot_path, size, nbcont, l90]}
    gnames = ["H299_H561.fasta", "A_H738.fasta",
              "genome1.fasta", "genome2.fasta", "genome3.fasta"]
    sizes = [12656, 456464645, 456464645, 456464645, 456464645]
    contig_counts = [3, 1, 4, 3, 1]
    genomes = {}
    for rank, (gname, size, nbcont) in enumerate(zip(gnames, sizes, contig_counts), start=1):
        gpath = os.path.join(GEN_PATH, gname)
        genomes[gname] = [f"test_runall_1by1_{rank}", gpath, gpath, size, nbcont, 1]

    final = afunc.run_annotation_all(genomes, 4, False, GENEPATH, "nofile.trn")
    assert final[gnames[0]]
    assert final[gnames[1]]
    assert not final[gnames[2]]
    assert not final[gnames[3]]
    assert not final[gnames[4]]

    # Expected number of logs:
    #  - 1 introduction log
    #  - 3 per genome ok (2 first ones): start annotate, prokka cmd, end annotate -> 6
    #  - 4 per genome not ok (3 others): start annotate, prokka cmd, problem,
    #    end annotate -> 12
    records = logger[0]
    assert records.qsize() == 19
    assert records.get().message == "Annotating all genomes with prokka"

    # Genomes are processed in parallel: the remaining logs can come in any order,
    # so only the presence of a sample of them is checked.
    messages = [records.get().message for _ in range(18)]
    expected_logs = (
        # Start annotation
        ("Start annotating test_runall_1by1_1 "
         "from test/data/annotate/genomes/H299_H561.fasta "
         "with Prokka"),
        ("Start annotating test_runall_1by1_2 "
         "from test/data/annotate/genomes/A_H738.fasta "
         "with Prokka"),
        ("Start annotating test_runall_1by1_4 "
         "from test/data/annotate/genomes/genome2.fasta "
         "with Prokka"),
        # Prokka command lines
        ("Prokka command: prokka --outdir test/data/annotate/generated_by_unit-tests/"
         "H299_H561.fasta-prokkaRes --cpus 2 --prefix test_runall_1by1_1 "
         "--centre prokka test/data/annotate/genomes/H299_H561.fasta"),
        ("Prokka command: prokka --outdir test/data/annotate/generated_by_unit-tests/"
         "A_H738.fasta-prokkaRes --cpus 2 --prefix test_runall_1by1_2 "
         "--centre prokka test/data/annotate/genomes/A_H738.fasta"),
        ("Prokka command: prokka --outdir test/data/annotate/generated_by_unit-tests/"
         "genome1.fasta-prokkaRes --cpus 2 --prefix test_runall_1by1_3 "
         "--centre prokka test/data/annotate/genomes/genome1.fasta"),
        # End annotation
        ("End annotating test_runall_1by1_1 from "
         "test/data/annotate/genomes/H299_H561.fasta."),
        ("End annotating test_runall_1by1_3 from "
         "test/data/annotate/genomes/genome1.fasta."),
        ("End annotating test_runall_1by1_5 from "
         "test/data/annotate/genomes/genome3.fasta."),
        # Errors on the failing genomes
        "test_runall_1by1_3 genome1.fasta: several .faa files",
        "test_runall_1by1_4 genome2.fasta: several .faa files",
        "test_runall_1by1_5 genome3.fasta: several .faa files",
    )
    for expected in expected_logs:
        assert expected in messages
def test_run_all_1by1_prokka():
    """
    Check that when running with 3 threads (not parallel), prokka runs as expected,
    and returns True for each genome
    -> Runs 1 by 1, with prokka using 3 cpus

    Start and end must be ordered: (start1, end1, start2, end2) or
    (start2, end2, start1, end1)
    """
    logger = my_logger("test_runall_1by1_1")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_all_1by1')
    # genomes = {genome: [name, gpath, annot_path, size, nbcont, l90]}
    genome1 = "H299_H561.fasta"
    gpath1 = os.path.join(GEN_PATH, genome1)
    genome2 = "A_H738.fasta"
    gpath2 = os.path.join(GEN_PATH, genome2)
    genomes = {
        genome1: ["test_runall_1by1_1", gpath1, gpath1, 12656, 3, 0],
        genome2: ["test_runall_1by1_2", gpath2, gpath2, 456464645, 1, 465]
    }
    threads = 3
    force = False
    trn_file = "nofile.trn"
    annot_folder = os.path.join(GENEPATH, "annot-folder")
    os.makedirs(annot_folder)
    final = afunc.run_annotation_all(genomes, threads, force, annot_folder, trn_file)
    assert final[genome1]
    assert final[genome2]
    q = logger[0]
    assert q.qsize() == 7
    assert q.get().message == 'Annotating all genomes with prokka'

    # Messages for start and end annotation of the different genomes.
    # The start messages use the same format as the one checked by the other prokka
    # tests of this file ("Start annotating <name> from <path> with Prokka").
    message_start_annot1 = (
        "Start annotating test_runall_1by1_1 "
        "from test/data/annotate/genomes/H299_H561.fasta "
        "with Prokka")
    message_cmd1 = (
        "Prokka command: prokka --outdir test/data/annotate/generated_by_unit-tests/"
        "annot-folder/H299_H561.fasta-prokkaRes --cpus 3")
    message_end_annot1 = (
        "End annotating test_runall_1by1_1 from test/data/annotate/genomes/"
        "H299_H561.fasta.")
    message_start_annot2 = (
        "Start annotating test_runall_1by1_2 "
        "from test/data/annotate/genomes/A_H738.fasta "
        "with Prokka")
    message_cmd2 = (
        "Prokka command: prokka --outdir test/data/annotate/generated_by_unit-tests/"
        "annot-folder/A_H738.fasta-prokkaRes --cpus 3")
    message_end_annot2 = (
        "End annotating test_runall_1by1_2 from test/data/annotate/genomes/"
        "A_H738.fasta.")

    # start message -> (expected beginning of the command log, expected end log)
    pending = {
        message_start_annot1: (message_cmd1, message_end_annot1),
        message_start_annot2: (message_cmd2, message_end_annot2),
    }
    # We do not know which genome is annotated first. But, as genomes are annotated
    # 1 by 1, a started genome must log its command and its end before the other one
    # starts.
    for _ in range(2):
        started = q.get().message
        # NOTE: 'started == msg1 or msg2' would always be true (a non-empty string is
        # truthy): use a real membership test. Popping the entry also ensures that
        # each genome is annotated exactly once.
        assert started in pending
        expected_cmd, expected_end = pending.pop(started)
        assert q.get().message.startswith(expected_cmd)
        assert q.get().message == expected_end
def test_run_all_prodigal_outexists_error():
    """
    Prodigal-only annotation, without force, when the training file and the
    prodigal result folders already exist.

    The result folders are created empty by the test, so errors are expected while
    checking them, and both genomes must be reported as failed.
    """
    logger = my_logger("test_run_all_parallel_more_threads")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_all_parallel_more_threads')

    # Layout of each entry: {genome: [name, gpath, annot_path, size, nbcont, l90]}
    # Here, paths are just the genome file names.
    train_genome = "toto.fasta"
    other_genome = "A_H738.fasta"
    genomes = {
        train_genome: ["test_runall_1by1_1", train_genome, train_genome, 12656, 3, 0],
        other_genome: ["test_runall_1by1_2", other_genome, other_genome, 456464645, 1, 465],
    }

    # Pre-create (empty) prodigal result directories for both genomes
    for res_folder in ("A_H738.fasta-prodigalRes", "toto.fasta-prodigalRes"):
        os.makedirs(os.path.join(GENEPATH, res_folder))

    # Put an already existing training file in the output directory
    shutil.copyfile(os.path.join(TEST_DIR, "A_H738-and-B2_A3_5.fna.trn"),
                    os.path.join(GENEPATH, "toto.fasta.trn"))

    final = afunc.run_annotation_all(genomes, 1, False, GENEPATH, train_genome,
                                     prodigal_only=True, quiet=False)
    assert not final[train_genome]
    assert not final[other_genome]

    records = logger[0]
    assert records.qsize() == 15

    # First logs come in a fixed order
    ordered_logs = (
        "Annotating all genomes with prodigal",
        "Prodigal will train using toto.fasta",
        ("A training file already exists (test/data/annotate/"
         "generated_by_unit-tests/toto.fasta.trn). It will "
         "be used to annotate all genomes."),
    )
    for expected in ordered_logs:
        assert records.get().message == expected

    # The order of the remaining logs is not checked: only their presence is.
    messages = [records.get().message for _ in range(12)]
    expected_logs = (
        # First genome
        ("Start annotating test_runall_1by1_1 "
         "(from toto.fasta sequence) with Prodigal"),
        ("Prodigal results folder test/data/annotate/generated_by_unit-tests/"
         "toto.fasta-prodigalRes already exists."),
        "test_runall_1by1_1 toto.fasta: no or several .faa file(s)",
        "test_runall_1by1_1 toto.fasta: no or several .ffn file(s)",
        "test_runall_1by1_1 toto.fasta: no or several .gff file(s)",
        ("Problems in the files contained in your already existing output dir "
         "(test/data/annotate/generated_by_unit-tests/toto.fasta-prodigalRes). "
         "Please check it, or remove it to re-annotate."),
        # Second genome
        ("Start annotating test_runall_1by1_2 "
         "(from A_H738.fasta sequence) with Prodigal"),
        ("Problems in the files contained in your already existing output dir "
         "(test/data/annotate/generated_by_unit-tests/A_H738.fasta-prodigalRes). "
         "Please check it, or remove it to re-annotate."),
    )
    for expected in expected_logs:
        assert expected in messages
def test_run_all_prodigal_train_exists_ok():
    """
    Prodigal-only annotation when a training file for the training genome
    (toto.fasta) already exists in the output directory: it is reused, as stated
    by the 'A training file already exists' log.

    Annotation of toto.fasta must then be reported as failed (an 'Error while
    trying to run prodigal' log is expected), while A_H738.fasta must be reported
    as correctly annotated.
    """
    logger = my_logger("test_run_prodigal_train_exist_error")
    utils.init_logger(LOGFILE_BASE, 0, 'test_run_prodigal_train_exist_error')

    # Layout of each entry: {genome: [name, gpath, annot_path, size, nbcont, l90]}
    train_genome = "toto.fasta"
    other_genome = "A_H738.fasta"
    train_path = os.path.join(GEN_PATH, train_genome)
    other_path = os.path.join(GEN_PATH, other_genome)
    genomes = {
        train_genome: ["test_runall_1by1_1", train_path, train_path, 12656, 3, 0],
        other_genome: ["test_runall_1by1_2", other_path, other_path, 456464645, 1, 465],
    }

    # Copy trn file to outdir, so that panacota detects that it already exists
    shutil.copyfile(os.path.join(TEST_DIR, "A_H738-and-B2_A3_5.fna.trn"),
                    os.path.join(GENEPATH, "toto.fasta.trn"))

    # Run annotation all
    final = afunc.run_annotation_all(genomes, 8, False, GENEPATH, train_genome,
                                     prodigal_only=True, quiet=False)
    assert not final[train_genome]
    assert final[other_genome]

    records = logger[0]
    assert records.qsize() == 9

    # First logs come in a fixed order
    ordered_logs = (
        "Annotating all genomes with prodigal",
        ("Prodigal will train using "
         "test/data/annotate/genomes/toto.fasta"),
        ("A training file already exists (test/data/annotate/"
         "generated_by_unit-tests/toto.fasta.trn). It will be used "
         "to annotate all genomes."),
    )
    for expected in ordered_logs:
        assert records.get().message == expected

    # Genomes are processed in parallel: the remaining logs can come in any order,
    # so only their presence is checked.
    messages = [records.get().message for _ in range(6)]
    expected_logs = (
        # Start annotation
        ("Start annotating test_runall_1by1_1 "
         "(from test/data/annotate/genomes/toto.fasta sequence) "
         "with Prodigal"),
        ("Start annotating test_runall_1by1_2 "
         "(from test/data/annotate/genomes/A_H738.fasta sequence) "
         "with Prodigal"),
        # Prodigal command lines
        ("Prodigal command: prodigal -i test/data/annotate/genomes/toto.fasta "
         "-d test/data/annotate/generated_by_unit-tests/toto.fasta-prodigalRes/"
         "test_runall_1by1_1.ffn -a test/data/annotate/generated_by_unit-tests/"
         "toto.fasta-prodigalRes/test_runall_1by1_1.faa -f gff "
         "-o test/data/annotate/generated_by_unit-tests/"
         "toto.fasta-prodigalRes/test_runall_1by1_1.gff -t "
         "test/data/annotate/generated_by_unit-tests/toto.fasta.trn -q"),
        ("Prodigal command: prodigal -i test/data/annotate/genomes/A_H738.fasta "
         "-d test/data/annotate/generated_by_unit-tests/A_H738.fasta-prodigalRes/"
         "test_runall_1by1_2.ffn -a test/data/annotate/generated_by_unit-tests/"
         "A_H738.fasta-prodigalRes/test_runall_1by1_2.faa -f gff "
         "-o test/data/annotate/generated_by_unit-tests/A_H738.fasta-prodigalRes/"
         "test_runall_1by1_2.gff -t "
         "test/data/annotate/generated_by_unit-tests/toto.fasta.trn -q"),
        # Failure of the first genome, normal end of the second one
        ("Error while trying to run prodigal. See "
         "test/data/annotate/generated_by_unit-tests/"
         "toto.fasta-prodigal.log.err."),
        ("End annotating test_runall_1by1_2 (from test/data/annotate/genomes/"
         "A_H738.fasta)"),
    )
    for expected in expected_logs:
        assert expected in messages
def main(cmd, list_file, db_path, res_dir, name, date, l90=100, nbcont=999, cutn=5,
         threads=1, force=False, qc_only=False, from_info=None, tmp_dir=None,
         res_annot_dir=None, verbose=0, quiet=False, prodigal_only=False, small=False):
    """
    Main method, doing all steps:

    1. analyze genomes (nb contigs, L90, rows of N...)
    2. keep only genomes with 'good' (according to user thresholds) L90 and nb_contigs
    3. rename genomes with strain number in decreasing quality
    4. annotate genome with prokka or only prodigal
    5. format annotated genomes

    If option '-Q': ends at step 2.
    If option '--info <genome_info file name>' option: starts at step 2

    verbosity:
    - default 0 : stdout contains INFO, stderr contains ERROR.
    - 1: stdout contains INFO, stderr contains WARNING and ERROR
    - 2: stdout contains (DEBUG), DETAIL and INFO, stderr contains WARNING and ERROR
    - >=15: Add DEBUG in stdout

    Parameters
    ----------
    cmd : str
        command line used to launch this program
    list_file : str
        file containing the list of genome files, 1 genome per line, separated by a space
        if a genome is split in several fasta files. This file can also specify date and/or
        species information, according to the format described in documentation.
        If empty/None, `from_info` is used in its place (log file name, output info files).
    db_path : str
        Path to the folder containing all the fasta files which will be annotated
    res_dir : str
        Path to the folder which will contain result folders and files
    name : str
        4 alpha numeric characters, describing the species (for example ESCO). Used by
        default if no species name is given in list_file line.
    date : str
        4 alpha numeric characters, defining the default date, for strains where it is not
        specified in the list_file
    l90 : int
        Max L90 allowed to keep a genome
    nbcont : int
        Max number of contigs allowed to keep a genome
    cutn : int
        cut each time there are at least cutn 'N' in a row. Don't cut if equal to 0
    threads : int
        max number of threads to use
    force : bool
        If True, overwrite previous results, if False keep what is already calculated
    qc_only : bool
        If True, do only quality control, if False, also do annotation
    from_info : str
        File containing information on genomes and their quality information
        (from prepare step)
    tmp_dir : str or None
        Path to folder where tmp files must be saved. None to use the default tmp folder
        (<res_dir>/tmp_files)
    res_annot_dir : str or None
        Path to folder where are the prokka/prodigal result folders for the genomes.
        None to use the default prokka/prodigal folder (same as tmp_dir)
    verbose : int
        verbosity:
        default (0): info in stdout, error and more in stderr
        1 = add warnings in stderr
        2 = like 1 + add DETAIL to stdout (by default only INFO)
        >15: add debug to stdout
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise
    prodigal_only : bool
        True -> run only prodigal. False -> run prokka
    small : bool
        True -> use -p meta option with prodigal

    Returns
    -------
    (outlst, nb_genomes) : tuple
        - outlst: value returned by ``utils.write_lstinfo`` (presumably the path to the
          written LSTINFO file -- to confirm in utils). Empty string if no genome was
          kept for annotation, or if only the quality control was asked (`qc_only`).
        - nb_genomes: number of kept genomes which were skipped neither at annotation step
          nor at format step. 0 if no genome was kept, or if `qc_only`.

    The program exits (``sys.exit(1)``) if the needed annotation software is not installed,
    if no genome of list_file is found in db_path, or if no genome could be annotated.
    """
    # import needed packages
    import shutil
    import logging
    from PanACoTA.annotate_module import genome_seq_functions as gfunc
    from PanACoTA.annotate_module import annotation_functions as pfunc
    from PanACoTA.annotate_module import general_format_functions as ffunc
    from PanACoTA import utils
    from PanACoTA import __version__ as version

    # Check that needed softs are installed
    prokka = utils.check_installed("prokka")
    prodigal = utils.check_installed("prodigal")
    soft = "prodigal" if prodigal_only else "prokka"

    if not qc_only:  # pragma: no cover
        # If user using prokka: check prokka is installed and in the path
        if not prodigal_only and not prokka:
            print(
                "Prokka is not installed. 'PanACoTA annotate' cannot run. Install prokka "
                "to be able to annotate genomes. If you only need syntactical annotation, "
                "check that prodigal is installed, and add '--prodigal' option."
            )
            sys.exit(1)
        # If user using only prodigal: check prodigal is installed and in the path
        if prodigal_only and not prodigal:
            print(
                "Prodigal is not installed. 'PanACoTA annotate' cannot run. Install "
                "prodigal to be able to annotate genomes. If you also need functional "
                "annotation, check that prokka is installed, and remove '--prodigal' "
                "option.")
            sys.exit(1)

    # By default, all tmp files (split sequences, renamed sequences, prokka/prodigal
    # results) will be saved in the given <res_dir>/tmp_files.
    # Create output (results, tmp...) directories if not already existing
    if not tmp_dir:
        tmp_dir = os.path.join(res_dir, "tmp_files")
    if not res_annot_dir:
        res_annot_dir = tmp_dir
    os.makedirs(res_dir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)
    os.makedirs(res_annot_dir, exist_ok=True)

    # If force was set, remove result folders (Proteins, Replicons, Genes, LSTINFO, gff)
    if force:
        for folder in ("LSTINFO", "Proteins", "Genes", "Replicons", "gff3"):
            shutil.rmtree(os.path.join(res_dir, folder), ignore_errors=True)
    # If not --force, check that result folders do not already contain results
    else:
        utils.check_out_dirs(res_dir)

    # get only filename of list_file, without extension.
    # Without list_file, the info file takes its place for all following steps.
    if not list_file:
        list_file = from_info
    listfile_base = os.path.basename(os.path.splitext(list_file)[0])

    # Initialize logger
    # set level of logger: level is the minimum level that will be considered.
    if verbose <= 1:
        level = logging.INFO
    # for verbose between 2 and 14, ignore only debug
    elif verbose < 15:
        level = utils.detail_lvl()  # int corresponding to detail level
    # for verbose >= 15, write everything
    else:
        level = logging.DEBUG
    logfile_base = os.path.join(res_dir, "PanACoTA-annotate_" + listfile_base)
    logfile_base = utils.init_logger(logfile_base, level, name='annotate',
                                     log_details=True, verbose=verbose, quiet=quiet)
    logger = logging.getLogger('annotate')
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)

    # STEP 1. analyze genomes (nb contigs, L90, rows of N...)
    # If already info on genome ('--info <file>' option), skip this step
    # If no info on genomes, read them and get needed information
    if not from_info:
        # Read genome names.
        # genomes = {genome: [spegenus.date]}
        genomes = utils.read_genomes(list_file, name, date, db_path, tmp_dir, logger)
        if not genomes:
            logger.error(
                f"We did not find any genome listed in {list_file} in the folder "
                f"{db_path}. Please check your list to give valid genome names.")
            sys.exit(1)
        # Get L90, nbcontig, size for all genomes, and cut at row of cutn 'N' if asked
        # -> genome: [spegenus.date, orig_path, to_annotate_path, size, nbcont, l90]
        gfunc.analyse_all_genomes(genomes, db_path, tmp_dir, cutn, soft, logger,
                                  quiet=quiet)
    # --info <filename> option given: read information (L90, nb contigs...) from this file.
    else:
        # genomes = {genome: [spegenus.date, orig_path, to_annotate_path, size, nbcont, l90]}
        # orig_path is the path to the original sequence
        # and to_annotate_path the path to the sequence to annotate (once split etc.)
        # Here, both are the same, as we take given sequences as is.
        genomes = utils.read_genomes_info(from_info, name, date, logger)

    # STEP 2. keep only genomes with 'good' (according to user thresholds) L90 and
    # nb_contigs
    # genomes = {genome: [spegenus.date, orig_seq, path_to_splitSequence, size, nbcont, l90]}
    # Plot L90 and nb_contigs distributions
    gfunc.plot_distributions(genomes, res_dir, listfile_base, l90, nbcont)
    # Get list of genomes kept (according to L90 and nbcont thresholds):
    # nbcont and L90 are the 2 last fields of each genome's information
    kept_genomes = {
        genome: info
        for genome, info in genomes.items()
        if info[-2] <= nbcont and info[-1] <= l90
    }
    # Write discarded genomes to a file -> orig_name, to_annotate, gsize, nb_conts, L90
    utils.write_genomes_info(genomes, list(kept_genomes.keys()), list_file, res_dir)
    if not kept_genomes:
        logger.info("No genome kept for annotation.")
        return "", 0

    # Info on folder containing original sequences
    if not from_info:
        logger.info(
            f"-> Original sequences folder ('orig_name' column): {db_path} ")
        logger.info(
            f"\t-> If original sequence not found in {db_path}, "
            f"look for it in {tmp_dir}, as it must be a concatenation of several "
            "input sequence files.")
        if cutn == 0:
            logger.info(
                "-> Sequences used for annotation ('to_annotate' column) are the "
                "same as the previous ones (original sequences).")
        else:
            logger.info(
                f"-> Folder with sequence files that will be used for annotation "
                f"('to_annotate' column): {tmp_dir}")

    # If only QC, stop here.
    if qc_only:
        # Write information on genomes that would be annotated with the current
        # parameters if not QC_only:
        # orig_name, to_annotate, gsize, nb_conts, L90
        utils.write_genomes_info(genomes, [], list_file, res_dir, qc=True)
        logger.info("QC only done.")
        return "", 0

    # STEP 3. Rename genomes kept, ordered by decreasing quality
    first_gname = gfunc.rename_all_genomes(kept_genomes)
    # kept_genomes = {genome: [gembase_name, path_to_origfile, path_split_gembase,
    #                          gsize, nbcont, L90]}
    # first_gname = name of the first genome
    # Write lstinfo file (list of genomes kept with info on L90 etc.)
    outlst = utils.write_lstinfo(list_file, kept_genomes, res_dir)

    # STEP 4. Annotate all kept genomes
    results = pfunc.run_annotation_all(kept_genomes, threads, force, res_annot_dir,
                                       first_gname, prodigal_only, small=small,
                                       quiet=quiet)
    # Information on genomes to format
    # results_ok = {genome: [gembase_name, path_to_origfile, path_split_gembase,
    #                        gsize, nbcont, L90]}
    results_ok = {
        genome: info
        for genome, info in kept_genomes.items()
        if results[genome]
    }
    # If no genome was ok, no need to format them. Just print that no genome was
    # annotated, end program.
    if not results_ok:
        logger.error(
            "Error: No genome was correctly annotated, no need to format them."
        )
        sys.exit(1)
    # list of genomes skipped because annotation had problems: no format step run
    skipped = [genome for (genome, ok) in results.items() if not ok]
    # At least 1 genome was not annotated: write a message to warn on it
    if skipped:
        utils.write_warning_skipped(skipped, prodigal_only=prodigal_only,
                                    logfile=logfile_base)

    # STEP 5. Format genomes annotated
    # Here, we have at least 1 genome annotated (otherwise,
    # it would already have stopped because results_ok is empty)
    # Generate database (folders Proteins, Genes, Replicons, LSTINFO), and get the
    # genomes skipped because something went wrong while formatting.
    skipped_format = ffunc.format_genomes(results_ok, res_dir, res_annot_dir,
                                          prodigal_only, threads, quiet=quiet)
    # At least one genome could not be formatted -> warn user
    if skipped_format:
        utils.write_warning_skipped(skipped_format, do_format=True,
                                    prodigal_only=prodigal_only, logfile=logfile_base)
    logger.info("Annotation step done.")
    return outlst, len(kept_genomes) - len(skipped) - len(skipped_format)