def test_write_genome_prt_exists():
    """
    Check ``write_genome_file`` when a prt file is already present in Listdir.

    A prt file holding dummy content is written beforehand. After the call,
    the prt and gen files must both match their expected reference files,
    meaning the pre-existing prt file has been replaced.
    """
    listdir = os.path.join(GENEPATH, "Listdir")
    aldir = os.path.join(GENEPATH, "Aldir")
    # Only Listdir is created on disk; aldir is just handed over as a path
    os.makedirs(listdir)
    dname = "test_write_genome"
    strain = "ESCO4"
    strain_members = ALL_PROTS[strain]

    # Pre-existing prt file with wrong content
    prt_path = os.path.join(listdir, f"{dname}-getEntry_prt_ESCO4.txt")
    with open(prt_path, "w") as handle:
        handle.write("Wrong prt file\n")

    p2p.write_genome_file(listdir, aldir, dname, strain, strain_members, SEVERAL)

    # prt file: content must now be the expected one
    ref_prt = os.path.join(EXPPATH, "exp_getentry-prt-ESCO4_write-prt.txt")
    assert tutil.compare_file_content(prt_path, ref_prt)

    # gen file: must have been generated with the expected content
    gen_path = os.path.join(listdir, f"{dname}-getEntry_gen_ESCO4.txt")
    ref_gen = os.path.join(EXPPATH, "exp_getentry-gen-ESCO4_write-prt.txt")
    assert tutil.compare_file_content(ref_gen, gen_path)
def test_main_qc():
    """
    Test that when only QC is run, it writes:

    - the list of all genomes with their characteristics
    - the list of genomes that would be discarded for annotation
    - the 2 png files

    The call to ``annot.main`` is also expected to return ``("", 0)``.
    """
    list_file = os.path.join(TEST_DIR, "list_genomes-func-test-default.txt")
    name = "ESCO"
    cutn = 0
    l90 = 1
    date = "0417"
    qc_only = True
    assert annot.main("cmd", list_file, GEN_PATH, GENEPATH, name, date, l90=l90,
                      cutn=cutn, qc_only=qc_only) == ("", 0)

    # Check that all 4 output files are here
    lstfile = os.path.join(GENEPATH, "ALL-GENOMES-info-list_genomes-func-test-default.lst")
    exp_lstfile = os.path.join(EXP_DIR, "exp_ALL-GENOMES-QC.lst")
    discardedfile = os.path.join(GENEPATH, "discarded-list_genomes-func-test-default.lst")
    exp_discarded = os.path.join(EXP_DIR, "exp_discarded_QC.lst")
    assert os.path.isfile(lstfile)
    assert os.path.isfile(discardedfile)
    assert os.path.isfile(os.path.join(GENEPATH, "QC_L90-list_genomes-func-test-default.png"))
    assert os.path.isfile(os.path.join(GENEPATH,
                                       "QC_nb-contigs-list_genomes-func-test-default.png"))

    # Check content of both list files (all genomes, and discarded genomes)
    assert tutil.compare_file_content(lstfile, exp_lstfile)
    assert tutil.compare_file_content(discardedfile, exp_discarded)
def test_write_getentry():
    """
    Check ``write_getentry_files`` on the full set of genomes.

    For genomes ESCO1 to ESCO6, the gen and prt getEntry files written in
    Listdir must match the expected reference files.
    """
    listdir = os.path.join(GENEPATH, "Listdir")
    aldir = os.path.join(GENEPATH, "Aldir")
    # Only Listdir is created on disk; aldir is just handed over as a path
    os.makedirs(listdir)
    dname = "TEST6"
    p2p.write_getentry_files(ALL_PROTS, SEVERAL, listdir, aldir, dname, ALL_GENOMES)

    strain_nums = range(1, 7)

    # gen files first (paths are printed so that they show up in the captured output)
    for num in strain_nums:
        reference = os.path.join(EXPPATH, f"exp_getentry-gen-ESCO{num}.txt")
        produced = os.path.join(listdir, f"{dname}-getEntry_gen_ESCO{num}.txt")
        print(reference, produced)
        assert tutil.compare_file_content(reference, produced)

    # then prt files
    for num in strain_nums:
        reference = os.path.join(EXPPATH, f"exp_getentry-prt-ESCO{num}.txt")
        produced = os.path.join(listdir, f"{dname}-getEntry_prt_ESCO{num}.txt")
        assert tutil.compare_file_content(reference, produced)
def test_get_all_seqs_prt6(caplog):
    """
    Check ``get_all_seqs`` when Aldir already holds a prt file for one family.

    => An empty prt file exists for family 6 (no gen file). Others as default.

    After the call, the prt and gen files of families 1 and 6 must exist and
    match the references, so the empty prt file of family 6 has been replaced.
    The expected extraction messages must appear in the logs.
    """
    caplog.set_level(logging.DEBUG)
    all_genomes = [
        "GEN2.1017.00001",
        "GEN4.1111.00001",
        "GENO.1017.00001",
        "GENO.1216.00002",
    ]
    dname = "TESTgetAllSeq"
    listdir = os.path.join(GENEPATH, "Listdir")
    aldir = os.path.join(GENEPATH, "Align")
    all_fams = [1, 6]
    quiet = False

    # Create listdir and aldir
    os.makedirs(listdir)
    os.makedirs(aldir)
    ref_listdir = os.path.join(TESTPATH, "test_listdir")
    ref_aldir = os.path.join(EXPPATH, "exp_aldir")

    # Empty prt file for family 6
    prt6 = os.path.join(aldir, f"{dname}-current.6.prt")
    with open(prt6, "w"):
        pass

    # Put the getentry files (gen, then prt) of every genome in listdir
    for gen in all_genomes:
        for kind in ("gen", "prt"):
            source = os.path.join(ref_listdir, f"getentry-{kind}_{gen}")
            target = os.path.join(listdir, f"{dname}-getEntry_{kind}_{gen}.txt")
            shutil.copyfile(source, target)
    assert os.path.isfile(prt6)

    gseq.get_all_seqs(all_genomes, dname, DBPATH, listdir, aldir, all_fams, quiet)

    # For each family, prt then gen file must exist with the expected content
    for fam in all_fams:
        for ext in ("prt", "gen"):
            produced = os.path.join(aldir, f"{dname}-current.{fam}.{ext}")
            assert os.path.isfile(produced)
            reference = os.path.join(ref_aldir, f"current.{fam}.{ext}")
            assert tutil.compare_file_content(produced, reference)

    # Check logs
    assert "Extracting proteins and genes from all genomes" in caplog.text
    for gen in all_genomes:
        assert f"Extracting proteins and genes from {gen}" in caplog.text
def test_write_getentry_error(caplog):
    """
    Check ``write_getentry_files`` when 2 genomes have no persistent gene.

    ESCO4 and ESCO5 are absent from the proteins given: the call must exit
    (SystemExit) and log an error message for each of them. The gen and prt
    files of the 4 other genomes must still match the references.
    """
    caplog.set_level(logging.DEBUG)
    all_prots = {
        "ESCO1": {"ESCO1_00001": "1", "ESCO1_00002": "4"},
        "ESCO2": {"ESCO2_00001": "1", "ESCO2_22": "2", "ESCO2_456": "4", "ESCO2_46": "3"},
        "ESCO3": {"ESCO3_1": "2", "ESCO3_12": "1", "ESCO3_4564": "3",
                  "ESCO3_00123": "4", "ESCO3_8": "2"},
        "ESCO6": {"ESCO6_1": "4", "ESCO6_2": "3", "ESCO6_3": "1"},
    }
    several = {"1": [], "2": ["ESCO3"], "3": [], "4": []}
    listdir = os.path.join(GENEPATH, "Listdir")
    aldir = os.path.join(GENEPATH, "Aldir")
    # Only Listdir is created on disk; aldir is just handed over as a path
    os.makedirs(listdir)
    dname = "TEST6"

    with pytest.raises(SystemExit):
        p2p.write_getentry_files(all_prots, several, listdir, aldir, dname, ALL_GENOMES)

    # One error message per genome without any protein
    for missing in ("ESCO4", "ESCO5"):
        expected_msg = (f"There is not any protein for genome {missing} in any family! "
                        "The program will close, "
                        "please fix this problem to be able to run the alignments")
        assert expected_msg in caplog.text

    # Genomes having proteins: ESCO1, ESCO2, ESCO3 and ESCO6
    strain_nums = [1, 2, 3, 6]

    # gen files first
    for num in strain_nums:
        reference = os.path.join(EXPPATH, f"exp_getentry-gen-ESCO{num}.txt")
        produced = os.path.join(listdir, f"{dname}-getEntry_gen_ESCO{num}.txt")
        assert tutil.compare_file_content(reference, produced)

    # then prt files
    for num in strain_nums:
        reference = os.path.join(EXPPATH, f"exp_getentry-prt-ESCO{num}.txt")
        produced = os.path.join(listdir, f"{dname}-getEntry_prt_ESCO{num}.txt")
        assert tutil.compare_file_content(reference, produced)
def test_get_all_seqs(caplog):
    """
    Check ``get_all_seqs`` in the default situation.

    => Aldir is empty, getentry files of all genomes are in Listdir, and
    2 families (1 and 6) are requested.

    After the call, the prt and gen files of each family must exist and match
    the references, and the expected extraction messages must be logged.
    """
    caplog.set_level(logging.DEBUG)
    all_genomes = [
        "GEN2.1017.00001",
        "GEN4.1111.00001",
        "GENO.1017.00001",
        "GENO.1216.00002",
    ]
    dname = "TESTgetAllSeq"
    listdir = os.path.join(GENEPATH, "Listdir")
    aldir = os.path.join(GENEPATH, "Align")
    all_fams = [1, 6]
    quiet = False

    # Create listdir and aldir
    os.makedirs(listdir)
    os.makedirs(aldir)
    ref_listdir = os.path.join(TESTPATH, "test_listdir")
    ref_aldir = os.path.join(EXPPATH, "exp_aldir")

    # Put the getentry files (gen, then prt) of every genome in listdir
    for gen in all_genomes:
        for kind in ("gen", "prt"):
            source = os.path.join(ref_listdir, f"getentry-{kind}_{gen}")
            target = os.path.join(listdir, f"{dname}-getEntry_{kind}_{gen}.txt")
            shutil.copyfile(source, target)

    gseq.get_all_seqs(all_genomes, dname, DBPATH, listdir, aldir, all_fams, quiet)

    # For each family, prt then gen file must exist with the expected content
    for fam in all_fams:
        for ext in ("prt", "gen"):
            produced = os.path.join(aldir, f"{dname}-current.{fam}.{ext}")
            assert os.path.isfile(produced)
            reference = os.path.join(ref_aldir, f"current.{fam}.{ext}")
            assert tutil.compare_file_content(produced, reference)

    # Check logs
    assert "Extracting proteins and genes from all genomes" in caplog.text
    for gen in all_genomes:
        assert f"Extracting proteins and genes from {gen}" in caplog.text
def test_extract_seq_out_different():
    """
    Check ``extract_sequences`` with one output file associated to each sequence.

    3 sequences are requested, spread over 2 output files, and both files are
    listed in 'files_todo'. Each output file must match its reference.
    """
    first_out = os.path.join(GENEPATH, "test_extract1.prt")
    second_out = os.path.join(GENEPATH, "test_extract2.prt")
    # sequence name -> file in which it must be written
    to_extract = {
        "GEN2.1017.00001.b0001_00001": first_out,
        "GEN2.1017.00001.i0003_00008": second_out,
        "GEN2.1017.00001.b0004_00013": first_out,
    }
    with open(FASTA, "r") as fasta_handle:
        gseq.extract_sequences(to_extract, fasta_handle, files_todo=[first_out, second_out])

    first_ref = os.path.join(EXPPATH, "exp_extracted1.prt")
    second_ref = os.path.join(EXPPATH, "exp_extracted2.prt")
    assert tutil.compare_file_content(first_out, first_ref)
    assert tutil.compare_file_content(second_out, second_ref)
def test_get_genome_seqs_outgiven_1col():
    """
    Check ``get_genome_seqs`` with a 1-column tab file and a given output file.

    The tab file only lists the sequences to extract (no filename), and
    'files_todo' is empty: the output file must exist and match the reference.
    """
    tabfile = os.path.join(TESTPATH, "getentry_all_1column.txt")
    outfile = os.path.join(GENEPATH, "fileout.txt")
    no_files_todo = []
    gseq.get_genome_seqs(FASTA, tabfile, no_files_todo, outfile)

    assert os.path.isfile(outfile)
    reference = os.path.join(EXPPATH, "exp_extracted.prt")
    assert tutil.compare_file_content(outfile, reference)
def test_write_genome():
    """
    Check ``write_genome_file`` for one genome (ESCO4), starting from an empty
    Listdir: the prt and gen files written must match the reference files.
    """
    listdir = os.path.join(GENEPATH, "Listdir")
    aldir = os.path.join(GENEPATH, "Aldir")
    # Only Listdir is created on disk; aldir is just handed over as a path
    os.makedirs(listdir)
    dname = "test_write_genome"
    strain = "ESCO4"
    strain_members = ALL_PROTS[strain]

    p2p.write_genome_file(listdir, aldir, dname, strain, strain_members, SEVERAL)

    # prt file
    prt_path = os.path.join(listdir, f"{dname}-getEntry_prt_ESCO4.txt")
    ref_prt = os.path.join(EXPPATH, "exp_getentry-prt-ESCO4_write-prt.txt")
    assert tutil.compare_file_content(prt_path, ref_prt)

    # gen file
    gen_path = os.path.join(listdir, f"{dname}-getEntry_gen_ESCO4.txt")
    ref_gen = os.path.join(EXPPATH, "exp_getentry-gen-ESCO4_write-prt.txt")
    assert tutil.compare_file_content(gen_path, ref_gen)
def test_get_genome_seqs_outgiven_2cols():
    """
    Check ``get_genome_seqs`` with a 2-column tab file and a given output file.

    The tab file associates a filename to each sequence, but an output file is
    given: that single output file must exist and match the reference holding
    all the sequences. The output file is removed at the end of the test.
    """
    tabfile = os.path.join(TESTPATH, "getentry_all_2columns.txt")
    outfile = os.path.join(GENEPATH, "fileout.txt")
    no_files_todo = []
    gseq.get_genome_seqs(FASTA, tabfile, no_files_todo, outfile)

    assert os.path.isfile(outfile)
    reference = os.path.join(EXPPATH, "exp_extracted.prt")
    assert tutil.compare_file_content(outfile, reference)
    os.remove(outfile)
def test_get_genome_all_seqs():
    """
    Check ``get_genome_seqs`` with a 2-column tab file and no output file given.

    Both files named in 'files_todo' must exist after the call, and each one
    must match its own reference file.
    """
    tabfile = os.path.join(TESTPATH, "getentry_all_2columns.txt")
    todo = [os.path.join(GENEPATH, fname) for fname in ("file1.txt", "file2.txt")]
    gseq.get_genome_seqs(FASTA, tabfile, todo)

    # file<idx>.txt is compared with exp_extracted<idx>.prt
    for idx, outfile in enumerate(todo, start=1):
        reference = os.path.join(EXPPATH, f"exp_extracted{idx}.prt")
        assert os.path.isfile(outfile)
        assert tutil.compare_file_content(outfile, reference)
def test_get_genome_seqs_1notasked():
    """
    Check ``get_genome_seqs`` when only 1 of the 2 files named in the tab file
    is in 'files_todo': that file must exist and match its reference, and the
    other file must not be created.
    """
    tabfile = os.path.join(TESTPATH, "getentry_all_2columns.txt")
    wanted = os.path.join(GENEPATH, "file1.txt")
    not_wanted = os.path.join(GENEPATH, "file2.txt")
    gseq.get_genome_seqs(FASTA, tabfile, [wanted])

    # Requested file: created, with expected content
    assert os.path.isfile(wanted)
    reference = os.path.join(EXPPATH, "exp_extracted1.prt")
    assert tutil.compare_file_content(wanted, reference)
    # File not requested: not created
    assert not os.path.isfile(not_wanted)
def test_extract_seq_out_given():
    """
    Check ``extract_sequences`` when given an open fasta file, a list of
    3 sequence names, and an open output file: once both files are closed,
    the output file must match the reference.
    """
    wanted_seqs = [
        "GEN2.1017.00001.b0001_00001",
        "GEN2.1017.00001.i0003_00008",
        "GEN2.1017.00001.b0004_00013",
    ]
    outfile = os.path.join(GENEPATH, "test_extract_out-given.prt")
    with open(FASTA, "r") as fasta_handle, open(outfile, "w") as out_handle:
        gseq.extract_sequences(wanted_seqs, fasta_handle, outf=out_handle)

    reference = os.path.join(EXPPATH, "exp_extracted.prt")
    assert tutil.compare_file_content(outfile, reference)
def test_extract_seq_out_allsame():
    """
    Check ``extract_sequences`` when the 3 sequences to extract are all
    associated to the same output file, which is listed in 'files_todo':
    that file must match the reference holding the 3 sequences.
    """
    single_out = os.path.join(GENEPATH, "test_extract1.prt")
    seq_names = (
        "GEN2.1017.00001.b0001_00001",
        "GEN2.1017.00001.i0003_00008",
        "GEN2.1017.00001.b0004_00013",
    )
    # Every sequence goes to the same file
    to_extract = {seq: single_out for seq in seq_names}
    with open(FASTA, "r") as fasta_handle:
        gseq.extract_sequences(to_extract, fasta_handle, files_todo=[single_out])

    reference = os.path.join(EXPPATH, "exp_extracted.prt")
    assert tutil.compare_file_content(single_out, reference)
def test_extract_seq_out_different_notasked():
    """
    Check ``extract_sequences`` when sequences are spread over 2 output files
    but only the first one is in 'files_todo': the first file must match its
    reference, and the second file must not be created.
    """
    wanted = os.path.join(GENEPATH, "test_extract1.prt")
    not_wanted = os.path.join(GENEPATH, "test_extract2.prt")
    # sequence name -> file in which it must be written
    to_extract = {
        "GEN2.1017.00001.b0001_00001": wanted,
        "GEN2.1017.00001.i0003_00008": not_wanted,
        "GEN2.1017.00001.b0004_00013": wanted,
    }
    with open(FASTA, "r") as fasta_handle:
        gseq.extract_sequences(to_extract, fasta_handle, files_todo=[wanted])

    reference = os.path.join(EXPPATH, "exp_extracted1.prt")
    assert tutil.compare_file_content(wanted, reference)
    assert not os.path.isfile(not_wanted)
def test_compare_all(caplog):
    """
    Check ``compare_all`` on an existing sketch file: the matrix and mash log
    files must be created, and the matrix must match the expected one.

    The caplog fixture is requested but not read in this test.
    """
    test_files = os.path.join(DATA_TEST_DIR, "test_files")
    out_msh = os.path.join(test_files, "test_mash_output")
    matrix = os.path.join(GENEPATH, "matrix_from_test_compare_all.txt")
    mash_log = os.path.join(GENEPATH, "mashlog_from_test_compare_all.log")
    threads = 1

    # The msh file used as input must exist
    assert os.path.isfile(out_msh + ".msh")

    filterg.compare_all(out_msh, matrix, "matrix", mash_log, threads)

    # Output files are created
    for created in (matrix, mash_log):
        assert os.path.isfile(created)

    # Content of matrix file
    expect_matrix = os.path.join(test_files, "test_matrix_mash.txt")
    assert tutil.compare_file_content(matrix, expect_matrix)
def test_get_all_seqs_prtgen6(caplog):
    """
    Check ``get_all_seqs`` when Aldir already holds a prt and a gen file for
    one family.

    => Empty prt, gen, mafft and prt2nuc files exist for family 6, plus an
    empty concatenate file. Others as default.

    After the call:

    - prt and gen files of family 1 exist and match the references
    - the 4 files of family 6 are still present, and still empty
    - the concatenate file has been removed
    - the expected extraction messages are in the logs
    """
    caplog.set_level(logging.DEBUG)
    all_genomes = [
        "GEN2.1017.00001",
        "GEN4.1111.00001",
        "GENO.1017.00001",
        "GENO.1216.00002",
    ]
    dname = "TESTgetAllSeq"
    listdir = os.path.join(GENEPATH, "Listdir")
    aldir = os.path.join(GENEPATH, "Align")
    all_fams = [1, 6]
    quiet = False

    # Create listdir and aldir
    os.makedirs(listdir)
    os.makedirs(aldir)
    ref_listdir = os.path.join(TESTPATH, "test_listdir")
    ref_aldir = os.path.join(EXPPATH, "exp_aldir")

    # Empty prt, gen, mafft and prt2nuc files for family 6, and concatenate file
    prt6 = os.path.join(aldir, f"{dname}-current.6.prt")
    gen6 = os.path.join(aldir, f"{dname}-current.6.gen")
    mafft6 = os.path.join(aldir, f"{dname}-mafft-align.6.aln")
    prt2nuc6 = os.path.join(aldir, f"{dname}-mafft-prt2nuc.6.aln")
    concat = os.path.join(aldir, f"{dname}-complete.cat.aln")
    fam6_files = [prt6, gen6, mafft6, prt2nuc6]
    for placeholder in fam6_files + [concat]:
        with open(placeholder, "w"):
            pass

    # Put the getentry files (gen, then prt) of every genome in listdir
    for gen in all_genomes:
        for kind in ("gen", "prt"):
            source = os.path.join(ref_listdir, f"getentry-{kind}_{gen}")
            target = os.path.join(listdir, f"{dname}-getEntry_{kind}_{gen}.txt")
            shutil.copyfile(source, target)

    gseq.get_all_seqs(all_genomes, dname, DBPATH, listdir, aldir, all_fams, quiet)

    # Family 1: prt then gen file must exist with the expected content
    for ext in ("prt", "gen"):
        produced = os.path.join(aldir, f"{dname}-current.1.{ext}")
        assert os.path.isfile(produced)
        reference = os.path.join(ref_aldir, f"current.1.{ext}")
        assert tutil.compare_file_content(produced, reference)

    # Family 6: all files are present and empty
    for kept in fam6_files:
        assert os.path.isfile(kept)
        with open(kept, "r") as handle:
            assert handle.readlines() == []

    # Concatenate file was removed
    assert not os.path.isfile(concat)

    # Check logs
    assert "Extracting proteins and genes from all genomes" in caplog.text
    for gen in all_genomes:
        assert f"Extracting proteins and genes from {gen}" in caplog.text