def test_read_pangenome_filewrong(caplog):
    """
    A file that cannot be parsed as a pangenome makes read_pangenome exit:
    the error is logged and no binary file is written next to it.
    """
    caplog.set_level(logging.INFO)
    logger = logging.getLogger("test_pan")
    # Write a file whose content is not a valid pangenome
    bad_pan = os.path.join(GENEPATH, "Pangenome.lst")
    with open(bad_pan, "w") as handle:
        handle.write("I'm not a good pangenome file")
    with pytest.raises(SystemExit):
        upan.read_pangenome(bad_pan, logger)
    for expected_msg in ("Reading and getting information from pangenome file",
                         "Error in pangenome file. No family found"):
        assert expected_msg in caplog.text
    assert not os.path.isfile(bad_pan + ".bin")
def post_treat(families, pangenome):
    """
    Post-treat a pangenome: read it, then hand the result to
    ``open_outputs_to_write``.

    From clusters = {num: [members]}, create:

    - a pan_quali matrix (lines = families, columns = genomes, 1 if genome
      present in family, 0 otherwise)
    - a pan_quanti matrix (lines = families, columns = genomes, number of
      members from given genome in the given family)
    - a summary file: lines = families. For each family:

        - nb_members: total number of members
        - sum_quanti: should be the same as nb_members!
        - sum_quali: number of different genomes in family
        - nb_0: number of missing genomes in family
        - nb_mono: number of genomes with exactly 1 member
        - nb_multi: number of genomes with more than 1 member
        - sum_0-mono-multi: should be equal to the total number of genomes
          in dataset
        - max_multi: maximum number of members from 1 genome

    Parameters
    ----------
    families : dict
        {num_fam: [list of members]}. Can be None, and then they will be
        retrieved from the pangenome file
    pangenome : str
        file containing pangenome
    """
    # read_pangenome also returns the families, so a None input is replaced here
    pan_info = utilsp.read_pangenome(pangenome, logger, families)
    strain_fams, families, strains = pan_info
    open_outputs_to_write(strain_fams, families, strains, pangenome)
def test_read_pangenome_fams(caplog):
    """
    When families are provided together with a pangenome filename, strain
    information is extracted directly from the families: the pangenome file
    itself does not need to exist, and a binary file is created.
    """
    caplog.set_level(logging.DEBUG)
    logger = logging.getLogger("test_pan")
    # This file is never created: only its name is used
    panfile = os.path.join(GENEPATH, "toto.txt")
    result = upan.read_pangenome(panfile, logger, families=FAMILIES)
    fams_by_strain, families, all_strains = result
    assert fams_by_strain == FAMS_BY_STRAIN
    assert families == FAMILIES
    assert all_strains == ALL_STRAINS
    for expected_msg in ("Retrieving information from pan families",
                         "Saving all information to a binary file for later use"):
        assert expected_msg in caplog.text
    assert os.path.isfile(panfile + ".bin")
def test_read_pangenome_filetxt(caplog):
    """
    Reading a pangenome text file returns all expected families, and a binary
    version of the pangenome is saved next to it.
    """
    caplog.set_level(logging.INFO)
    logger = logging.getLogger("test_pan")
    # Work on a copy placed in the folder dedicated to test-generated files,
    # as the binary version will be written next to the pangenome file
    pan_copy = os.path.join(GENEPATH, "Pangenome.lst")
    shutil.copyfile(PAN_FILE, pan_copy)
    result = upan.read_pangenome(pan_copy, logger)
    fams_by_strain, families, all_strains = result
    assert fams_by_strain == FAMS_BY_STRAIN
    assert families == FAMILIES
    assert all_strains == ALL_STRAINS
    for expected_msg in ("Reading and getting information from pangenome file",
                         "Saving all information to a binary file for later use"):
        assert expected_msg in caplog.text
    assert os.path.isfile(pan_copy + ".bin")
def test_read_pangenome_filebin(caplog):
    """
    When only a pangenome filename is given and the matching '.bin' file
    already exists, the binary file is read and the expected objects are
    returned.
    """
    caplog.set_level(logging.INFO)
    logger = logging.getLogger("test_pan")
    # Put both the pangenome and its binary version in the folder dedicated
    # to test-generated files
    pan_copy = os.path.join(GENEPATH, "Pangenome.lst")
    bin_copy = os.path.join(GENEPATH, "Pangenome.lst.bin")
    reference_bin = os.path.join(PAN_TEST, "pangenome.bin")
    shutil.copyfile(PAN_FILE, pan_copy)
    shutil.copyfile(reference_bin, bin_copy)
    result = upan.read_pangenome(pan_copy, logger)
    fams_by_strain, families, all_strains = result
    assert fams_by_strain == FAMS_BY_STRAIN
    assert families == FAMILIES
    assert all_strains == ALL_STRAINS
    assert "Retrieving info from binary file" in caplog.text
def test_read_pangenome_fams_binok(caplog):
    """
    When families are provided together with a pangenome filename, strain
    information is extracted directly from the families: the pangenome file
    does not need to exist. Here the '.bin' file already exists (whatever its
    content), so it must not be recreated.
    """
    caplog.set_level(logging.INFO)
    logger = logging.getLogger("test_pan")
    panfile = os.path.join(GENEPATH, "toto.txt")
    binfile = panfile + ".bin"
    # Create an empty binary pangenome file
    open(binfile, "w").close()
    result = upan.read_pangenome(panfile, logger, FAMILIES)
    fams_by_strain, families, all_strains = result
    assert fams_by_strain == FAMS_BY_STRAIN
    assert families == FAMILIES
    assert all_strains == ALL_STRAINS
    # The pre-existing binary file must still be empty
    with open(binfile, "r") as bin_handle:
        content = bin_handle.readlines()
    assert content == []
    assert "Retrieving information from pan families" in caplog.text
    assert os.path.isfile(binfile)
    with open(binfile, "r") as bin_handle:
        assert bin_handle.readlines() == []
def test_read_pangenome_filebin_strfamnum(caplog):
    """
    When only a pangenome filename is given and the matching '.bin' file
    already exists, the binary file is read. Here the binary file holds
    families whose identifiers are strings, and they are returned as such.
    """
    caplog.set_level(logging.INFO)
    logger = logging.getLogger("test_pan")
    # Put both the pangenome and the binary file to read in the folder
    # dedicated to test-generated files
    pan_copy = os.path.join(GENEPATH, "Pangenome.lst")
    bin_copy = os.path.join(GENEPATH, "Pangenome.lst.bin")
    reference_bin = os.path.join(PAN_TEST, "pangenome-strfamnum.bin")
    shutil.copyfile(PAN_FILE, pan_copy)
    shutil.copyfile(reference_bin, bin_copy)

    expected_fams_by_strain = {
        "family1": {"gene1": ["gene1_5"],
                    "gene2": ["gene2_6"],
                    "gene3": ["gene3_8"]},
        "2": {"gene1": ["gene1_", "gene1_toto"],
              "gene3": ["gene3_5"]},
    }
    expected_families = {
        "family1": ["gene1_5", "gene2_6", "gene3_8"],
        "2": ["gene1_", "gene3_5", "gene1_toto"],
    }
    expected_strains = ["gene1", "gene2", "gene3"]

    result = upan.read_pangenome(pan_copy, logger)
    fams_by_strain, families, all_strains = result
    assert fams_by_strain == expected_fams_by_strain
    assert families == expected_families
    assert all_strains == expected_strains
    assert "Retrieving info from binary file" in caplog.text
def test_read_pangenome_strfamnum(caplog):
    """
    A pangenome text file whose family identifiers are strings ("family1",
    "2") is read as expected, and its binary version is saved.
    """
    caplog.set_level(logging.INFO)
    logger = logging.getLogger("test_pan")
    # Write a small pangenome file: one line per family, identifier first
    panfile = os.path.join(GENEPATH, "panfile_wrong")
    with open(panfile, "w") as handle:
        handle.write("family1 gene1_5 gene2_6 gene3_8\n")
        handle.write("2 gene1_ gene3_5 gene1_toto")

    expected_fams_by_strain = {
        "family1": {"gene1": ["gene1_5"],
                    "gene2": ["gene2_6"],
                    "gene3": ["gene3_8"]},
        "2": {"gene1": ["gene1_", "gene1_toto"],
              "gene3": ["gene3_5"]},
    }
    expected_families = {
        "family1": ["gene1_5", "gene2_6", "gene3_8"],
        "2": ["gene1_", "gene3_5", "gene1_toto"],
    }
    expected_strains = ["gene1", "gene2", "gene3"]

    result = upan.read_pangenome(panfile, logger)
    fams_by_strain, families, all_strains = result
    assert fams_by_strain == expected_fams_by_strain
    assert families == expected_families
    assert all_strains == expected_strains
    for expected_msg in ("Reading and getting information from pangenome file",
                         "Saving all information to a binary file for later use"):
        assert expected_msg in caplog.text
    assert os.path.isfile(panfile + ".bin")
def main(cmd, pangenome, tol, multi, mixed, outputdir, lstinfo_file, floor, verbose, quiet):
    """
    Read pangenome and deduce Persistent genome according to the user criteria

    Parameters
    ----------
    cmd : str
        command line used to launch this step (written to the log)
    pangenome : str
        file containing pangenome
    tol : float
        min % of genomes present in a family to consider it as persistent (between 0 and 1)
    multi : bool
        True if multigenic families are allowed, False otherwise
    mixed : bool
        True if mixed families are allowed, False otherwise
    outputdir : str or None
        Specific directory for the generated persistent genome. If not given, pangenome
        directory is used.
    lstinfo_file : str
        list of genomes to include in the core/persistent genome. If not given, include all
        genomes of pan
    floor : bool
        Require at least floor(nb_genomes*tol) genomes if True, ceil(nb_genomes*tol) if False
    verbose : int
        verbosity:

        - default 0 : stdout contains INFO, stderr contains ERROR.
        - 1: stdout contains INFO, stderr contains WARNING and ERROR
        - 2: stdout contains (DEBUG), DETAIL and INFO, stderr contains WARNING and ERROR
        - >=15: Add DEBUG in stdout
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise

    Returns
    -------
    str
        path to the file in which the persistent genome is written
    """
    # import needed packages
    import logging
    from PanACoTA import utils
    from PanACoTA import utils_pangenome as utilsp
    import PanACoTA.corepers_module.persistent_functions as pers
    from PanACoTA import __version__ as version

    # get pangenome name info
    path_pan, base_pan = os.path.split(pangenome)
    if lstinfo_file:
        _, base_lst = os.path.split(lstinfo_file)
    else:
        base_lst = "all"

    # Define output filename
    output_name = f"PersGenome_{base_pan}-{base_lst}_"
    if floor:
        output_name += "F"
    output_name += str(tol)
    if multi:
        output_name += "-multi.lst"
    elif mixed:
        output_name += "-mixed.lst"
    else:
        output_name += ".lst"

    # Define output directory and filename path.
    # As documented, fall back to the pangenome directory when no output
    # directory is given ("." when the pangenome path has no directory part).
    if not outputdir:
        outputdir = path_pan or "."
    # exist_ok avoids failing if the directory appears between check and creation
    os.makedirs(outputdir, exist_ok=True)
    outputfile = os.path.join(outputdir, output_name)
    logfile_base = os.path.join(outputdir, "PanACoTA-corepers")

    # level is the minimum level that will be considered.
    if verbose <= 1:
        # for verbose = 0 or 1, ignore details and debug, start from info
        level = logging.INFO
    elif verbose < 15:
        # for verbose = 2 (up to 14), ignore only debug
        level = 15  # int corresponding to detail level
    else:
        # for verbose >= 15, write everything
        level = logging.DEBUG
    utils.init_logger(logfile_base, level, 'corepers', verbose=verbose, quiet=quiet)
    logger = logging.getLogger("corepers")
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)
    logger.info(get_info(tol, multi, mixed, floor))

    # Read pangenome
    fams_by_strain, families, all_strains = utilsp.read_pangenome(pangenome, logger)
    # If list of genomes given, get subset of previous dicts, including only the genomes asked
    if lstinfo_file:
        fams_by_strain, families, all_strains = pers.get_subset_genomes(
            fams_by_strain, families, lstinfo_file)
    # Generate persistent genome
    fams = pers.get_pers(fams_by_strain, families, len(all_strains), tol, multi, mixed, floor)
    # Write persistent genome to file
    pers.write_persistent(fams, outputfile)
    logger.info("Persistent genome step done.")
    return outputfile