Пример #1
0
def test_read_pangenome_filewrong(caplog):
    """
    A pangenome file with unusable content must make ``read_pangenome`` exit.

    Expected: a ``SystemExit``, the "reading" message followed by the "no family found" error
    in the log, and no ``.bin`` file created next to the pangenome file.
    """
    caplog.set_level(logging.INFO)
    test_logger = logging.getLogger("test_pan")
    # Write a file whose content read_pangenome is expected to reject
    bad_pan = os.path.join(GENEPATH, "Pangenome.lst")
    with open(bad_pan, "w") as handle:
        handle.write("I'm not a good pangenome file")
    with pytest.raises(SystemExit):
        upan.read_pangenome(bad_pan, test_logger)
    captured = caplog.text
    assert "Reading and getting information from pangenome file" in captured
    assert "Error in pangenome file. No family found" in captured
    # Nothing must have been saved for an invalid pangenome
    bin_file = bad_pan + ".bin"
    assert not os.path.isfile(bin_file)
Пример #2
0
def post_treat(families, pangenome):
    """
    Get strain information for a pangenome and write the post-treatment outputs.

    The information is obtained through ``utilsp.read_pangenome`` and then handed to
    ``open_outputs_to_write``. From families = {num: [members]}, the outputs are:

    - a pan_quali matrix: lines = families, columns = genomes, 1 if the genome is present in
      the family, 0 otherwise
    - a pan_quanti matrix: lines = families, columns = genomes, number of members of the given
      genome in the given family
    - a summary file with one line per family, containing:

        - nb_members: total number of members
        - sum_quanti: should be the same as nb_members!
        - sum_quali: number of different genomes in family
        - nb_0: number of missing genomes in family
        - nb_mono: number of genomes with exactly 1 member
        - nb_multi: number of genomes with more than 1 member
        - sum_0-mono-multi: should be equal to the total number of genomes in dataset
        - max_multi: maximum number of members from 1 genome

    Parameters
    ----------
    families : dict
        {num_fam: [list of members]}. Can be None, in which case families are retrieved from
        the pangenome file
    pangenome : str
        file containing pangenome
    """
    pan_info = utilsp.read_pangenome(pangenome, logger, families)
    # Families are taken from the returned value, so that they are defined even if None was given
    fams_by_strain, families, all_strains = pan_info
    open_outputs_to_write(fams_by_strain, families, all_strains, pangenome)
Пример #3
0
def test_read_pangenome_fams(caplog):
    """
    When families are given together with the pangenome filename, strain information is
    extracted from the families.

    The pangenome file is never created by this test, so it does not need to exist. A ``.bin``
    file is expected to be written next to the given path.
    """
    caplog.set_level(logging.DEBUG)
    test_logger = logging.getLogger("test_pan")
    pan_path = os.path.join(GENEPATH, "toto.txt")
    result = upan.read_pangenome(pan_path, test_logger, families=FAMILIES)
    by_strain, families, strains = result
    assert by_strain == FAMS_BY_STRAIN
    assert families == FAMILIES
    assert strains == ALL_STRAINS
    captured = caplog.text
    assert "Retrieving information from pan families" in captured
    assert "Saving all information to a binary file for later use" in captured
    bin_path = pan_path + ".bin"
    assert os.path.isfile(bin_path)
Пример #4
0
def test_read_pangenome_filetxt(caplog):
    """
    Reading a text pangenome file (no families given) returns the expected families, families
    by strain and strains, and saves a ``.bin`` file next to the pangenome file.
    """
    caplog.set_level(logging.INFO)
    test_logger = logging.getLogger("test_pan")
    # Work on a copy located in the folder for generated files: the bin file is written
    # next to the pangenome file
    pan_copy = os.path.join(GENEPATH, "Pangenome.lst")
    shutil.copyfile(PAN_FILE, pan_copy)
    by_strain, families, strains = upan.read_pangenome(pan_copy, test_logger)
    assert by_strain == FAMS_BY_STRAIN
    assert families == FAMILIES
    assert strains == ALL_STRAINS
    captured = caplog.text
    assert "Reading and getting information from pangenome file" in captured
    assert "Saving all information to a binary file for later use" in captured
    bin_path = pan_copy + ".bin"
    assert os.path.isfile(bin_path)
Пример #5
0
def test_read_pangenome_filebin(caplog):
    """
    When only the pangenome filename is given and the corresponding ``.bin`` file already
    exists, information is retrieved from the binary file and the expected objects are
    returned.
    """
    caplog.set_level(logging.INFO)
    test_logger = logging.getLogger("test_pan")
    # Put the text pangenome and a ready-made binary version side by side in the folder
    # for files generated by tests
    pan_copy = os.path.join(GENEPATH, "Pangenome.lst")
    bin_copy = os.path.join(GENEPATH, "Pangenome.lst.bin")
    bin_fixture = os.path.join(PAN_TEST, "pangenome.bin")
    shutil.copyfile(PAN_FILE, pan_copy)
    shutil.copyfile(bin_fixture, bin_copy)
    result = upan.read_pangenome(pan_copy, test_logger)
    by_strain, families, strains = result
    assert by_strain == FAMS_BY_STRAIN
    assert families == FAMILIES
    assert strains == ALL_STRAINS
    assert "Retrieving info from binary file" in caplog.text
Пример #6
0
def test_read_pangenome_fams_binok(caplog):
    """
    Test that when giving a pangenome file, and families, it directly extracts strain information
    from the families: pangenome file does not need to exist. However, the pangenome.bin file
    already exists (whatever its content), and is then not recreated.

    The pre-existing bin file is created empty, so "not recreated" is checked by verifying
    that it still exists and is still empty after the call.
    """
    caplog.set_level(logging.INFO)
    logger = logging.getLogger("test_pan")
    panfile = os.path.join(GENEPATH, "toto.txt")
    panbin = panfile + ".bin"
    # Create the bin pangenome file (which is empty)
    open(panbin, "w").close()
    fbs, fams, ass = upan.read_pangenome(panfile, logger, FAMILIES)
    assert fbs == FAMS_BY_STRAIN
    assert fams == FAMILIES
    assert ass == ALL_STRAINS
    assert "Retrieving information from pan families" in caplog.text
    # Check existence before opening the file, so that a missing file fails on this assert
    assert os.path.isfile(panbin)
    # Still empty: the existing bin file has not been rewritten
    with open(panbin, "r") as pfb:
        assert pfb.readlines() == []
Пример #7
0
def test_read_pangenome_filebin_strfamnum(caplog):
    """
    Read an already existing ``.bin`` file in which family identifiers are strings.

    Only the pangenome filename is given. The binary fixture ``pangenome-strfamnum.bin`` is
    placed next to it, and the returned dicts are expected to be keyed by the strings
    "family1" and "2".
    """
    caplog.set_level(logging.INFO)
    test_logger = logging.getLogger("test_pan")
    # Put the text pangenome and the binary fixture side by side in the folder for files
    # generated by tests
    pan_copy = os.path.join(GENEPATH, "Pangenome.lst")
    bin_copy = os.path.join(GENEPATH, "Pangenome.lst.bin")
    bin_fixture = os.path.join(PAN_TEST, "pangenome-strfamnum.bin")
    shutil.copyfile(PAN_FILE, pan_copy)
    shutil.copyfile(bin_fixture, bin_copy)
    expected_by_strain = {
        "family1": {"gene1": ["gene1_5"],
                    "gene2": ["gene2_6"],
                    "gene3": ["gene3_8"]},
        "2": {"gene1": ["gene1_", "gene1_toto"],
              "gene3": ["gene3_5"]},
    }
    expected_families = {
        "family1": ["gene1_5", "gene2_6", "gene3_8"],
        "2": ["gene1_", "gene3_5", "gene1_toto"],
    }
    by_strain, families, strains = upan.read_pangenome(pan_copy, test_logger)
    assert by_strain == expected_by_strain
    assert families == expected_families
    assert strains == ["gene1", "gene2", "gene3"]
    assert "Retrieving info from binary file" in caplog.text
Пример #8
0
def test_read_pangenome_strfamnum(caplog):
    """
    Read a text pangenome file whose family identifiers are not all numbers.

    The file written here has one family called "family1" and one called "2". The read is
    expected to succeed, with both identifiers used as string keys in the returned dicts,
    and a ``.bin`` file saved next to the pangenome file.
    """
    caplog.set_level(logging.INFO)
    test_logger = logging.getLogger("test_pan")
    # Write a small pangenome file: one family per line, identifier first, then its members
    pan_path = os.path.join(GENEPATH, "panfile_wrong")
    pan_lines = ["family1 gene1_5 gene2_6 gene3_8\n",
                 "2 gene1_ gene3_5 gene1_toto"]
    with open(pan_path, "w") as handle:
        handle.writelines(pan_lines)
    expected_by_strain = {
        "family1": {"gene1": ["gene1_5"],
                    "gene2": ["gene2_6"],
                    "gene3": ["gene3_8"]},
        "2": {"gene1": ["gene1_", "gene1_toto"],
              "gene3": ["gene3_5"]},
    }
    expected_families = {
        "family1": ["gene1_5", "gene2_6", "gene3_8"],
        "2": ["gene1_", "gene3_5", "gene1_toto"],
    }
    by_strain, families, strains = upan.read_pangenome(pan_path, test_logger)
    assert by_strain == expected_by_strain
    assert families == expected_families
    assert strains == ["gene1", "gene2", "gene3"]
    captured = caplog.text
    assert "Reading and getting information from pangenome file" in captured
    assert "Saving all information to a binary file for later use" in captured
    assert os.path.isfile(pan_path + ".bin")
Пример #9
0
def main(cmd, pangenome, tol, multi, mixed, outputdir, lstinfo_file, floor,
         verbose, quiet):
    """
    Read pangenome and deduce Persistent genome according to the user criteria

    Parameters
    ----------
    cmd : str
        command line used to launch this step (only written to the log)
    pangenome : str
        file containing pangenome
    tol : float
        min % of genomes present in a family to consider it as persistent (between 0 and 1)
    multi : bool
        True if multigenic families are allowed, False otherwise
    mixed : bool
        True if mixed families are allowed, False otherwise
    outputdir : str or None
        Specific directory for the generated persistent genome. If not given, pangenome directory
        is used. It is created if it does not exist yet.
    lstinfo_file : str
        list of genomes to include in the core/persistent genome. If not given, include all
        genomes of pan
    floor : bool
        Require at least floor(nb_genomes*tol) genomes if True, ceil(nb_genomes*tol) if False
    verbose : int
        verbosity:
        - default 0 : stdout contains INFO, stderr contains ERROR.
        - 1: stdout contains INFO, stderr contains WARNING and ERROR
        - 2: stdout contains (DEBUG), DETAIL and INFO, stderr contains WARNING and ERROR
        - >=15: Add DEBUG in stdout
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise

    Returns
    -------
    str
        path to the file in which the persistent genome has been written
    """
    # import needed packages
    import logging
    from PanACoTA import utils
    from PanACoTA import utils_pangenome as utilsp
    import PanACoTA.corepers_module.persistent_functions as pers
    from PanACoTA import __version__ as version

    # get pangenome name info
    pan_dir, base_pan = os.path.split(pangenome)
    base_lst = os.path.basename(lstinfo_file) if lstinfo_file else "all"
    # Define output filename
    output_name = f"PersGenome_{base_pan}-{base_lst}_"
    if floor:
        output_name += "F"
    output_name += str(tol)
    if multi:
        output_name += "-multi.lst"
    elif mixed:
        output_name += "-mixed.lst"
    else:
        output_name += ".lst"
    # Define output directory and filename path
    if not outputdir:
        # No output directory given: use the one containing the pangenome (current directory
        # when the pangenome is given as a bare filename)
        outputdir = pan_dir or os.curdir
    # exist_ok avoids failing if the directory appears between a check and its creation
    os.makedirs(outputdir, exist_ok=True)
    outputfile = os.path.join(outputdir, output_name)
    logfile_base = os.path.join(outputdir, "PanACoTA-corepers")
    # level is the minimum level that will be considered.
    if verbose <= 1:
        # for verbose = 0 or 1, ignore details and debug, start from info
        level = logging.INFO
    elif verbose < 15:
        # for verbose = 2 (up to 14), ignore only debug
        level = 15  # int corresponding to detail level
    else:
        # for verbose >= 15, write everything
        level = logging.DEBUG
    utils.init_logger(logfile_base,
                      level,
                      'corepers',
                      verbose=verbose,
                      quiet=quiet)
    logger = logging.getLogger("corepers")
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)

    logger.info(get_info(tol, multi, mixed, floor))

    # Read pangenome
    fams_by_strain, families, all_strains = utilsp.read_pangenome(
        pangenome, logger)
    # If list of genomes given, get subset of previous dicts, including only the genomes asked
    if lstinfo_file:
        fams_by_strain, families, all_strains = pers.get_subset_genomes(
            fams_by_strain, families, lstinfo_file)
    # Generate persistent genome
    fams = pers.get_pers(fams_by_strain, families, len(all_strains), tol,
                         multi, mixed, floor)
    # Write persistent genome to file
    pers.write_persistent(fams, outputfile)
    logger.info("Persistent genome step done.")
    return outputfile