Exemplo n.º 1
0
def test_download_2taxid(caplog):
    """
    Give the taxid of a subspecies together with the taxid of a specific strain: all genomes
    of the subspecies + the specific strain must be downloaded.
    When only the subspecies taxid is given, the specific strain is not downloaded
    (exactly one genome less).
    """
    caplog.set_level(logging.INFO)
    species_linked = "salmo"
    section = "refseq"
    NCBI_species = None
    NCBI_species_taxid = ""
    # 913079 is the subspecies Salmonella enterica subsp. enterica serovar Mississippi
    # 1212561 = strain Salmonella enterica subsp. enterica serovar Mississippi strain 2010K-1406
    NCBI_taxid = "913079,1212561"
    NCBI_strains = ""
    outdir = os.path.join(GENEPATH, "test_download_refseq_2taxid")
    threads = 1
    levels = ""
    db_dir, nb_gen = downg.download_from_ncbi(species_linked, section,
                                              NCBI_species, NCBI_species_taxid,
                                              NCBI_taxid, NCBI_strains, levels,
                                              outdir, threads)

    # Check path to uncompressed files is as expected
    assert db_dir == os.path.join(outdir, "Database_init")
    # Check number of genomes downloaded. The exact value cannot be known, as the
    # NCBI content changes over time: only a lower bound is checked.
    assert nb_gen >= 13
    # And that db_dir exists and contains nb_gen files
    assert os.path.isdir(db_dir)
    assert len(os.listdir(db_dir)) == nb_gen
    # Check log mentioning both taxids
    assert "From refseq: Downloading genomes with NCBI_taxid = 913079,1212561" in caplog.text

    # Check that assembly summary file was downloaded as expected
    sum_file = os.path.join(outdir, "assembly_summary-salmo.txt")
    assert os.path.isfile(sum_file)

    # Check that the NCBI_genome_download output directory exists
    ngd_outdir = os.path.join(outdir, "refseq", "bacteria")
    assert os.path.isdir(ngd_outdir)

    # Redo, without the specific strain taxid. Should download the same -1 (not the specific strain)
    # Drop the records of the first run: its message ("... = 913079,1212561") contains the
    # message expected below as a substring, which would make the last check always pass.
    caplog.clear()
    # 913079 is the subspecies Salmonella enterica subsp. enterica serovar Mississippi
    NCBI_taxid_1 = "913079"
    outdir_1 = os.path.join(GENEPATH, "test_download_refseq_2taxid_1")
    db_dir_1, nb_gen_1 = downg.download_from_ncbi(species_linked, section,
                                                  NCBI_species,
                                                  NCBI_species_taxid,
                                                  NCBI_taxid_1, NCBI_strains,
                                                  levels, outdir_1, threads)
    assert nb_gen == nb_gen_1 + 1
    assert "From refseq: Downloading genomes with NCBI_taxid = 913079" in caplog.text
    # The strain taxid must not be mentioned by this second run
    assert "NCBI_taxid = 913079,1212561" not in caplog.text
Exemplo n.º 2
0
def test_download_diffSpeTaxID(caplog):
    """
    Test that, when a species taxID and a species name are given, but those 2 elements do not
    match the same genomes, it exits (SystemExit) with an error message, and that
    nothing is created in the output directory.
    """
    # Set the capture level before running the code under test: changing it
    # afterwards has no effect on the records already emitted.
    caplog.set_level(logging.DEBUG)
    species_linked = "Acetobacter_orleanensis"
    section = "refseq"
    NCBI_species = "Acetobacter fabarum"
    NCBI_species_taxid = "104099"
    NCBI_taxid = ""
    NCBI_strains = ""
    outdir = os.path.join(GENEPATH, "test_download_refseq_wrongTaxID")
    threads = 1
    levels = ""
    with pytest.raises(SystemExit):
        downg.download_from_ncbi(species_linked, section, NCBI_species,
                                 NCBI_species_taxid, NCBI_taxid, NCBI_strains,
                                 levels, outdir, threads)

    # Check path to uncompressed files does not exist
    assert not os.path.isdir(os.path.join(outdir, "Database_init"))

    # Check that the NCBI_genome_download output directory was not created
    ngd_outdir = os.path.join(outdir, "refseq", "bacteria")
    assert not os.path.isdir(ngd_outdir)

    # Check logs
    assert "ERROR" in caplog.text
    assert (
        "No strain correspond to your request. If you are sure there should have "
        "some, check that you gave valid NCBI taxid and/or "
        "NCBI species name and/or NCBI strain name. If you gave several, check that "
        "given taxIDs and names are compatible.") in caplog.text

    # Check that output directory was not created
    assert not os.path.isdir(outdir)
Exemplo n.º 3
0
def test_download_diff_specificStrain_species(caplog):
    """
    Test that, when a species name is given together with specific strain names which
    do not exist for this species, it exits (SystemExit) with an error message, as no
    strain is found, and that nothing is created in the output directory.
    """
    # Set the capture level before running the code under test: changing it
    # afterwards has no effect on the records already emitted.
    caplog.set_level(logging.DEBUG)
    species_linked = "Acetobacter_orleanensis"
    section = "refseq"
    NCBI_species = "Acetobacter fabarum"
    NCBI_species_taxid = ""
    NCBI_taxid = ""
    NCBI_strains = "SB2390,AS001254"
    outdir = os.path.join(GENEPATH, "test_download_refseq_wrongTaxID")
    threads = 1
    levels = ""
    with pytest.raises(SystemExit):
        downg.download_from_ncbi(species_linked, section, NCBI_species,
                                 NCBI_species_taxid, NCBI_taxid, NCBI_strains,
                                 levels, outdir, threads)

    # Check path to uncompressed files does not exist
    assert not os.path.isdir(os.path.join(outdir, "Database_init"))

    # Check that the NCBI_genome_download output directory was not created
    ngd_outdir = os.path.join(outdir, "refseq", "bacteria")
    assert not os.path.isdir(ngd_outdir)

    # Check logs
    assert "ERROR" in caplog.text
    assert (
        "No strain correspond to your request. If you are sure there should have "
        "some, check that you gave valid NCBI taxid and/or "
        "NCBI species name and/or NCBI strain name. If you gave several, check that "
        "given taxIDs and names are compatible.") in caplog.text

    # Check that output directory was not created
    assert not os.path.isdir(outdir)
Exemplo n.º 4
0
def test_download_taxid_and_spename(caplog):
    """
    Given both a taxid and a species name, only the genome(s) matching the taxid
    (i.e. the intersection of both criteria) must be downloaded: exactly 1 genome here.
    """
    caplog.set_level(logging.INFO)
    out_root = os.path.join(GENEPATH, "test_download_refseq_noSpeandSpecific")
    database, genome_count = downg.download_from_ncbi(
        "aceor",                    # species_linked
        "refseq",                   # section
        "Acetobacter orleanensis",  # NCBI species name
        "",                         # NCBI species taxid (not given)
        "1231342",                  # NCBI taxid
        "",                         # NCBI strains (not given)
        "",                         # assembly levels (not given)
        out_root,
        1,                          # threads
    )

    # Uncompressed sequences are expected in <outdir>/Database_init
    expected_database = os.path.join(out_root, "Database_init")
    assert database == expected_database
    # Only one genome corresponds to both the taxid and the species name
    assert genome_count == 1
    # The database folder exists and holds that single genome
    assert os.path.isdir(database)
    database_content = os.listdir(database)
    assert len(database_content) == 1
    # The log mentions the taxid as well as the species name
    captured = caplog.text
    assert ("From refseq: Downloading genomes with "
            "NCBI_taxid = 1231342, which also have") in captured
    assert "NCBI species = Acetobacter orleanensis" in captured

    # The assembly summary file was downloaded
    assert os.path.isfile(os.path.join(out_root, "assembly_summary-aceor.txt"))

    # The ncbi_genome_download output directory exists and contains one folder
    download_dir = os.path.join(out_root, "refseq", "bacteria")
    assert os.path.isdir(download_dir)
    download_content = os.listdir(download_dir)
    assert len(download_content) == 1
Exemplo n.º 5
0
def test_download_only_taxid(caplog):
    """
    Given only the taxid of a specific strain, exactly this one genome must be downloaded.
    """
    caplog.set_level(logging.INFO)
    # Query: nothing but the taxid of the strain
    label, ncbi_part = "toto", "refseq"
    strain_taxid = "1123862"
    not_given = ""
    target = os.path.join(GENEPATH, "test_download_refseq_specific")

    result = downg.download_from_ncbi(label, ncbi_part, None, not_given,
                                      strain_taxid, not_given, not_given,
                                      target, 1)
    uncompressed_dir, how_many = result

    # Uncompressed sequences are expected in <outdir>/Database_init
    assert uncompressed_dir == os.path.join(target, "Database_init")
    # A single genome corresponds to this taxid
    assert how_many == 1
    # The database folder exists and contains as many files as genomes reported
    assert os.path.isdir(uncompressed_dir)
    assert len(os.listdir(uncompressed_dir)) == how_many

    # The assembly summary file was downloaded
    summary = os.path.join(target, "assembly_summary-toto.txt")
    assert os.path.isfile(summary)

    # The ncbi_genome_download output directory exists, with one folder inside
    compressed_dir = os.path.join(target, "refseq", "bacteria")
    assert os.path.isdir(compressed_dir)
    assert len(os.listdir(compressed_dir)) == 1

    # The log mentions the requested taxid
    assert "Downloading genomes with NCBI_taxid = 1123862" in caplog.text
Exemplo n.º 6
0
def test_download_all_info(caplog):
    """
    Give every possible criterion at once: species name, species taxID, sub-species
    taxid and strain name. Exactly 1 genome is expected, and the log must mention
    each of the given criteria.
    """
    caplog.set_level(logging.INFO)
    workdir = os.path.join(GENEPATH, "test_download_allinfo")
    # Positional arguments, in order: species_linked, section, species name,
    # species taxid, taxid, strains, levels, outdir, threads
    request = ("toto-spe", "refseq", "Acetobacter orleanensis", "104099",
               "1231342", "JCM 7639T", "", workdir, 1)
    fasta_dir, n_genomes = downg.download_from_ncbi(*request)

    # Uncompressed sequences are expected in <outdir>/Database_init
    assert fasta_dir == os.path.join(workdir, "Database_init")
    # Only one genome fulfils all criteria
    assert n_genomes == 1
    # The database folder exists and contains this single genome
    assert os.path.isdir(fasta_dir)
    assert len(os.listdir(fasta_dir)) == 1
    # The log lists the strain, followed by all other criteria
    log_text = caplog.text
    assert ("From refseq: Downloading the following specified strain(s): "
            "JCM 7639T, which also have") in log_text
    for criterion in ("NCBI species = Acetobacter orleanensis",
                      "NCBI_species_taxid = 104099",
                      "NCBI_taxid = 1231342"):
        assert criterion in log_text

    # The assembly summary file was downloaded
    assert os.path.isfile(os.path.join(workdir, "assembly_summary-toto-spe.txt"))

    # The ncbi_genome_download output directory exists, with one folder inside
    ngd_dir = os.path.join(workdir, "refseq", "bacteria")
    assert os.path.isdir(ngd_dir)
    assert len(os.listdir(ngd_dir)) == 1
Exemplo n.º 7
0
def test_download_only_spetaxid(caplog):
    """
    Given only a species taxid, all genomes of this species must be downloaded, and
    put in the database folder (named as expected). The download folder must contain
    as many entries as there are genomes in the database folder.
    """
    caplog.set_level(logging.INFO)
    destination = os.path.join(GENEPATH, "test_download_refseq_noSpe")
    spe_taxid = "104099"

    database_dir, total = downg.download_from_ncbi(
        "toto",       # species_linked
        "refseq",     # section
        None,         # NCBI species name (not given)
        spe_taxid,    # NCBI species taxid
        "",           # NCBI taxid (not given)
        "",           # NCBI strains (not given)
        "",           # assembly levels (not given)
        destination,
        1,            # threads
    )

    # Uncompressed sequences are expected in <outdir>/Database_init
    assert database_dir == os.path.join(destination, "Database_init")
    # The exact number of genomes changes over time on the NCBI side, so only a
    # lower bound is checked
    assert total >= 4
    # The database folder exists and contains one file per genome
    assert os.path.isdir(database_dir)
    uncompressed = os.listdir(database_dir)
    assert len(uncompressed) == total
    # The log mentions the species taxid
    assert "Downloading all genomes of NCBI_species_taxid = 104099" in caplog.text

    # The assembly summary file was downloaded
    assert os.path.isfile(os.path.join(destination, "assembly_summary-toto.txt"))

    # The ncbi_genome_download output directory exists, with one entry per genome
    raw_dir = os.path.join(destination, "refseq", "bacteria")
    assert os.path.isdir(raw_dir)
    compressed = os.listdir(raw_dir)
    assert len(compressed) == total
Exemplo n.º 8
0
def test_download_specify_strains_notaxid(caplog):
    """
    Give 4 strain names, with a species taxid matching only 1 of them
    -> only this one strain is downloaded.
    """
    caplog.set_level(logging.INFO)

    # "SB2390,AS001254,KPPR1" belong to KLPN (573), "LMG 1583" to 104099:
    # with species taxid 104099 only, just the last one can be found
    wanted_strains = "SB2390,AS001254,KPPR1,LMG 1583"
    result_dir = os.path.join(GENEPATH, "test_download_specify_strains")

    genomes_dir, found = downg.download_from_ncbi(
        "Klebsiella_pneumoniae",  # species_linked
        "refseq",                 # section
        "",                       # NCBI species name (not given)
        "104099",                 # NCBI species taxid
        "",                       # NCBI taxid (not given)
        wanted_strains,
        "",                       # assembly levels (not given)
        result_dir,
        1,                        # threads
    )

    # Uncompressed sequences are expected in <outdir>/Database_init
    assert genomes_dir == os.path.join(result_dir, "Database_init")
    # Only 1 of the 4 requested strains is downloaded
    assert found == 1
    # The database folder exists and contains one file per genome
    assert os.path.isdir(genomes_dir)
    assert len(os.listdir(genomes_dir)) == found
    # The log lists the requested strains and the species taxid
    logged = caplog.text
    assert (
        "Downloading the following specified strain(s): SB2390,AS001254,KPPR1,LMG 1583, "
        "which also have:") in logged
    assert "NCBI_species_taxid = 104099" in logged
    # The assembly summary file was downloaded
    summary_path = os.path.join(result_dir,
                                "assembly_summary-Klebsiella_pneumoniae.txt")
    assert os.path.isfile(summary_path)
Exemplo n.º 9
0
def test_download_specify_strains_fromfile(caplog):
    """
    Give the strain names through a file, together with the species taxids
    -> at least 3 genomes must be downloaded.
    """
    caplog.set_level(logging.INFO)

    # File listing the strains to download; the species taxids given are 573 and 104099
    strains_file = os.path.join(DATA_TEST_DIR, "test_files",
                                "test_list-strains.txt")
    both_taxids = "573,104099"
    where = os.path.join(GENEPATH, "test_download_specify_strains")

    outcome = downg.download_from_ncbi("spestrain", "refseq", "", both_taxids,
                                       "", strains_file, "", where, 1)
    sequences_dir, nb_downloaded = outcome

    # Uncompressed sequences are expected in <outdir>/Database_init
    assert sequences_dir == os.path.join(where, "Database_init")
    # Lower bound on the number of genomes downloaded
    assert nb_downloaded >= 3
    # The database folder exists and contains one file per genome
    assert os.path.isdir(sequences_dir)
    assert len(os.listdir(sequences_dir)) == nb_downloaded

    # The log mentions the file of strains and both species taxids
    assert ("Downloading all strains specified in "
            "test/data/prepare/test_files/test_list-strains.txt file, "
            "which also have:") in caplog.text
    assert "NCBI_species_taxid = 573,104099" in caplog.text
    # The assembly summary file was downloaded
    assert os.path.isfile(os.path.join(where, "assembly_summary-spestrain.txt"))
Exemplo n.º 10
0
def main(cmd, ncbi_species_name, ncbi_species_taxid, ncbi_taxid, ncbi_strains,
         levels, ncbi_section, outdir, tmp_dir, threads, norefseq, db_dir,
         only_mash, info_file, l90, nbcont, cutn, min_dist, max_dist, verbose,
         quiet):
    """
    Main method, constructing the draft dataset for the given species

    Depending on `norefseq` and `only_mash`, it runs:

    - download + quality control + mash filter (default)
    - quality control + mash filter on already downloaded genomes (`norefseq`)
    - only the mash filter, from an existing info file (`only_mash`)

    Parameters
    ----------
    cmd : str
        command line used to launch this program
    ncbi_species_name : str
        name of species to download, as given by NCBI
    ncbi_species_taxid : int
        species taxid given in NCBI
    ncbi_taxid : int
        NCBI taxid (sub-species)
    ncbi_strains : str
        specific strains to download (or path to a file: in that case, the file name
        without extension is used to name the outputs when no species/taxid is given)
    levels: str
        Level of assembly to download. Choice between 'all', 'complete', 'chromosome',
        'scaffold', 'contig'. Default is 'all'
    ncbi_section : str
        NCBI section to use. It names the download folder (outdir/<ncbi_section>/bacteria)
    outdir : str
        path to output directory (where created database will be saved).
    tmp_dir : str
        Path to directory where tmp files are saved (sequences split at each row of 5 'N')
    threads : int
        max number of threads to use
    norefseq : bool
        True if user does not want to download again the database
    db_dir : str
        Name of the folder where already downloaded fasta files are saved.
    only_mash : bool
        True if user user already has the database and quality of each genome (L90, #contigs etc.)
    info_file : str
        File containing information on QC if it was already ran before (columns to_annotate,
        gsize, nb_conts and L90).
    l90 : int
        Max L90 allowed to keep a genome
    nbcont : int
        Max number of contigs allowed to keep a genome
    cutn : int
        cut at each when there are 'cutn' N in a row. Don't cut if equal to 0
    min_dist : float
        lower limit of distance between 2 genomes to keep them
    max_dist : float
        upper limit of distance between 2 genomes to keep them (default is 0.06)
    verbose : int
        verbosity:
        - defaut 0 : stdout contains INFO, stderr contains ERROR, .log contains INFO and more,
          .log.err contains warning and more
        - 1: same as 0 + WARNING in stderr
        - 2: same as 1 + DETAILS in stdout + DETAILS in .log.details
        - >=15: same as 2 + Add DEBUG in stdout + create .log.debug with everything
          from info to debug
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise

    Returns
    -------
    value returned by `fg.write_outputfiles` (presumably the path to the file listing
    the genomes kept -- to be confirmed against that function)

    Raises
    ------
    SystemExit
        (exit code 1) if no genome can be found with `norefseq` (given `db_dir` does not
        exist, or neither the default database nor the download folder exist), or if
        `only_mash` is asked without an existing `info_file`.
    """

    # get species name in NCBI format
    # -> will be used to name output directory
    # -> will be used to download summary file if given species corresponds to NCBI name
    if ncbi_species_name:
        species_linked = "_".join(ncbi_species_name.split())
        species_linked = "_".join(species_linked.split("/"))

    # if species name not given by user, use species taxID (if given) to name output directory
    elif ncbi_species_taxid:
        species_linked = str(ncbi_species_taxid)
    # if neither species name nor species taxid given by user, use taxID (if given)
    # to name output directory
    elif ncbi_taxid:
        species_linked = str(ncbi_taxid)
    # If no species nor taxID, get specific strain names
    elif ncbi_strains:
        if os.path.isfile(ncbi_strains):
            species_linked = os.path.basename(ncbi_strains)
            species_linked = os.path.splitext(species_linked)[0]
        else:
            species_linked = "_".join(ncbi_strains.split())
            species_linked = "-".join(species_linked.split("/"))
            species_linked = "_and_".join(species_linked.split(","))
    # if neither speName, speID, taxID nor strainName given (--norefseq, mashonly), name is NA
    else:
        species_linked = "NA"
    # Default outdir is species name if given, or species taxID
    if not outdir:
        outdir = species_linked
    # Default tmp_dir is outdir/tmp_files
    if not tmp_dir:
        tmp_dir = os.path.join(outdir, "tmp_files")
    # directory that will be created by ncbi_genome_download
    ncbidir = os.path.join(outdir, ncbi_section, "bacteria")
    os.makedirs(outdir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    # Initialize logger
    # set level of logger: level is the minimum level that will be considered.
    # (if/elif/else so that 'level' is always defined, whatever the value of verbose)
    if verbose <= 1:
        level = logging.INFO
    # for 2 <= verbose < 15, ignore only debug
    elif verbose < 15:
        level = utils.detail_lvl()  # int corresponding to detail level
    # for verbose >= 15, write everything
    else:
        level = logging.DEBUG
    # Only the file name is formatted: outdir is used as is, even if it contains
    # '{' or '}' characters
    logfile_base = os.path.join(outdir, f"PanACoTA_prepare_{species_linked}")
    logfile_base, logger = utils.init_logger(logfile_base,
                                             level,
                                             'prepare',
                                             log_details=True,
                                             verbose=verbose,
                                             quiet=quiet)

    # Message on what will be done (cmd, cores used)
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)
    message = f"'PanACoTA prepare' will run on {threads} "
    message += "cores" if threads > 1 else "core"
    logger.info(message)

    # Start prepare step
    # Run more than only mash filter (!only_mash):
    # - start from QC and mash (norefseq)
    # - start from genome download (!norefseq))
    if not only_mash:
        # Not only mash, so a new info file will be created. If the user still gave an info
        # file (he will be warned that it will be ignored), rename it with '.back'
        # to avoid erasing it
        if info_file and os.path.isfile(info_file):
            os.rename(info_file, info_file + ".back")

        # 'norefseq = True" : Do not download genomes, just do QC and mash filter on given genomes
        # -> if not, error and exit
        if norefseq:
            logger.warning(f'You asked to skip {ncbi_section} downloads.')

            # -> if db_dir given, watch for sequences there. If does not exist, error and exit
            # (user gave a directory (even if it does not exist), so we won't look for
            # the sequences in other folders)
            if db_dir:
                if not os.path.exists(db_dir):
                    logger.error(
                        f"Database folder {db_dir} supposed to contain fasta "
                        "sequences does not "
                        "exist. Please give a valid folder, or leave the default "
                        "directory (no '-d' option).")
                    sys.exit(1)
            # -> If user did not give db_dir, genomes could be in
            # outdir/Database_init/<genome_name>.fna
            else:
                db_dir = os.path.join(outdir, "Database_init")
                # If it does not exist, check if default compressed files folder exists.
                if not os.path.exists(db_dir):
                    logger.warning(
                        f"Database folder {db_dir} supposed to contain fasta "
                        "sequences does not "
                        "exist. We will check if the download folder (with compressed "
                        "sequences) exists.")
                    # -> if not in database_init, genomes must be in
                    # outdir/<ncbi_section>/bacteria/<genome_name>.fna.gz. In that case,
                    # uncompress and add them to Database_init
                    if not os.path.exists(ncbidir):
                        logger.error(
                            f"Folder {ncbidir} does not exist. You do not have any "
                            "genome to analyse. Possible reasons:\n"
                            "- if you want to rerun analysis in the same folder as "
                            "sequences were downloaded (my_outdir/Database_init or "
                            f"my_outdir/{ncbi_section}), make sure you have '-o my_outdir' "
                            "option\n"
                            "- if you want to rerun analysis and save them in a new "
                            "output folder called 'new_outdir', make sure you have "
                            "'-o new_outdir' option, "
                            "and you specified where the uncompressed sequences to "
                            "use are ('-d sequence_database_path'). ")
                        sys.exit(1)
                    # add genomes from the download folder to Database_init
                    nb_gen, _ = dgf.to_database(outdir, ncbi_section)
        # No sequence: Do all steps -> download, QC, mash filter
        else:
            # Download all genomes of the given taxID
            db_dir, nb_gen = dgf.download_from_ncbi(species_linked,
                                                    ncbi_section,
                                                    ncbi_species_name,
                                                    ncbi_species_taxid,
                                                    ncbi_taxid, ncbi_strains,
                                                    levels, outdir, threads)
            logger.info(f"{nb_gen} {ncbi_section} genome(s) downloaded")

        # Now that genomes are downloaded and uncompressed, check their quality to remove bad ones
        genomes = fg.check_quality(species_linked, db_dir, tmp_dir, l90,
                                   nbcont, cutn)

    # Do only mash filter. Genomes must be already downloaded, and there must be a file with
    # all information on these genomes (L90 etc.)
    else:
        logger.warning('You asked to run only mash steps.')
        # info-file missing (not given at all, or not existing) -> error and exit.
        # 'not info_file' is checked first, as os.path.exists(None) raises a TypeError.
        if not info_file or not os.path.exists(info_file):
            logger.error(
                f"Your info file {info_file} does not exist. Please provide the  "
                "right name/path, or remove the '--mash-only option to rerun "
                "quality control.")
            sys.exit(1)
        logger.info(("You want to run only mash steps. Getting information "
                     "from {}").format(info_file))
        genomes = utils.read_genomes_info(
            info_file,
            species_linked,
        )

    # Run Mash
    # genomes : {genome_file: [genome_name, orig_name, path_to_seq_to_annotate, size, nbcont, l90]}
    # sorted_genome : [genome_file] ordered by L90/nbcont (keys of genomes)
    sorted_genomes = fg.sort_genomes_minhash(genomes, l90, nbcont)

    # Write discarded genomes to a file -> orig_name, to_annotate, gsize, nb_conts, L90
    discQC = f"by-L90_nbcont-{species_linked}.txt"
    utils.write_genomes_info(genomes, sorted_genomes, discQC, outdir)

    # Remove genomes not corresponding to mash filters
    removed = fg.iterative_mash(sorted_genomes, genomes, outdir,
                                species_linked, min_dist, max_dist, threads,
                                quiet)
    # Write list of genomes kept, and list of genomes discarded by mash step
    info_file = fg.write_outputfiles(genomes, sorted_genomes, removed, outdir,
                                     species_linked, min_dist, max_dist)
    logger.info("End")
    return info_file
Exemplo n.º 11
0
def test_download_refseq_vs_genbank(caplog):
    """
    Download the strains of a subspecies taxid, first from refseq, then from genbank.
    39831 = Klebsiella pneumoniae subsp. rhinoscleromatis
    This test expects no strain in refseq (exit with error message), and at least 2
    strains in genbank.
    """
    caplog.set_level(logging.INFO)
    # Arguments shared by both runs (only the section and the outdir change)
    label = "refseq-genbank"
    spe_name, spe_taxid, taxid, strains = None, "", "39831", ""
    assembly_levels = ""
    nb_threads = 1

    # First run: refseq, where no genome is found
    refseq_out = os.path.join(GENEPATH, "test_download_refseq_genbank")
    with pytest.raises(SystemExit):
        downg.download_from_ncbi(label, "refseq", spe_name, spe_taxid, taxid,
                                 strains, assembly_levels, refseq_out,
                                 nb_threads)

    # Neither the database folder nor the ncbi_genome_download folder was created
    assert not os.path.isdir(os.path.join(refseq_out, "Database_init"))
    assert not os.path.isdir(os.path.join(refseq_out, "refseq", "bacteria"))

    # The log mentions the request, and the error
    assert ("From refseq: Downloading genomes with NCBI_taxid = 39831"
            ) in caplog.text
    assert "ERROR" in caplog.text
    no_strain_msg = (
        "No strain correspond to your request. If you are sure there should have "
        "some, check that you gave valid NCBI taxid and/or "
        "NCBI species name and/or NCBI strain name. If you gave several, check that "
        "given taxIDs and names are compatible.")
    assert no_strain_msg in caplog.text

    # Second run: same request, on genbank
    genbank_out = os.path.join(GENEPATH, "test_download_genbank")
    fasta_dir, n_found = downg.download_from_ncbi(label, "genbank", spe_name,
                                                  spe_taxid, taxid, strains,
                                                  assembly_levels, genbank_out,
                                                  nb_threads)

    # Uncompressed sequences are expected in <outdir>/Database_init
    assert fasta_dir == os.path.join(genbank_out, "Database_init")
    # The exact number of genomes changes over time on the NCBI side, so only a
    # lower bound is checked
    assert n_found >= 2
    # The database folder exists and contains one file per genome
    assert os.path.isdir(fasta_dir)
    assert len(os.listdir(fasta_dir)) == n_found
    # The log mentions the request on genbank
    assert ("From genbank: Downloading genomes with NCBI_taxid = 39831"
            ) in caplog.text
    # The assembly summary file was downloaded
    assert os.path.isfile(
        os.path.join(genbank_out, "assembly_summary-refseq-genbank.txt"))
    # The ncbi_genome_download output directory exists
    assert os.path.isdir(os.path.join(genbank_out, "genbank", "bacteria"))
Exemplo n.º 12
0
def test_download_specify_level(caplog):
    """
    Given a species taxid and a species name, check that asking for specific assembly
    levels only downloads the genomes having those levels: the number of genomes
    downloaded with 'scaffold,complete' must be the number of such genomes found in
    the summary file of a first, unrestricted, download.
    """
    caplog.set_level(logging.INFO)

    # Request shared by both runs: species_linked, section, species name,
    # species taxid, taxid (not given), strains (not given)
    request = ("Acetobacter_orleanensis", "refseq", "Acetobacter orleanensis",
               "104099", "", "")
    nb_threads = 1

    # First run: no restriction on the assembly level
    full_out = os.path.join(GENEPATH, "test_download_refseq")
    full_db, full_count = downg.download_from_ncbi(*request, "", full_out,
                                                   nb_threads)
    # Uncompressed sequences are expected in <outdir>/Database_init
    assert full_db == os.path.join(full_out, "Database_init")
    # The exact number of genomes changes over time on the NCBI side, so only a
    # lower bound is checked
    assert full_count >= 4
    # The database folder exists and contains one file per genome
    assert os.path.isdir(full_db)
    assert len(os.listdir(full_db)) == full_count

    # The assembly summary file was downloaded
    summary = os.path.join(full_out,
                           "assembly_summary-Acetobacter_orleanensis.txt")
    assert os.path.isfile(summary)
    # Read the assembly level (14th tab-separated field) of each genome of the
    # summary file, in order to count the 'complete' ones, and the 'scaffold' ones
    with open(summary, "r") as handle:
        handle.readline()  # skip header
        assembly_levels = [row.split("\t")[13].lower() for row in handle]
    n_complete = sum(1 for lvl in assembly_levels if "complete" in lvl)
    n_scaffold = sum(1 for lvl in assembly_levels
                     if "complete" not in lvl and "scaffold" in lvl)
    # One line of the summary file per downloaded genome
    assert len(assembly_levels) == full_count

    # The ncbi_genome_download output directory exists and contains folders
    full_ngd = os.path.join(full_out, "refseq", "bacteria")
    assert os.path.isdir(full_ngd)
    assert len(os.listdir(full_ngd)) >= 4
    # The log mentions species name + species taxid
    assert (
        'Downloading all genomes of NCBI species = '
        'Acetobacter orleanensis (NCBI_species_taxid = 104099)') in caplog.text

    # Second run: only complete and scaffold assembly levels
    restricted_out = os.path.join(GENEPATH, "test_download_refseq_only-scaf")
    restricted_db, restricted_count = downg.download_from_ncbi(
        *request, "scaffold,complete", restricted_out, nb_threads)
    assert n_scaffold + n_complete == restricted_count
    assert restricted_db == os.path.join(restricted_out, "Database_init")
    # The log mentions species name + species taxid + the requested levels
    assert (
        "Downloading all genomes of NCBI species = Acetobacter orleanensis "
        "(NCBI_species_taxid = 104099). "
        "(Only those assembly levels: scaffold,complete)") in caplog.text