def test_download_2taxid(caplog):
    """
    Request two taxids at once: a subspecies taxid plus the taxid of one
    specific strain. All genomes of the subspecies AND the specific strain
    must be downloaded. When only the subspecies taxid is given, the
    specific strain must not be downloaded.
    """
    caplog.set_level(logging.INFO)
    # 913079 = subspecies Salmonella enterica subsp. enterica serovar Mississippi
    # 1212561 = strain Salmonella enterica subsp. enterica serovar Mississippi strain 2010K-1406
    out_both = os.path.join(GENEPATH, "test_download_refseq_2taxid")
    db_both, nb_both = downg.download_from_ncbi("salmo", "refseq", None, "",
                                                "913079,1212561", "", "", out_both, 1)
    # Uncompressed sequences must land in the expected folder
    assert db_both == os.path.join(out_both, "Database_init")
    # Exact count changes over time (NCBI is updated daily); there must be
    # at least the genomes known when the test was written.
    assert nb_both >= 13
    # db folder exists and holds one file per downloaded genome
    assert os.path.isdir(db_both)
    assert len(os.listdir(db_both)) == nb_both
    # Log mentions both requested taxids
    assert "From refseq: Downloading genomes with NCBI_taxid = 913079,1212561" in caplog.text
    # Assembly summary file was downloaded as expected
    assert os.path.isfile(os.path.join(out_both, "assembly_summary-salmo.txt"))
    # ncbi_genome_download output directory exists
    assert os.path.isdir(os.path.join(out_both, "refseq", "bacteria"))

    # Redo without the specific-strain taxid: exactly one genome fewer
    # (the specific strain) must be downloaded.
    out_sub = os.path.join(GENEPATH, "test_download_refseq_2taxid_1")
    _, nb_sub = downg.download_from_ncbi("salmo", "refseq", None, "",
                                         "913079", "", "", out_sub, 1)
    assert nb_both == nb_sub + 1
    assert "From refseq: Downloading genomes with NCBI_taxid = 913079" in caplog.text
def test_download_diffSpeTaxID(caplog):
    """
    When a species taxid and a species name are both given but do not match
    the same genomes, the program must exit with an error message.
    The download log itself cannot be compared: it is already catched by
    NCBI_genome_download.
    """
    outdir = os.path.join(GENEPATH, "test_download_refseq_wrongTaxID")
    with pytest.raises(SystemExit):
        # name ("Acetobacter fabarum") and taxid (104099) point to
        # different species -> empty intersection -> error exit
        downg.download_from_ncbi("Acetobacter_orleanensis", "refseq", "Acetobacter fabarum",
                                 "104099", "", "", "", outdir, 1)
    # No uncompressed-sequence folder was created
    assert not os.path.isdir(os.path.join(outdir, "Database_init"))
    # ncbi_genome_download did not create its output directory either
    assert not os.path.isdir(os.path.join(outdir, "refseq", "bacteria"))
    # Check logs
    caplog.set_level(logging.DEBUG)
    assert "ERROR" in caplog.text
    assert ("No strain correspond to your request. If you are sure there should have "
            "some, check that you gave valid NCBI taxid and/or "
            "NCBI species name and/or NCBI strain name. If you gave several, check that "
            "given taxIDs and names are compatible.") in caplog.text
    # The output directory itself was never created
    assert not os.path.isdir(outdir)
def test_download_diff_specificStrain_species(caplog):
    """
    When a species name is given together with specific strain names that do
    not exist for this species, no strain is found and the program must exit
    with an error message.
    """
    outdir = os.path.join(GENEPATH, "test_download_refseq_wrongTaxID")
    with pytest.raises(SystemExit):
        # the requested strains do not belong to "Acetobacter fabarum"
        downg.download_from_ncbi("Acetobacter_orleanensis", "refseq", "Acetobacter fabarum",
                                 "", "", "SB2390,AS001254", "", outdir, 1)
    # No uncompressed-sequence folder was created
    assert not os.path.isdir(os.path.join(outdir, "Database_init"))
    # ncbi_genome_download did not create its output directory either
    assert not os.path.isdir(os.path.join(outdir, "refseq", "bacteria"))
    # Check logs
    caplog.set_level(logging.DEBUG)
    assert "ERROR" in caplog.text
    assert ("No strain correspond to your request. If you are sure there should have "
            "some, check that you gave valid NCBI taxid and/or "
            "NCBI species name and/or NCBI strain name. If you gave several, check that "
            "given taxIDs and names are compatible.") in caplog.text
    # The output directory itself was never created
    assert not os.path.isdir(outdir)
def test_download_taxid_and_spename(caplog):
    """
    Given both a (sub-species) taxid and a species name, only the genome(s)
    matching the taxid — the intersection of both filters — are downloaded.
    The download log itself cannot be compared: it is already catched by
    NCBI_genome_download.
    """
    caplog.set_level(logging.INFO)
    outdir = os.path.join(GENEPATH, "test_download_refseq_noSpeandSpecific")
    db_dir, nb_gen = downg.download_from_ncbi("aceor", "refseq", "Acetobacter orleanensis",
                                              "", "1231342", "", "", outdir, 1)
    # Uncompressed sequences must land in the expected folder
    assert db_dir == os.path.join(outdir, "Database_init")
    # taxid 1231342 corresponds to a single strain
    assert nb_gen == 1
    assert os.path.isdir(db_dir)
    assert len(os.listdir(db_dir)) == 1
    # Log reports both the taxid and the species filter
    assert ("From refseq: Downloading genomes with "
            "NCBI_taxid = 1231342, which also have") in caplog.text
    assert ("NCBI species = Acetobacter orleanensis") in caplog.text
    # Assembly summary file was downloaded as expected
    assert os.path.isfile(os.path.join(outdir, "assembly_summary-aceor.txt"))
    # ncbi_genome_download output directory exists and holds one folder
    ngd_dir = os.path.join(outdir, "refseq", "bacteria")
    assert os.path.isdir(ngd_dir)
    assert len(os.listdir(ngd_dir)) == 1
def test_download_only_taxid(caplog):
    """
    Given only the taxid of one specific strain, exactly that strain is
    downloaded. The download log itself cannot be compared: it is already
    catched by NCBI_genome_download.
    """
    caplog.set_level(logging.INFO)
    outdir = os.path.join(GENEPATH, "test_download_refseq_specific")
    db_dir, nb_gen = downg.download_from_ncbi("toto", "refseq", None, "",
                                              "1123862", "", "", outdir, 1)
    # Uncompressed sequences must land in the expected folder
    assert db_dir == os.path.join(outdir, "Database_init")
    # taxid 1123862 corresponds to a single strain
    assert nb_gen == 1
    assert os.path.isdir(db_dir)
    assert len(os.listdir(db_dir)) == nb_gen
    # Assembly summary file was downloaded as expected
    assert os.path.isfile(os.path.join(outdir, "assembly_summary-toto.txt"))
    # ncbi_genome_download output directory exists and holds one folder
    ngd_dir = os.path.join(outdir, "refseq", "bacteria")
    assert os.path.isdir(ngd_dir)
    assert len(os.listdir(ngd_dir)) == 1
    # Log mentions the specific taxid
    assert "Downloading genomes with NCBI_taxid = 1123862" in caplog.text
def test_download_all_info(caplog):
    """
    Give everything at once: species name, species taxid, sub-species taxid
    and a specific strain name; only the single matching strain is downloaded.
    """
    caplog.set_level(logging.INFO)
    outdir = os.path.join(GENEPATH, "test_download_allinfo")
    db_dir, nb_gen = downg.download_from_ncbi("toto-spe", "refseq", "Acetobacter orleanensis",
                                              "104099", "1231342", "JCM 7639T", "", outdir, 1)
    # Uncompressed sequences must land in the expected folder
    assert db_dir == os.path.join(outdir, "Database_init")
    # all filters combined match exactly one strain
    assert nb_gen == 1
    assert os.path.isdir(db_dir)
    assert len(os.listdir(db_dir)) == 1
    # Log reports the strain name and every additional filter
    assert ("From refseq: Downloading the following specified strain(s): "
            "JCM 7639T, which also have") in caplog.text
    assert "NCBI species = Acetobacter orleanensis" in caplog.text
    assert "NCBI_species_taxid = 104099" in caplog.text
    assert "NCBI_taxid = 1231342" in caplog.text
    # Assembly summary file was downloaded as expected
    assert os.path.isfile(os.path.join(outdir, "assembly_summary-toto-spe.txt"))
    # ncbi_genome_download output directory exists and holds one folder
    ngd_dir = os.path.join(outdir, "refseq", "bacteria")
    assert os.path.isdir(ngd_dir)
    assert len(os.listdir(ngd_dir)) == 1
def test_download_only_spetaxid(caplog):
    """
    Given only a species taxid, all genomes of the species are downloaded
    (.gz archives), then uncompressed into the db folder, which must be
    named as expected.
    """
    caplog.set_level(logging.INFO)
    outdir = os.path.join(GENEPATH, "test_download_refseq_noSpe")
    db_dir, nb_gen = downg.download_from_ncbi("toto", "refseq", None, "104099",
                                              "", "", "", outdir, 1)
    # Uncompressed sequences must land in the expected folder
    assert db_dir == os.path.join(outdir, "Database_init")
    # Exact count changes over time (NCBI is updated daily); at least the
    # 4 genomes known in nov. 2019 must be there.
    assert nb_gen >= 4
    assert os.path.isdir(db_dir)
    assert len(os.listdir(db_dir)) == nb_gen
    # Log mentions the species taxid
    assert "Downloading all genomes of NCBI_species_taxid = 104099" in caplog.text
    # Assembly summary file was downloaded as expected
    assert os.path.isfile(os.path.join(outdir, "assembly_summary-toto.txt"))
    # ncbi_genome_download output directory exists, one folder per genome
    ngd_dir = os.path.join(outdir, "refseq", "bacteria")
    assert os.path.isdir(ngd_dir)
    assert len(os.listdir(ngd_dir)) == nb_gen
def test_download_specify_strains_notaxid(caplog):
    """
    Ask for 4 specific strain names while giving a species taxid matching only
    one of them ("LMG 1583" belongs to 104099; "SB2390", "AS001254" and
    "KPPR1" belong to Klebsiella pneumoniae, 573): only the single strain
    compatible with the taxid is downloaded.
    """
    caplog.set_level(logging.INFO)
    outdir = os.path.join(GENEPATH, "test_download_specify_strains")
    db_dir, nb_gen = downg.download_from_ncbi("Klebsiella_pneumoniae", "refseq", "",
                                              "104099", "", "SB2390,AS001254,KPPR1,LMG 1583",
                                              "", outdir, 1)
    # Uncompressed sequences must land in the expected folder
    assert db_dir == os.path.join(outdir, "Database_init")
    # only "LMG 1583" matches taxid 104099
    assert nb_gen == 1
    assert os.path.isdir(db_dir)
    assert len(os.listdir(db_dir)) == nb_gen
    # Log reports the requested strains and the extra taxid filter
    assert ("Downloading the following specified strain(s): SB2390,AS001254,KPPR1,LMG 1583, "
            "which also have:") in caplog.text
    assert ("NCBI_species_taxid = 104099") in caplog.text
    # Assembly summary file was downloaded as expected
    assert os.path.isfile(os.path.join(outdir, "assembly_summary-Klebsiella_pneumoniae.txt"))
def test_download_specify_strains_fromfile(caplog):
    """
    Strain names are read from a file (instead of the command line) and
    combined with the matching species taxids: every listed strain is
    downloaded.
    """
    caplog.set_level(logging.INFO)
    # file listing the strains to fetch, one per line
    strain_file = os.path.join(DATA_TEST_DIR, "test_files", "test_list-strains.txt")
    outdir = os.path.join(GENEPATH, "test_download_specify_strains")
    db_dir, nb_gen = downg.download_from_ncbi("spestrain", "refseq", "", "573,104099",
                                              "", strain_file, "", outdir, 1)
    # Uncompressed sequences must land in the expected folder
    assert db_dir == os.path.join(outdir, "Database_init")
    # at least the strains listed in the file
    assert nb_gen >= 3
    assert os.path.isdir(db_dir)
    assert len(os.listdir(db_dir)) == nb_gen
    # Log reports the strain file and the extra taxid filter
    assert ("Downloading all strains specified in "
            "test/data/prepare/test_files/test_list-strains.txt file, "
            "which also have:") in caplog.text
    assert ("NCBI_species_taxid = 573,104099") in caplog.text
    # Assembly summary file was downloaded as expected
    assert os.path.isfile(os.path.join(outdir, "assembly_summary-spestrain.txt"))
def main(cmd, ncbi_species_name, ncbi_species_taxid, ncbi_taxid, ncbi_strains, levels,
         ncbi_section, outdir, tmp_dir, threads, norefseq, db_dir, only_mash, info_file,
         l90, nbcont, cutn, min_dist, max_dist, verbose, quiet):
    """
    Main method, constructing the draft dataset for the given species

    verbosity:
    - defaut 0 : stdout contains INFO, stderr contains ERROR, .log contains INFO and more,
      .log.err contains warning and more
    - 1: same as 0 + WARNING in stderr
    - 2: same as 1 + DETAILS in stdout + DETAILS in .log.details
    - >=15: same as 2 + Add DEBUG in stdout + create .log.debug with everything
      from info to debug

    Parameters
    ----------
    cmd : str
        command line used to launch this program
    ncbi_species_name : str
        name of species to download, as given by NCBI
    ncbi_species_taxid : int
        species taxid given in NCBI
    ncbi_taxid : int
        NCBI taxid (sub-species)
    ncbi_strains : str
        specific strains to download (comma-separated names, or path to a file
        listing them)
    levels : str
        Level of assembly to download. Choice between 'all', 'complete',
        'chromosome', 'scaffold', 'contig'. Default is 'all'
    ncbi_section : str
        NCBI section to download from ('refseq' or 'genbank')
    outdir : str
        path to output directory (where created database will be saved).
    tmp_dir : str
        Path to directory where tmp files are saved (sequences split at each
        row of 5 'N')
    threads : int
        max number of threads to use
    norefseq : bool
        True if user does not want to download again the database
    db_dir : str
        Name of the folder where already downloaded fasta files are saved.
    only_mash : bool
        True if user already has the database and quality of each genome
        (L90, #contigs etc.)
    info_file : str
        File containing information on QC if it was already ran before
        (columns to_annotate, gsize, nb_conts and L90).
    l90 : int
        Max L90 allowed to keep a genome
    nbcont : int
        Max number of contigs allowed to keep a genome
    cutn : int
        cut at each when there are 'cutn' N in a row. Don't cut if equal to 0
    min_dist : int
        lower limit of distance between 2 genomes to keep them
    max_dist : int
        upper limit of distance between 2 genomes to keep them (default is 0.06)
    verbose : int
        verbosity (see above)
    quiet : bool
        True if nothing must be sent to stdout/stderr, False otherwise

    Returns
    -------
    str
        path to the info file written by fg.write_outputfiles, listing the
        genomes kept after QC and mash filtering
    """
    # get species name in NCBI format
    # -> will be used to name output directory
    # -> will be used to download summary file if given species corresponds to NCBI name
    if ncbi_species_name:
        species_linked = "_".join(ncbi_species_name.split())
        species_linked = "_".join(species_linked.split("/"))
    # if species name not given by user, use species taxID (if given) to name output directory
    elif ncbi_species_taxid:
        species_linked = str(ncbi_species_taxid)
    # if neither species name nor species taxid given, use (sub-species) taxID
    # (if given) to name output directory
    elif ncbi_taxid:
        species_linked = str(ncbi_taxid)
    # If no species nor taxID, get specific strain names
    elif ncbi_strains:
        # strains given as a file -> use the file's base name (without extension)
        if os.path.isfile(ncbi_strains):
            species_linked = os.path.basename(ncbi_strains)
            species_linked = os.path.splitext(species_linked)[0]
        # strains given inline -> sanitize spaces, '/' and ',' for a valid dir name
        else:
            species_linked = "_".join(ncbi_strains.split())
            species_linked = "-".join(species_linked.split("/"))
            species_linked = "_and_".join(species_linked.split(","))
    # if neither speName, speID, taxID nor strainName given (--norefseq, mashonly), name is NA
    else:
        species_linked = "NA"
    # Default outdir is species name if given, or species taxID
    if not outdir:
        outdir = species_linked
    # Default tmp_dir is outdir/tmp_files
    if not tmp_dir:
        tmp_dir = os.path.join(outdir, "tmp_files")
    # directory that will be created by ncbi_genome_download
    ncbidir = os.path.join(outdir, ncbi_section, "bacteria")
    os.makedirs(outdir, exist_ok=True)
    os.makedirs(tmp_dir, exist_ok=True)

    # Initialize logger
    # set level of logger: level is the minimum level that will be considered.
    if verbose <= 1:
        level = logging.INFO
    # for verbose = 2, ignore only debug
    if verbose >= 2 and verbose < 15:
        level = utils.detail_lvl()  # int corresponding to detail level
    # for verbose >= 15, write everything
    if verbose >= 15:
        level = logging.DEBUG
    logfile_base = os.path.join(outdir, "PanACoTA_prepare_{}").format(species_linked)
    logfile_base, logger = utils.init_logger(logfile_base, level, 'prepare',
                                             log_details=True, verbose=verbose, quiet=quiet)

    # Message on what will be done (cmd, cores used)
    logger.info(f'PanACoTA version {version}')
    logger.info("Command used\n \t > " + cmd)
    message = f"'PanACoTA prepare' will run on {threads} "
    message += f"cores" if threads > 1 else "core"
    logger.info(message)

    # Start prepare step
    # Run more than only mash filter (!only_mash):
    # - start from QC and mash (norefseq)
    # - start from genome download (!norefseq))
    if not only_mash:
        # Not only mash, so a new info file will be created. If the user still gave an info
        # file (he will be warned that it will be ignored), rename it with '.bak'
        # to avoid erasing it
        if info_file and os.path.isfile(info_file):
            os.rename(info_file, info_file + ".back")

        # 'norefseq = True" : Do not download genomes, just do QC and mash filter on
        # given genomes -> if not, error and exit
        if norefseq:
            logger.warning(f'You asked to skip {ncbi_section} downloads.')

            # -> if db_dir given, watch for sequences there. If does not exist, error and exit
            # (user gave a directory (even if it does not exist), so we won't look for
            # the sequences in other folders)
            if db_dir:
                if not os.path.exists(db_dir):
                    logger.error(f"Database folder {db_dir} supposed to contain fasta "
                                 "sequences does not "
                                 "exist. Please give a valid folder, or leave the default "
                                 "directory (no '-d' option).")
                    sys.exit(1)
            # -> If user did not give db_dir, genomes could be in
            # outdir/Database_init/<genome_name>.fna
            else:
                db_dir = os.path.join(outdir, "Database_init")
                # If it does not exist, check if default compressed files folder exists.
                if not os.path.exists(db_dir):
                    logger.warning(f"Database folder {db_dir} supposed to contain fasta "
                                   "sequences does not "
                                   "exist. We will check if the download folder (with compressed "
                                   "sequences) exists.")
                    # -> if not in database_init, genomes must be in
                    # outdir/refeq/bacteria/<genome_name>.fna.gz. In that case,
                    # uncompress and add them to Database_init
                    if not os.path.exists(ncbidir):
                        logger.error(f"Folder {ncbidir} does not exist. You do not have any "
                                     "genome to analyse. Possible reasons:\n"
                                     "- if you want to rerun analysis in the same folder as "
                                     "sequences were downloaded (my_outdir/Database_init or "
                                     f"my_outdir/{ncbi_section}), make sure you have "
                                     "'-o my_outdir' "
                                     "option\n"
                                     "- if you want to rerun analysis and save them in a new "
                                     "output folder called 'new_outdir', make sure you have "
                                     "'-o new_outdir' option, "
                                     "and you specified where the uncompressed sequences to "
                                     "use are ('-d sequence_database_path'). ")
                        sys.exit(1)
                    # add genomes from refseq/bacteria folder to Database_init
                    nb_gen, _ = dgf.to_database(outdir, ncbi_section)
        # No sequence: Do all steps -> download, QC, mash filter
        else:
            # Download all genomes of the given taxID
            db_dir, nb_gen = dgf.download_from_ncbi(species_linked, ncbi_section,
                                                    ncbi_species_name, ncbi_species_taxid,
                                                    ncbi_taxid, ncbi_strains, levels,
                                                    outdir, threads)
            logger.info(f"{nb_gen} {ncbi_section} genome(s) downloaded")

        # Now that genomes are downloaded and uncompressed, check their quality
        # to remove bad ones
        genomes = fg.check_quality(species_linked, db_dir, tmp_dir, l90, nbcont, cutn)

    # Do only mash filter. Genomes must be already downloaded, and there must be a file with
    # all information on these genomes (L90 etc.)
    else:
        logger.warning('You asked to run only mash steps.')
        if not os.path.exists(info_file):  # info-file missing -> error and exit
            logger.error(f"Your info file {info_file} does not exist. Please provide the "
                         "right name/path, or remove the '--mash-only option to rerun "
                         "quality control.")
            sys.exit(1)
        logger.info(("You want to run only mash steps. Getting information "
                     "from {}").format(info_file))
        genomes = utils.read_genomes_info(info_file, species_linked,)

    # Run Mash
    # genomes : {genome_file: [genome_name, orig_name, path_to_seq_to_annotate,
    #                          size, nbcont, l90]}
    # sorted_genome : [genome_file] ordered by L90/nbcont (keys of genomes)
    sorted_genomes = fg.sort_genomes_minhash(genomes, l90, nbcont)

    # Write discarded genomes to a file -> orig_name, to_annotate, gsize, nb_conts, L90
    discQC = f"by-L90_nbcont-{species_linked}.txt"
    utils.write_genomes_info(genomes, sorted_genomes, discQC, outdir)

    # Remove genomes not corresponding to mash filters
    removed = fg.iterative_mash(sorted_genomes, genomes, outdir, species_linked,
                                min_dist, max_dist, threads, quiet)
    # Write list of genomes kept, and list of genomes discarded by mash step
    info_file = fg.write_outputfiles(genomes, sorted_genomes, removed, outdir,
                                     species_linked, min_dist, max_dist)
    logger.info("End")
    return info_file
def test_download_refseq_vs_genbank(caplog):
    """
    Query the same sub-species taxid (39831 = Klebsiella pneumoniae subsp.
    rhinoscleromatis) first against refseq, then against genbank.
    Currently refseq holds no strain for it (-> error exit) while genbank
    holds 2. Later refseq may gain some, but genbank always has at least
    2 more.
    """
    caplog.set_level(logging.INFO)
    outdir = os.path.join(GENEPATH, "test_download_refseq_genbank")
    # refseq: no genome found -> program exits
    with pytest.raises(SystemExit):
        downg.download_from_ncbi("refseq-genbank", "refseq", None, "",
                                 "39831", "", "", outdir, 1)
    # No uncompressed-sequence folder was created
    assert not os.path.isdir(os.path.join(outdir, "Database_init"))
    # ncbi_genome_download did not create its output directory either
    assert not os.path.isdir(os.path.join(outdir, "refseq", "bacteria"))
    # Check logs
    assert ("From refseq: Downloading genomes with NCBI_taxid = 39831") in caplog.text
    assert "ERROR" in caplog.text
    assert ("No strain correspond to your request. If you are sure there should have "
            "some, check that you gave valid NCBI taxid and/or "
            "NCBI species name and/or NCBI strain name. If you gave several, check that "
            "given taxIDs and names are compatible.") in caplog.text

    # Same query against genbank must succeed
    outdir2 = os.path.join(GENEPATH, "test_download_genbank")
    db_dir, nb_gen = downg.download_from_ncbi("refseq-genbank", "genbank", None, "",
                                              "39831", "", "", outdir2, 1)
    # Uncompressed sequences must land in the expected folder
    assert db_dir == os.path.join(outdir2, "Database_init")
    # Exact count changes over time (NCBI is updated daily); at least the
    # 2 genbank genomes known when the test was written must be there.
    assert nb_gen >= 2
    assert os.path.isdir(db_dir)
    assert len(os.listdir(db_dir)) == nb_gen
    # Log mentions the genbank section this time
    assert ("From genbank: Downloading genomes with NCBI_taxid = 39831") in caplog.text
    # Assembly summary file was downloaded as expected
    assert os.path.isfile(os.path.join(outdir2, "assembly_summary-refseq-genbank.txt"))
    # ncbi_genome_download output directory exists (genbank section)
    assert os.path.isdir(os.path.join(outdir2, "genbank", "bacteria"))
def test_download_specify_level(caplog):
    """
    Given a species taxid and a species name, restricting 'levels' limits the
    download to genomes with the requested assembly level(s).
    """
    caplog.set_level(logging.INFO)
    # First download with no level restriction
    outdir = os.path.join(GENEPATH, "test_download_refseq")
    db_dir, nb_gen = downg.download_from_ncbi("Acetobacter_orleanensis", "refseq",
                                              "Acetobacter orleanensis", "104099",
                                              "", "", "", outdir, 1)
    # Uncompressed sequences must land in the expected folder
    assert db_dir == os.path.join(outdir, "Database_init")
    # Exact count changes over time (NCBI is updated daily); at least the
    # 4 genomes known in nov. 2019 must be there.
    assert nb_gen >= 4
    assert os.path.isdir(db_dir)
    assert len(os.listdir(db_dir)) == nb_gen
    # Assembly summary file was downloaded as expected
    sum_file = os.path.join(outdir, "assembly_summary-Acetobacter_orleanensis.txt")
    assert os.path.isfile(sum_file)
    # Count assembly levels in the summary file (14th tab-separated column):
    # the complete + scaffold totals must match what a level-restricted run
    # downloads below.
    other = 0
    scaf = 0
    comp = 0
    with open(sum_file, "r") as sf:
        sf.readline()  # skip header
        for line in sf:
            level_field = line.split("\t")[13].lower()
            if "complete" in level_field:
                comp += 1
            elif "scaffold" in level_field:
                scaf += 1
            else:
                other += 1
    assert other + scaf + comp == nb_gen
    # ncbi_genome_download output directory exists and contains folders
    ngd_dir = os.path.join(outdir, "refseq", "bacteria")
    assert os.path.isdir(ngd_dir)
    assert len(os.listdir(ngd_dir)) >= 4
    # Log mentions species name + species taxid
    assert ('Downloading all genomes of NCBI species = '
            'Acetobacter orleanensis (NCBI_species_taxid = 104099)') in caplog.text

    # Re-run, keeping only complete and scaffold assemblies
    outdir2 = os.path.join(GENEPATH, "test_download_refseq_only-scaf")
    db_dir2, nb_gen2 = downg.download_from_ncbi("Acetobacter_orleanensis", "refseq",
                                                "Acetobacter orleanensis", "104099",
                                                "", "", "scaffold,complete", outdir2, 1)
    # restricted run must fetch exactly the scaffold + complete genomes
    assert scaf + comp == nb_gen2
    assert db_dir2 == os.path.join(outdir2, "Database_init")
    # Log mentions species name + species taxid + the level restriction
    assert ("Downloading all genomes of NCBI species = Acetobacter orleanensis "
            "(NCBI_species_taxid = 104099). "
            "(Only those assembly levels: scaffold,complete)") in caplog.text