示例#1
0
def genome_download(name, output_path):
    """Download all bacterial genomes of a genus and merge them into one FASTA.

    A directory named after ``name`` (spaces replaced by underscores) is
    created, the FASTA files of the genus are downloaded into it with
    ``ngd.download``, every ``.gz`` file found below it is decompressed with
    ``gunzip`` and all resulting ``.fna`` files are concatenated into
    ``<output_path>/<name>.fasta``.

    Parameters
    ----------
    name : str
        Genus name passed to ``ngd.download``; also used to name the outputs.
    output_path : str
        Prefix of the download directory. It is joined to the genus name by
        plain string concatenation, so it presumably has to end with a path
        separator -- verify against callers.

    Returns
    -------
    str
        Path of the directory the genomes were downloaded into.

    Raises
    ------
    OSError
        If the download directory already exists (raised by ``os.makedirs``).
    """
    safe_name = name.replace(" ", "_")
    # No separator is inserted on purpose: the returned path must stay what
    # callers already receive.
    path = output_path + safe_name
    os.makedirs(path)

    query = dict(group="bacteria", genus=name, file_format="fasta", parallel=10)
    # Dry run first (no ``output`` given), then the real download into ``path``.
    ngd.download(dry_run=True, **query)
    ngd.download(dry_run=False, output=path, **query)

    def find_files(suffix):
        # Match on the real suffix (a substring test would also pick up names
        # such as "x.gz.tmp") and sort, because ``os.walk`` yields entries in
        # arbitrary order and the merged FASTA should be reproducible.
        return sorted(
            os.path.join(root, fname)
            for root, _dirs, fnames in os.walk(path)
            for fname in fnames
            if fname.endswith(suffix)
        )

    for archive in find_files(".gz"):
        sh.gunzip(archive)

    # The directory is walked again because gunzip renamed the files.
    out = output_path + "/" + safe_name + ".fasta"
    sh.cat(find_files(".fna"), _out=out)
    return path
def ngd_download(dir_path, acc_ID, data_folder):
    """Make sure the NCBI files of one assembly are available locally.

    If ``dir_path`` already exists and ``get_files_download`` finds the
    genome, protein, gff and genbank files in it, nothing is downloaded.
    Otherwise the assembly is fetched from the genbank section with
    ``ngd.download`` and every file in ``dir_path`` whose name ends with
    ``gz`` is extracted in place.

    :param dir_path: Folder expected to hold the files of this assembly.
    :param acc_ID: Assembly accession to download.
    :param data_folder: Output folder handed to ``ngd.download``.
    :returns: None
    """
    print('+ Check data for ID: ', acc_ID)
    if os.path.exists(dir_path):
        print('+ Folder already exists: ', dir_path)
        ## get files download
        (genome, prot, gff, gbk) = get_files_download(dir_path)
        download = not all([genome, prot, gff, gbk])
        if download:
            print('+ Not all necessary data is available. Download it again.')
    else:
        download = True

    if not download:
        print('+ Data is already available, no need to download it again')
        return

    print('+ Downloading:')
    ## download in data folder provided
    ngd.download(section='genbank',
                 file_formats='fasta,gff,protein-fasta,genbank',
                 assembly_accessions=acc_ID,
                 output=data_folder,
                 groups='bacteria')

    ## the download may not have produced the folder (e.g. nothing matched);
    ## listing it would raise, so report it and stop here
    if not os.path.isdir(dir_path):
        print('+ No data was downloaded for ID: ', acc_ID)
        return

    ## extract compressed files; the compressed originals are left in place
    for f in os.listdir(dir_path):
        if f.endswith('gz'):
            print("\t- Extracting files: ", f)
            HCGB_files.extract(dir_path + '/' + f, dir_path)
示例#3
0
文件: ngd.py 项目: manto32bit/evol_tk
def main(name=None,
         odir=None,
         formats='fasta',
         ids_list=None,
         size_of_batch=30,
         parallel=10):
    """Download the genbank assemblies that are not yet present in ``db_dir``.

    The assembly IDs are resolved from ``ids_list`` or, failing that, from
    ``name``. When ``'fasta'`` is requested, IDs that already have a
    ``*.fna.gz`` file under ``db_dir/genbank/<domain>/<id>`` are skipped.
    The remaining IDs are downloaded in batches with ``ngd.download`` and
    the collected info lines are written to ``<odir>/metadata.csv``.

    :param name: handed to ``from_name2ids`` when ``ids_list`` is empty,
        e.g. ``"Nitrospirae;"``.
    :param odir: directory receiving ``metadata.csv``; ``db_dir`` is used
        when it is omitted.
    :param formats: comma-separated ngd file formats,
        e.g. ``'fasta,protein-fasta'``.
    :param ids_list: IDs handed to ``id2domain_to_ids``; takes precedence
        over ``name``.
    :param size_of_batch: batch size handed to ``batch_iter``.
    :param parallel: ``parallel`` value passed to ``ngd.download``.
    """
    formats = formats.split(',')

    # ``realpath(None)`` raises, so fall back to the database directory.
    odir = db_dir if odir is None else realpath(odir)
    if ids_list:
        domain2aids, cinfos = id2domain_to_ids(ids_list)
    else:
        domain2aids, cinfos = from_name2ids(name)

    # filter with existing files
    downloaded_aids = []  # despite the name: the IDs that still need a download
    new_domain2aids = {}
    for d, aids in domain2aids.items():
        if 'fasta' in formats:
            curr_dir = join(db_dir, 'genbank', d)
            sub_aids = [
                aid for aid in tqdm(aids)
                if not glob(join(curr_dir, aid, '*.fna.gz'))
            ]
        else:
            # Only the genomic fasta is checked on disk; for any other format
            # selection keep every ID instead of failing with a KeyError.
            sub_aids = aids[::]
        new_domain2aids[d] = sub_aids
        downloaded_aids.extend(sub_aids)
        print(
            f"domain: {d}, original number of ids: {len(aids)}, now ids: {len(sub_aids)} "
        )

    _d = {
        "assembly_accessions": '',
        "dry_run": False,
        "section": "genbank",
        "parallel": parallel,
        "output": db_dir,  # all genomes were downloaded to db_dir
        "file_formats": formats
    }
    print(f'params is {_d}')
    for batch_aids in tqdm(batch_iter(downloaded_aids, size_of_batch)):
        # Same parameters as printed above, with the accessions of this batch.
        ngd.download(**{**_d, "assembly_accessions": ','.join(batch_aids)})

    with open(join(odir, 'metadata.csv'), 'w') as f1:
        f1.write('\n'.join(cinfos))
def main():
    '''Parse the command line, set up logging and start the NCBI download.

    The parsed ``argparse`` namespace is handed to
    ``ncbi_genome_download.download`` unchanged; nothing is returned.
    '''
    parser = argparse.ArgumentParser()

    # One (flags, keyword options) entry per command line argument, listed in
    # the order in which they are registered on the parser.
    argument_table = [
        (('domain',), {
            'choices': ['all'] + ncbi_genome_download.SUPPORTED_DOMAINS,
            'help': 'The NCBI "domain" to download',
        }),
        (('-s', '--section'), {
            'dest': 'section',
            'default': 'refseq',
            'choices': ['refseq', 'genbank'],
            'help': 'NCBI section to download',
        }),
        (('-F', '--format'), {
            'dest': 'file_format',
            'default': 'genbank',
            'choices': ['all'] + list(ncbi_genome_download.FORMAT_NAME_MAP.keys()),
            'help': 'Which format to download (default: genbank)',
        }),
        (('-l', '--assembly-level'), {
            'dest': 'assembly_level',
            'default': 'all',
            'choices': ['all'] + list(ncbi_genome_download.ASSEMBLY_LEVEL_MAP.keys()),
            'help': 'Assembly level of genomes to download (default: all)',
        }),
        (('-g', '--genus'), {
            'dest': 'genus',
            'default': '',
            'help': 'Only download sequences of the provided genus. (default: unset, download all)',
        }),
        (('-T', '--species-taxid'), {
            'dest': 'species_taxid',
            'help': 'Only download sequences of the provided species NCBI taxonomy ID. (default: unset, download all)',
        }),
        (('-t', '--taxid'), {
            'dest': 'taxid',
            'help': 'Only download sequences of the provided NCBI taxonomy ID. (default: unset, download all)',
        }),
        (('-o', '--output-folder'), {
            'dest': 'output',
            'default': os.getcwd(),
            'help': 'Create output hierarchy in specified folder (default: current directory)',
        }),
        (('-u', '--uri'), {
            'dest': 'uri',
            'default': ncbi_genome_download.NCBI_URI,
            'help': 'NCBI base URI to use',
        }),
        (('-p', '--parallel'), {
            'dest': 'parallel',
            'default': 1,
            'type': int,
            'metavar': "N",
            'help': 'Run N downloads in parallel (default: 1)',
        }),
        (('-v', '--verbose'), {
            'action': 'store_true',
            'default': False,
            'help': 'increase output verbosity',
        }),
        (('-d', '--debug'), {
            'action': 'store_true',
            'default': False,
            'help': 'print debugging information',
        }),
        (('-V', '--version'), {
            'action': 'version',
            'version': ncbi_genome_download.__version__,
            'help': 'print version information',
        }),
    ]
    for flags, options in argument_table:
        parser.add_argument(*flags, **options)

    args = parser.parse_args()

    # --debug wins over --verbose; warnings only by default.
    log_level = logging.WARNING
    if args.debug:
        log_level = logging.DEBUG
    elif args.verbose:
        log_level = logging.INFO

    logging.basicConfig(format='%(levelname)s: %(message)s', level=log_level)

    ncbi_genome_download.download(args)
示例#5
0
def download_Refseq_files(outdir,cpus=1,names=False,taxids=False):
	"""Download RefSeq files for the given genera and/or taxids, then process them.

	For every comma-separated entry of ``names`` (used as ``genus``) and of
	``taxids`` (used as ``taxid``) the fasta, protein-fasta and
	assembly-stats files are downloaded into ``outdir``. Afterwards
	``process_Refseq`` is run and the ``refseq`` folder is removed.
	Nothing is downloaded when neither selector is given or when
	``outdir/refseq`` already exists; a message is printed instead.

	outdir -- folder handed to ``check_db``, ``ngd.download`` and ``process_Refseq``
	cpus   -- passed as ``parallel`` to ``ngd.download`` unless it equals 1
	names  -- comma-separated genus names, or a false value
	taxids -- comma-separated taxids, or a false value
	"""
	assemblies,species_tags = check_db(outdir)

	# Guard clauses: nothing to do without a selector or with a stale download.
	if not (names or taxids):
		print("Must specify a name or a taxid.")
		return
	refseq_dir = os.path.join(outdir,"refseq")
	if os.path.exists(refseq_dir):
		print("Refseq download already exists at", refseq_dir)
		print("Delete before proceeding.")
		return

	# ``parallel`` is only passed on when more than the default is requested.
	extra_options = {} if cpus == 1 else {"parallel": cpus}

	# Genus names are handled first, then taxids.
	for selector, joined_values in (("genus", names), ("taxid", taxids)):
		if not joined_values:
			continue
		for value in joined_values.split(","):
			print("Downloading files for {}...".format(value))
			for file_format in ["fasta","protein-fasta","assembly-stats"]:
				print("\tworking on {} files...".format(file_format))
				options = dict(extra_options)
				options[selector] = value
				ngd.download(group="bacteria",file_format=file_format,section="refseq",output=outdir,**options)

	process_Refseq(outdir,assemblies,species_tags)
	if os.path.exists(refseq_dir):
		shutil.rmtree(refseq_dir)
	return
示例#6
0
def main():
    """Parse the command line, set up logging and start the NCBI download.

    The parsed ``argparse`` namespace is handed to
    ``ncbi_genome_download.download`` unchanged; nothing is returned.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('domain',
                        choices=['all'] + ncbi_genome_download.supported_domains,
                        help='The NCBI "domain" to download')
    parser.add_argument('-s', '--section',
                        dest='section', default='refseq', choices=['refseq', 'genbank'],
                        help='NCBI section to download')
    # ``keys()`` is a view on Python 3 and cannot be added to a list directly,
    # hence the explicit ``list()``.
    parser.add_argument('-F', '--format',
                        dest='file_format', default='genbank',
                        choices=['all'] + list(ncbi_genome_download.format_name_map.keys()),
                        help='Which format to download (default: genbank)')
    parser.add_argument('-o', '--output-folder',
                        dest='output', default=os.getcwd(),
                        help='Create output hierarchy in specified folder (default: current directory)')
    parser.add_argument('-u', '--uri',
                        dest='uri', default=ncbi_genome_download.NCBI_URI,
                        help='NCBI base URI to use')
    parser.add_argument('-v', '--verbose',
                        action='store_true', default=False,
                        help='increase output verbosity')
    parser.add_argument('-d', '--debug',
                        action='store_true', default=False,
                        help='print debugging information')
    parser.add_argument('-V', '--version',
                        action='version', version=ncbi_genome_download.__version__,
                        help='print version information')

    args = parser.parse_args()

    # --debug wins over --verbose; warnings only by default.
    if args.debug:
        log_level = logging.DEBUG
    elif args.verbose:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING

    logging.basicConfig(format='%(levelname)s: %(message)s', level=log_level)

    ncbi_genome_download.download(args)
示例#7
0
def main():
    """Download genbank files for a taxid (unless ``--build``) and build the DB.

    Without ``args.build`` the output folder must not exist yet: the files
    are downloaded with ``ngd.download`` and the number of entries of the
    metadata table is logged. In both modes ``make_database`` is then run on
    ``args.outdir``. The process exits via ``sys.exit`` when the output
    folder already exists before a download, when no metadata table was
    written, or when the folder is missing before building.
    """
    args = parse_args()
    meta_file = "{}.meta".format(args.name.replace(" ", "_"))
    logger.info(args)

    if not args.build:
        if pathlib.Path(args.outdir).exists():
            sys.exit(
                "The folder {} exists. Please choose another name \n or rename that folder and run again"
                .format(args.outdir))
        print("Start downloading {}".format(args.taxid))
        print("Location {}".format(args.outdir))
        ngd.download(section=args.section,
                     taxid=args.taxid,
                     group=args.group,
                     output=args.outdir,
                     file_format='genbank',
                     assembly_level=args.assembly_level,
                     metadata_table=meta_file,
                     parallel=args.parallel)

        # A missing metadata table is taken as the sign of a failed download.
        if not pathlib.Path(meta_file).exists():
            sys.exit("Download error! Please check log file")

        # Count the lines with a context manager so the handle is closed.
        with open(meta_file) as handle:
            num_lines = sum(1 for _ in handle)

        # One line is subtracted from the count in the message below,
        # presumably for a header row -- confirm against the table format.
        logger.info("Downloaded {} files for {}".format((num_lines - 1),
                                                        args.taxid))
    if not pathlib.Path(args.outdir).exists():
        sys.exit("Folder {} not existed!".format(args.outdir))

    print("Start building DB for: {}".format(args.name))

    make_database(path=args.outdir,
                  db_name=args.name,
                  ext=args.ext,
                  parallel=args.parallel)
    print("Finished!")
def main():
    """Parse the command line and run ``download``, retrying on connection errors.

    Every parsed option except ``debug``, ``verbose`` and ``retries`` is
    forwarded to ``download`` as a keyword argument. While ``download``
    returns 75 it is called again, at most ``--retries`` times. The last
    return value of ``download`` is returned.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    add('group',
        choices=dflt.TAXONOMIC_GROUPS.choices,
        default=dflt.TAXONOMIC_GROUPS.default,
        help='The NCBI taxonomic group to download (default: %(default)s)')
    add('-s', '--section', dest='section',
        choices=dflt.SECTIONS.choices,
        default=dflt.SECTIONS.default,
        help='NCBI section to download (default: %(default)s)')
    add('-F', '--format', dest='file_format',
        choices=dflt.FORMATS.choices,
        default=dflt.FORMATS.default,
        help='Which format to download (default: %(default)s)')
    add('-l', '--assembly-level', dest='assembly_level',
        choices=dflt.ASSEMBLY_LEVELS.choices,
        default=dflt.ASSEMBLY_LEVELS.default,
        help='Assembly level of genomes to download (default: %(default)s)')
    add('-g', '--genus', dest='genus',
        default=dflt.GENUS.default,
        help='Only download sequences of the provided genus. (default: %(default)s)')
    add('-T', '--species-taxid', dest='species_taxid',
        default=dflt.SPECIES_TAXID.default,
        help='Only download sequences of the provided species NCBI taxonomy ID. (default: %(default)s)')
    add('-t', '--taxid', dest='taxid',
        default=dflt.TAXID.default,
        help='Only download sequences of the provided NCBI taxonomy ID. (default: %(default)s)')
    add('-R', '--refseq-category', dest='refseq_category',
        choices=dflt.REFSEQ_CATEGORIES.choices,
        default=dflt.REFSEQ_CATEGORIES.default,
        help='Only download sequences of the provided refseq category (default: %(default)s)')
    add('-o', '--output-folder', dest='output',
        default=dflt.OUTPUT.default,
        help='Create output hierarchy in specified folder (default: %(default)s)')
    add('-H', '--human-readable', dest='human_readable',
        action='store_true',
        help='Create links in human-readable hierarchy (might fail on Windows)')
    add('-u', '--uri', dest='uri',
        default=dflt.URI.default,
        help='NCBI base URI to use (default: %(default)s)')
    add('-p', '--parallel', dest='parallel',
        type=int, metavar="N",
        default=dflt.NB_PROCESSES.default,
        help='Run %(metavar)s downloads in parallel (default: %(default)s)')
    add('-r', '--retries', dest='retries',
        type=int, metavar="N",
        default=0,
        help='Retry download %(metavar)s times when connection to NCBI fails (default: %(default)s)')
    add('-m', '--metadata-table',
        type=str,
        help='Save tab-delimited file with genome metadata')
    add('-v', '--verbose',
        action='store_true',
        help='increase output verbosity')
    add('-d', '--debug',
        action='store_true',
        help='print debugging information')
    add('-V', '--version',
        action='version', version=__version__,
        help='print version information')

    args = parser.parse_args()

    # --debug wins over --verbose; warnings only by default.
    log_level = logging.WARNING
    if args.debug:
        log_level = logging.DEBUG
    elif args.verbose:
        log_level = logging.INFO

    logging.basicConfig(format='%(levelname)s: %(message)s', level=log_level)

    # The namespace dict is reused as keyword arguments once the options that
    # ``download`` does not take are removed.
    options = vars(args)
    for cli_only in ('debug', 'verbose'):
        del options[cli_only]
    retry_limit = options.pop('retries')  # default comes from the parser

    ret = download(**options)
    for attempt in range(1, retry_limit + 1):
        if ret != 75:
            break
        logging.error(
            'Downloading from NCBI failed due to a connection error, retrying. Retries so far: %s',
            attempt)
        ret = download(**options)

    return ret
示例#9
0
def main():
    """Parse the command line, set up logging and start the NCBI download.

    The parsed ``argparse`` namespace is handed to
    ``ncbi_genome_download.download`` unchanged; nothing is returned.
    """
    parser = argparse.ArgumentParser()
    register = parser.add_argument

    domain_choices = ['all'] + ncbi_genome_download.supported_domains
    register('domain', choices=domain_choices,
             help='The NCBI "domain" to download')

    register('-s', '--section', dest='section', default='refseq',
             choices=['refseq', 'genbank'],
             help='NCBI section to download')

    format_choices = ['all'] + list(ncbi_genome_download.format_name_map.keys())
    register('-F', '--format', dest='file_format', default='genbank',
             choices=format_choices,
             help='Which format to download (default: genbank)')

    level_choices = ['all'] + list(ncbi_genome_download.assembly_level_map.keys())
    register('-l', '--assembly-level', dest='assembly_level', default='all',
             choices=level_choices,
             help='Assembly level of genomes to download (default: all)')

    register('-g', '--genus', dest='genus', default='',
             help='Only download sequences of the provided genus. (default: unset, download all)')

    register('-o', '--output-folder', dest='output', default=os.getcwd(),
             help='Create output hierarchy in specified folder (default: current directory)')

    register('-u', '--uri', dest='uri', default=ncbi_genome_download.NCBI_URI,
             help='NCBI base URI to use')

    register('-p', '--parallel', dest='parallel', default=1, type=int,
             metavar="N",
             help='Run N downloads in parallel (default: 1)')

    register('-v', '--verbose', action='store_true', default=False,
             help='increase output verbosity')

    register('-d', '--debug', action='store_true', default=False,
             help='print debugging information')

    register('-V', '--version', action='version',
             version=ncbi_genome_download.__version__,
             help='print version information')

    args = parser.parse_args()

    # --debug wins over --verbose; warnings only by default.
    log_level = (logging.DEBUG if args.debug
                 else logging.INFO if args.verbose
                 else logging.WARNING)

    logging.basicConfig(format='%(levelname)s: %(message)s', level=log_level)

    ncbi_genome_download.download(args)
示例#10
0
def main(name=None,
         odir=None,
         taxons=None,
         formats='fasta',
         ids_list=None,
         size_of_batch=30,
         parallel=10,
         enable_check=True,
         section='genbank',
         group='bacteria',
         dry_run=False):
    """Download assemblies in batches with ``ngd.download``.

    With ``enable_check`` the IDs are resolved from ``ids_list``, ``name`` or
    ``taxons`` (in that order of precedence), filtered through
    ``check_not_down`` and the collected info lines are written to
    ``<odir>/metadata.csv``. Without it, ``ids_list`` is downloaded as given
    and no metadata file is written, because no info lines are collected.

    :param name: handed to ``from_name2ids`` together with ``section``.
    :param odir: output directory; ``db_dir`` is used when it is omitted.
    :param taxons: handed to ``from_tid2ids``.
    :param formats: comma-separated ngd file formats.
    :param ids_list: assembly IDs (list).
    :param size_of_batch: batch size handed to ``batch_iter``.
    :param parallel: ``parallel`` value passed to ``ngd.download``.
    :param enable_check: resolve and filter the IDs before downloading.
    :param section: ngd section.
    :param group: ngd ``groups`` value.
    :param dry_run: passed to ``ngd.download``; additionally the selected IDs
        are written to ``<odir>/downloaded_aids.list``.
    :raises ValueError: if no usable selector was given for the chosen mode.
    """
    formats = formats.split(',')
    odir = db_dir if odir is None else realpath(odir)

    # Info lines for metadata.csv; only produced by the checking branch.
    cinfos = None
    if enable_check:
        if ids_list:
            # should be assembly ID list
            domain2aids, cinfos = id2domain_to_ids(ids_list)
        elif name is not None:
            domain2aids, cinfos = from_name2ids(name, dataset=section)
        elif taxons is not None:
            domain2aids, cinfos = from_tid2ids(taxons)
        else:
            # Previously this fell through to an unbound-variable error.
            raise ValueError(
                "one of `ids_list`, `name` or `taxons` must be given")

        # filter with existing files
        downloaded_aids = []  # despite the name: the IDs selected for download
        new_domain2aids = {}
        for d, aids in domain2aids.items():
            sub_aids = check_not_down(formats, aids, d, odir)
            new_domain2aids[d] = sub_aids
            downloaded_aids.extend(sub_aids)
            tqdm.write(
                f"domain: {d}, original number of ids: {len(aids)}, now ids: {len(sub_aids)} "
            )
    elif ids_list:
        # disable the check and give a list of ids_list
        downloaded_aids = ids_list[::]
    else:
        raise ValueError("`ids_list` is required when `enable_check` is False")

    if dry_run:
        with open(f'{odir}/downloaded_aids.list', 'w') as f1:
            f1.write('\n'.join(downloaded_aids))
    _d = {
        "dry_run": dry_run,
        "section": section,
        "groups": group,  # if not assign this, it will take long time to iterate all groups
        "parallel": parallel,
        "output": odir,
        "file_formats": formats
    }
    tqdm.write(f'params is {_d}')
    for batch_aids in tqdm(batch_iter(downloaded_aids, size_of_batch)):
        ngd.download(
            assembly_accessions=','.join(batch_aids),
            use_cache=True,  # to avoid it automatic download/update the summary file
            **_d)

    # Without the check there is no metadata; do not clobber an existing file.
    if cinfos is not None:
        with open(join(odir, 'metadata.csv'), 'w') as f1:
            f1.write('\n'.join(cinfos))
def download_from_ncbi(species_linked, section, ncbi_species_name,
    ncbi_species_taxid, ncbi_taxid, spe_strains, levels, outdir, threads):
    """
    Download ncbi genomes of given species

    The FASTA files are downloaded with ``ngd.download`` (group "bacteria"),
    a metadata table is saved next to them, and ``to_database`` is then run
    on ``outdir``. The program exits with status 1 if the download raises or
    does not return 0.

    Parameters
    ----------
    species_linked : str
        given NCBI species with '_' instead of spaces, or NCBI taxID if species
        name not given. Used to name the summary file.
    section : str
        genbank or only refseq
    ncbi_species_name : str or None
        name of species to download: user given NCBI species. None if
        no species name given
    ncbi_species_taxid : int
        species taxid given in NCBI (-T option)
    ncbi_taxid : int
        taxid given in NCBI (-t option)
    spe_strains : str
        specific strain name, or comma-separated strain names
        (or name of a file with one strain name per line)
    levels : str
        assembly level(s) to restrict the download to; passed to ngd as
        ``assembly_levels``. Ignored when empty.
    outdir : str
        Directory where downloaded sequences must be saved
    threads : int
        Number of threads to use to download genome sequences

    Returns
    -------
    tuple :
        ``(db_dir, nb_gen)`` as computed by ``to_database(outdir, section)``
        (presumably the database directory and the number of genomes --
        confirm in ``to_database``)

    """
    # Name of summary file, with metadata for each strain:
    sumfile = os.path.join(outdir, f"assembly_summary-{species_linked}.txt")
    abs_sumfile = os.path.abspath(sumfile)
    # arguments needed to download all genomes of the given species
    abs_outdir = os.path.abspath(outdir)
    keyargs = {"section": section, "file_formats": "fasta",
               "output": abs_outdir,
               "parallel": threads, "groups": "bacteria",
               "metadata_table": abs_sumfile}
    message = f"From {section}: "

    # Specific strains: downloaded only if compatible with ncbi species/taxids
    if spe_strains:
        keyargs["strains"] = spe_strains
        if os.path.isfile(spe_strains):
            message += f"Downloading all strains specified in {spe_strains} file"
        else:
            message += f"Downloading the following specified strain(s): {spe_strains}"
        if ncbi_species_name or ncbi_species_taxid or ncbi_taxid:
            message += ", which also have: "
        if ncbi_species_name:
            keyargs["genera"] = ncbi_species_name
            message += f"\n\t-NCBI species = {ncbi_species_name}"
        if ncbi_species_taxid:
            keyargs["species_taxids"] = ncbi_species_taxid
            message += f"\n\t-NCBI_species_taxid = {ncbi_species_taxid}"
        if ncbi_taxid:
            keyargs["taxids"] = ncbi_taxid
            message += f"\n\t-NCBI_taxid = {ncbi_taxid})."
    # Not downloading specific strains, but a sub-species: must be compatible with species given
    elif ncbi_taxid:
        keyargs["taxids"] = ncbi_taxid
        message += f"Downloading genomes with NCBI_taxid = {ncbi_taxid}"
        if ncbi_species_name or ncbi_species_taxid:
            message += ", which also have: "
        if ncbi_species_name:
            keyargs["genera"] = ncbi_species_name
            message += f"\n\t-NCBI species = {ncbi_species_name}"
        if ncbi_species_taxid:
            keyargs["species_taxids"] = ncbi_species_taxid
            message += f"\n\t-NCBI_species_taxid = {ncbi_species_taxid}"
    # Downloading all genomes of a species
    else:
        message += "Downloading all genomes of "
        # If NCBI species given, add it to arguments to download genomes,
        # and write it to info message
        if ncbi_species_name:
            keyargs["genera"] = ncbi_species_name
            message += f"NCBI species = {ncbi_species_name}"
        # If NCBI species taxid given, add it to arguments to download genomes,
        # and write it to info message
        if ncbi_species_taxid:
            keyargs["species_taxids"] = ncbi_species_taxid
            if ncbi_species_name:
                message += f" (NCBI_species_taxid = {ncbi_species_taxid})."
            else:
                message += f"NCBI_species_taxid = {ncbi_species_taxid}"

    # If assembly level(s) given, add it to arguments, and write to info message
    if levels:
        keyargs["assembly_levels"] = levels
        message += f" (Only those assembly levels: {levels}). "

    logger.info(f"Metadata for all genomes will be saved in {sumfile}")
    logger.info(message)

    # Download genomes
    max_retries = 15  # If connection to NCBI fails, how many retry downloads must be done
    error_message = ("No strain correspond to your request. If you are sure there should have "
                     "some, check that you gave valid NCBI taxid and/or "
                     "NCBI species name and/or NCBI strain name. If you gave several, check that "
                     "given taxIDs and names are compatible.")
    try:
        ret = ngd.download(**keyargs)
    # ``Exception`` rather than a bare except, so that Ctrl-C and explicit
    # exits are not reported as a bad request.
    except Exception:  # pragma: no cover
        # Error message if crash during execution of ncbi_genome_download
        logger.error(error_message)
        sys.exit(1)
    attempts = 0
    # Return code 75 is treated as a connection error (see the message below)
    # and triggers a new attempt.
    while ret == 75 and attempts < max_retries:  # pragma: no cover
        attempts += 1
        logger.error(('Downloading from NCBI failed due to a connection error, '
                      'retrying. Already retried so far: %s'), attempts)
        ret = ngd.download(**keyargs)
    # Message if NGD did not manage to download the genomes (wrong species name/taxid)
    if ret != 0:
        # Error message
        logger.error(error_message)
        sys.exit(1)
    nb_gen, db_dir = to_database(outdir, section)
    return db_dir, nb_gen
示例#12
0
from ncbi_genome_download import download
from subprocess import run
from glob import glob

# Collect, per NCBI section, the accessions listed in the input file that do
# not yet have a genomic FASTA below ``data/``.
# NOTE(review): ``input``, ``output`` and ``threads`` are not defined in this
# script; presumably they are injected by the workflow runner -- confirm.
accession_dict = { 'genbank': [], 'refseq': [] }
with open(input[0]) as fp:
    for accession in fp:
        accession = accession.strip()
        # Skip blank lines: they carry no accession and would break the split.
        if not accession:
            continue
        prefix, _ = accession.split('_', 1)
        # A "GCF" prefix selects the refseq section, anything else genbank.
        section = 'refseq' if prefix == 'GCF' else 'genbank'
        if not glob(f'data/{section}/*/{accession}/*.fna*'):
            accession_dict[section].append(accession)

# Download what is missing, then decompress every .gz file of the section.
for section, accessions in accession_dict.items():
    if accessions:
        download(section = section, file_formats = 'fasta,protein-fasta', assembly_accessions = accessions, output = 'data', parallel = threads)
    gzip_files = glob(f'data/{section}/*/*/*.gz')
    if gzip_files:
        run([ 'gunzip' ] + gzip_files)

# Marker file written once the steps above have run.
with open(output[0], 'w') as fp:
    fp.write('OK')
示例#13
0
def ngd_download(section_given,
                 acc_ID,
                 data_folder,
                 debug,
                 section='genbank',
                 assembly_level='complete',
                 group_given='bacteria'):
    '''
    Function that calls and retrieves data from NCBI using python package ngd.

    The genbank file of the assembly is downloaded unless the expected folder
    ``data_folder/section_given/group_given/acc_ID`` already contains one.
    Downloaded files whose name ends with ``gz`` are extracted in place.

    :param section_given: NCBI section passed to ngd and used to build the data path.
    :param acc_ID: Assembly accession to download.
    :param data_folder: Folder to store data.
    :param debug: True/false for debugging messages
    :param section: Not used by this function.
    :param assembly_level: Passed to ngd as ``assembly_levels``.
    :param group_given: Taxonomic group passed to ngd and used to build the data path.

    :returns: Path of the folder holding the data, or False if that folder
        does not exist after the download.
    :raises RuntimeError: If ``ngd.download`` raises.

    :attention Module ngd requires to download data in bacteria/archaea subfolder under genbank or refseq folder.
    '''
    ##################################
    ## check if necessary to download
    ##################################

    ## get path
    print('+ Check data for ID: ', acc_ID)
    dir_path = os.path.join(data_folder, section_given, group_given, acc_ID)

    ## check if previously download
    if os.path.exists(dir_path):
        print('+ Folder already exists: ', dir_path)
        ## get files download
        (genome, prot, gff,
         gbk) = BacDup.scripts.functions.get_files_annotation(dir_path, debug)
        ## Only genbank format file is required
        download = not gbk
        if download:
            print('+ Not all necessary data is available. Download it again.')
    else:
        download = True

    ## download data
    if download:
        print("\n+ Downloading data for: " + colored(acc_ID, 'green'))

        ## report the parameters of the ngd call
        if debug:
            debug_message("ngd.download call", color="yellow")
            debug_message("dir_path: " + dir_path, color="yellow")
            debug_message("section_given: " + section_given, color="yellow")
            debug_message(
                "section='%s', file_formats='genbank', assembly_level=%s, assembly_accessions=%s, output=%s, groups=%s"
                % (section_given, assembly_level, acc_ID, data_folder,
                   group_given),
                color="yellow")

        ## download in data folder provided
        try:
            ngd.download(section=section_given,
                         file_formats='genbank',
                         assembly_levels=assembly_level,
                         assembly_accessions=acc_ID,
                         output=data_folder,
                         groups=group_given)
        except Exception as err:
            ## a plain string cannot be raised; wrap the message in an
            ## exception and keep the original cause attached
            raise RuntimeError(
                "A problem occurred when contacting NCBI for downloading id (%s) from %s"
                % (acc_ID, section_given)) from err

        ## return empty
        if not os.path.isdir(dir_path):
            return False

        ## extract compressed files; the compressed originals are left in place
        for f in os.listdir(dir_path):
            if f.endswith('gz'):
                print("\t- Extracting files: ", f)
                HCGB.functions.files_functions.extract(dir_path + '/' + f,
                                                       dir_path)

    ## skip
    else:
        print('\t+ Data is already available, no need to download it again')

    print()
    ## return path where data is
    return (dir_path)
示例#14
0
def main():
    '''Parse the command line and run the download, retrying on connection errors.

    The parsed ``argparse`` namespace is handed to
    ``ncbi_genome_download.download``. While that call returns 75 it is
    repeated, at most ``--retries`` times. The last return value is returned.
    '''
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    add('domain',
        choices=['all'] + ncbi_genome_download.SUPPORTED_DOMAINS,
        help='The NCBI "domain" to download')
    add('-s', '--section', dest='section',
        default='refseq', choices=['refseq', 'genbank'],
        help='NCBI section to download')
    add('-F', '--format', dest='file_format',
        default='genbank',
        choices=['all'] + list(ncbi_genome_download.FORMAT_NAME_MAP.keys()),
        help='Which format to download (default: genbank)')
    add('-l', '--assembly-level', dest='assembly_level',
        default='all',
        choices=['all'] + list(ncbi_genome_download.ASSEMBLY_LEVEL_MAP.keys()),
        help='Assembly level of genomes to download (default: all)')
    add('-g', '--genus', dest='genus',
        default='',
        help='Only download sequences of the provided genus. (default: unset, download all)')
    add('-T', '--species-taxid', dest='species_taxid',
        help='Only download sequences of the provided species NCBI taxonomy ID. (default: unset, download all)')
    add('-t', '--taxid', dest='taxid',
        help='Only download sequences of the provided NCBI taxonomy ID. (default: unset, download all)')
    add('-o', '--output-folder', dest='output',
        default=os.getcwd(),
        help='Create output hierarchy in specified folder (default: current directory)')
    add('-H', '--human-readable', dest='human_readable',
        default=False, action='store_true',
        help='Create links in human-readable hierarchy (might fail on Windows)')
    add('-u', '--uri', dest='uri',
        default=ncbi_genome_download.NCBI_URI,
        help='NCBI base URI to use')
    add('-p', '--parallel', dest='parallel',
        default=1, type=int, metavar="N",
        help='Run N downloads in parallel (default: 1)')
    add('-r', '--retries', dest='retries',
        default=0, type=int, metavar="N",
        help='Retry download N times when connection to NCBI fails (default: 0)')
    add('-v', '--verbose',
        action='store_true', default=False,
        help='increase output verbosity')
    add('-d', '--debug',
        action='store_true', default=False,
        help='print debugging information')
    add('-V', '--version',
        action='version', version=ncbi_genome_download.__version__,
        help='print version information')

    args = parser.parse_args()

    # --debug wins over --verbose; warnings only by default.
    log_level = logging.WARNING
    if args.debug:
        log_level = logging.DEBUG
    elif args.verbose:
        log_level = logging.INFO

    logging.basicConfig(format='%(levelname)s: %(message)s', level=log_level)

    ret = ncbi_genome_download.download(args)
    for attempt in range(1, args.retries + 1):
        # Only return code 75 is retried.
        if ret != 75:
            break
        logging.error('Downloading from NCBI failed due to a connection error, retrying. Retries so far: %s',
                      attempt)
        ret = ncbi_genome_download.download(args)

    return ret