def set_up_tempdir(tempdir_arg, no_temp_arg, cwd, outdir, config):
    """Decide where intermediate files go and record it in config[KEY_TEMPDIR].

    Parameters:
        tempdir_arg: value of --tempdir (or None) - parent dir for the temp dir.
        no_temp_arg: value of --no-temp; if truthy, intermediates go to outdir.
        cwd: current working directory used to resolve tempdir_arg.
        outdir: the already-resolved output directory.
        config: run configuration dict, mutated in place.

    Exits with status -1 if the directory cannot be created or written to.
    """
    if no_temp_arg:
        # --no-temp: keep intermediate files next to the final output
        tempdir = outdir
        config[KEY_TEMPDIR] = tempdir
        print(green(f"\n--no-temp: ") + f"all intermediate files will be written to {outdir}\n")
    elif tempdir_arg:
        to_be_dir = os.path.join(cwd, tempdir_arg)
        try:
            if not os.path.exists(to_be_dir):
                os.mkdir(to_be_dir)
        # OSError (not a bare except) so SystemExit/KeyboardInterrupt propagate
        except OSError:
            sys.stderr.write(cyan(f'Error: cannot create temp directory {to_be_dir}.\n'))
            sys.exit(-1)
        tempdir = tempfile.mkdtemp(dir=to_be_dir)
        config[KEY_TEMPDIR] = tempdir
    else:
        # no flags: a fresh system temp directory
        tempdir = tempfile.mkdtemp()
        config[KEY_TEMPDIR] = tempdir

    # mkdtemp already creates the dir; this also covers the --no-temp branch
    try:
        if not os.path.exists(tempdir):
            os.mkdir(tempdir)
    except OSError:
        sys.stderr.write(cyan(f'Error: cannot create temp directory {tempdir}.\n'))
        sys.exit(-1)

    # smoke-test that the chosen directory is actually writable
    try:
        with open(os.path.join(tempdir, "test.txt"), "w") as fw:
            fw.write("Test")
    except OSError:
        sys.stderr.write(cyan(f'Error: cannot write to temp directory {tempdir}.\n'))
        sys.exit(-1)
def check_dependencies(dependency_list, module_list, usher_arg, cache_arg):
    """Check external tools and Python modules are available, exit if not.

    Parameters:
        dependency_list: base list of external tool names to check.
        module_list: importable Python module names to check.
        usher_arg: if truthy, also require the `usher` tool.
        cache_arg: if truthy, also require `pangolin-assignment`.

    Exits with status -1 listing whatever is missing; otherwise prints a
    confirmation message.
    """
    missing = []
    # work on a copy: the original appended to the caller's list, permanently
    # mutating it across calls
    to_check = list(dependency_list)
    if usher_arg:
        to_check.append("usher")
    if cache_arg:
        to_check.append("pangolin-assignment")

    for dependency in to_check:
        check_this_dependency(dependency, missing)
    for module in module_list:
        check_module(module, missing)

    if not missing:
        print(green("All dependencies satisfied."))
        return

    if len(missing) == 1:
        sys.stderr.write(
            cyan(f'Error: Missing dependency `{missing[0]}`.') +
            '\nPlease update your pangolin environment.\n')
    else:
        dependencies = "".join(f"\t- {i}\n" for i in missing)
        sys.stderr.write(
            cyan(f'Error: Missing dependencies.') +
            f'\n{dependencies}Please update your pangolin environment.\n')
    sys.exit(-1)
def check_dependencies():
    """Verify the fixed set of external tools and Python modules is installed.

    Exits with status -1 and a message listing anything missing; prints a
    confirmation when everything is present.
    """
    missing = []

    for tool in ("gofasta", "minimap2", "snakemake", "usher"):
        check_this_dependency(tool, missing)

    for module_name in ("Bio", "sklearn", "pandas", "joblib", "pysam",
                        "pangoLEARN", "constellations"):
        check_module(module_name, missing)

    if not missing:
        print(green("All dependencies satisfied."))
        return

    if len(missing) == 1:
        sys.stderr.write(
            cyan(f'Error: Missing dependency `{missing[0]}`.') +
            '\nPlease update your pangolin environment.\n')
    else:
        dependencies = "".join(f"\t- {i}\n" for i in missing)
        sys.stderr.write(
            cyan(f'Error: Missing dependencies.') +
            f'\n{dependencies}Please update your pangolin environment.\n')
    sys.exit(-1)
def get_assignment_cache(cache_file, config):
    """Locate and validate the assignment cache shipped by pangolin-assignment.

    Parameters:
        cache_file: file name of the cache to look for inside the package.
        config: run configuration; KEY_PANGOLIN_DATA_VERSION is read for the
            version-consistency check.

    Returns the path to the cache file, or exits with -1 on any failure.
    """
    # Only the import failure means "pangolin-assignment not installed".
    # The original wrapped the whole search in a bare `except:`, which also
    # caught the SystemExit from the "cannot find cache file" branch below
    # and replaced that error with a misleading install message.
    try:
        import pangolin_assignment
    except ImportError:
        sys.stderr.write(
            cyan(
                '\nError: "pangolin --add-assignment-cache" is required before '
                '"pangolin --use-assignment-cache", in order to install optional '
                'pangolin-assignment repository (that will make future data updates slower).\n'
            ))
        sys.exit(-1)

    cache = ""
    pangolin_assignment_dir = pangolin_assignment.__path__[0]
    for r, d, f in os.walk(pangolin_assignment_dir):
        for fn in f:
            if fn == cache_file and cache == "":
                cache = os.path.join(r, fn)
    if not os.path.exists(cache):
        sys.stderr.write(
            cyan(
                f'Error: cannot find assignment cache file {cache_file} in pangolin_assignment\n'
            ))
        sys.exit(-1)

    # Check versions of pangolin-data and pangolin-assignment to make sure they are consistent.
    if pangolin_assignment.__version__.lstrip(
            'v') != config[KEY_PANGOLIN_DATA_VERSION].lstrip('v'):
        print(
            cyan(
                f'Error: pangolin_assignment cache version {pangolin_assignment.__version__} '
                f'does not match pangolin_data version {config[KEY_PANGOLIN_DATA_VERSION]}. '
                'Run "pangolin --update-data" to fetch latest versions of both.'
            ))
        sys.exit(-1)

    # The cache may be gzipped or plain text: try gzip first, fall back to a
    # plain read when the file is not actually gzip-compressed.
    try:
        with gzip.open(cache, 'rt') as f:
            line = f.readline()
    except (OSError, UnicodeDecodeError):
        with open(cache, 'r') as f:
            line = f.readline()

    # A Git LFS pointer file instead of real data means LFS was never pulled.
    if "git-lfs.github.com" in line:
        sys.stderr.write(
            cyan(
                'Error: Git LFS file not pulled successfully. Please install git-lfs \nusing conda or an alternative (not pip) then re-install pangolin-assignment \nwith pip install git+https://github.com/cov-lineages/pangolin-assignment.git\n'
            ))
        sys.exit(-1)
    return cache
def get_snakefile(thisdir, analysis_mode):
    """Return the Snakefile for the given analysis mode.

    The snakefile used is named after the analysis mode (i.e. pangolearn,
    usher or preprocessing), living under `<thisdir>/scripts/`.
    Exits with -1 if the file does not exist.
    """
    snakefile_path = os.path.join(thisdir, 'scripts', f'{analysis_mode}.smk')
    if os.path.exists(snakefile_path):
        return snakefile_path
    sys.stderr.write(cyan(f'Error: cannot find Snakefile at {snakefile_path}. Check installation\n'))
    sys.exit(-1)
def get_latest_release(dependency):
    """
    Using the github releases API check for the latest release of dependency
    and its tarball. Returns (tag_name, tarball_url).
    """
    url = f"https://api.github.com/repos/cov-lineages/{dependency}/releases"
    try:
        response = request.urlopen(url)
    except Exception as e:
        # Broad on purpose: covers clusters with no external connectivity and
        # a temporarily-exceeded GitHub API limit. It may also surface genuine
        # bugs when version and release tags diverge, so if connectivity is
        # definitely fine, double check the version labels.
        sys.stderr.write(
            cyan("Unable to connect to reach github API "
                 "--update/--data_update requires internet "
                 "connectivity so may not work on certain "
                 "systems or if your IP has exceeded the "
                 f"5,000 request per hour limit\n{e}\n"))
        sys.exit(-1)

    releases = json.load(response)
    newest = releases[0]
    # tag name first, then the tarball URL for that release
    return newest['tag_name'], newest['tarball_url']
def get_snakefile(thisdir):
    """Return the path to the pangolearn Snakefile, exiting -1 if absent."""
    path = os.path.join(thisdir, 'scripts', 'pangolearn.smk')
    if not os.path.exists(path):
        sys.stderr.write(
            cyan(
                f'Error: cannot find Snakefile at {path}\n Check installation\n'
            ))
        sys.exit(-1)
    return path
def set_up_outdir(outdir_arg, cwd, outdir):
    """Resolve the output directory, creating it when needed.

    Parameters:
        outdir_arg: value of --outdir (or None).
        cwd: directory against which a relative --outdir is resolved.
        outdir: default output directory used when --outdir was not given.

    Returns the resolved output directory path; exits -1 if it cannot be
    created.
    """
    if outdir_arg:
        outdir = os.path.join(cwd, outdir_arg)
        if not os.path.exists(outdir):
            try:
                os.mkdir(outdir)
            # OSError (not a bare except) so SystemExit/KeyboardInterrupt propagate
            except OSError:
                sys.stderr.write(cyan(f'Error: cannot create directory:') + f"{outdir}")
                sys.exit(-1)
    return outdir
def check_datadir(datadir_arg):
    """Resolve --datadir to an absolute path, or None when not supplied.

    Exits with -1 if the given directory does not exist.
    """
    if not datadir_arg:
        return None
    # scorpio is handed this path, so it must be absolute
    datadir = os.path.abspath(datadir_arg)
    if not os.path.exists(datadir):
        sys.stderr.write(cyan(f"Cannot find data directory specified: {datadir}\n"))
        sys.exit(-1)
    return datadir
def package_data_check(filename, directory, key, config):
    """Locate a file bundled with the pangolin package and store it in config.

    Parameters:
        filename: name of the packaged data file.
        directory: package-relative directory containing the file.
        key: config key under which the resolved path is stored.
        config: run configuration dict, mutated in place.

    Exits with -1 if the package data cannot be resolved.
    """
    try:
        package_datafile = os.path.join(directory, filename)
        data = pkg_resources.resource_filename('pangolin', package_datafile)
        config[key] = data
    # Exception (not a bare except) so SystemExit/KeyboardInterrupt propagate;
    # the message now names the file that was actually missing instead of a
    # "(unknown)" placeholder.
    except Exception:
        sys.stderr.write(
            cyan(f'Error: Missing package data.') +
            f'\n\t- {filename}\nPlease install the latest pangolin version with `pangolin --update`.\n'
        )
        sys.exit(-1)
def find_query_file(cwd, tempdir, query_arg):
    """Locate the query fasta, either as a file on disk or piped via stdin.

    Parameters:
        cwd: directory against which a relative query path is resolved.
        tempdir: temp directory; stdin input is spooled to a file here.
        query_arg: list of positional query arguments (at most one expected).

    Returns the path to the query fasta. Exits with -1 when more than one
    query is supplied, when neither a file nor stdin input can be found, or
    when no query argument was given at all (IndexError path).

    NOTE(review): select.select on sys.stdin is POSIX-specific — presumably
    this is only ever run on Unix-like systems; confirm before porting.
    """
    if len(query_arg) > 1:
        print(cyan(f"Error: Too many query (input) fasta files supplied: {query_arg}\nPlease supply one only."))
        sys.exit(-1)

    # find the query fasta
    try:
        if not os.path.exists(os.path.join(cwd, query_arg[0])):
            # no file at that path: check (non-blocking, 0.0 timeout) whether
            # data is waiting on stdin
            if select.select([sys.stdin,],[],[],0.0)[0]:
                # spool stdin to a fasta file in the temp directory
                query = os.path.join(tempdir, "stdin_query.fasta")
                with open(query,"w") as fw:
                    for l in sys.stdin:
                        l= l.rstrip("\n")
                        fw.write(l + '\n')
                print(green("Query:\t") + "reading from stdin.")
            elif not select.select([sys.stdin,],[],[],0.0)[0]:
                # nothing on stdin either: report the failure, with a
                # stdin-specific message when the arg looks like "-"
                tried_path = os.path.join(cwd, query_arg[0])
                if tried_path.endswith("-"):
                    sys.stderr.write(cyan(
                        f'Error: cannot find query (input) fasta file using stdin.\n' +
                        'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html' +
                        ' for detailed instructions.\n'))
                    sys.exit(-1)
                else:
                    sys.stderr.write(cyan(f'Error: cannot find query (input) fasta file at:') + f'{tried_path}\n' +
                                     'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html' +
                                     ' for detailed instructions.\n')
                    sys.exit(-1)
        else:
            # the supplied path exists: use it directly
            query = os.path.join(cwd, query_arg[0])
            print(green(f"Query file:\t") + f"{query}")
    except IndexError:
        # query_arg was empty: no path and nothing detected via stdin
        sys.stderr.write(cyan(
            f'Error: input query fasta could not be detected from a filepath or through stdin.\n' +
            'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html' +
            ' for detailed instructions.\n'))
        sys.exit(-1)
    return query
def git_lfs_install():
    """
    'git-lfs install' must be run after installing git-lfs and before cloning
    a repo that uses Git LFS. Exits with -1 (with a readable message) if the
    command fails or the git-lfs binary is not on PATH.
    """
    try:
        subprocess.run(['git-lfs', 'install'],
                       check=True,
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)
    # FileNotFoundError: git-lfs is not installed / not on PATH — the original
    # let this propagate as a raw traceback instead of a friendly error.
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        sys.stderr.write(cyan(f'Error: "git-lfs install" failed: {e}'))
        sys.exit(-1)
def find_designation_cache_and_alias(datadir, designation_cache_file, alias_file):
    """Walk datadir for the designation cache and alias files.

    Returns (designation_cache_path, alias_path); exits with -1 if either
    file cannot be found anywhere under datadir.
    """
    designation_cache_path = ""
    alias_path = ""
    for root, _dirs, files in os.walk(datadir):
        for name in files:
            full = os.path.join(root, name)
            if name == designation_cache_file:
                designation_cache_path = full
            elif name == alias_file:
                alias_path = full

    if not designation_cache_path:
        sys.stderr.write(
            cyan(
                f'Error: Missing designation cache file. Either supply a datadir with a {designation_cache_file} file, or specify `--skip-designation-cache`\n'
            ))
        sys.exit(-1)
    if not alias_path:
        sys.stderr.write(
            cyan(
                f'Error: Missing alias file. Please supply a datadir with a {alias_file} file or check installation of pangolin-data dependency.\n'
            ))
        sys.exit(-1)
    return designation_cache_path, alias_path
def get_datafiles(datadir, file_dict, config):
    """Find required data files under datadir and record their paths in config.

    Parameters:
        datadir: directory tree to search.
        file_dict: maps expected file names to the config keys they populate.
        config: run configuration dict, mutated in place.

    Exits with -1 if any file in file_dict is not found; otherwise prints the
    discovered paths.
    """
    datafiles = {}
    for r, d, f in os.walk(datadir):
        for fn in f:
            if fn in file_dict:
                datafiles[file_dict[fn]] = os.path.join(r, fn)

    # record each discovered file once (the original also re-assigned every
    # key a second time inside the print loop below)
    for key, path in datafiles.items():
        config[key] = path

    for fn in file_dict:
        if file_dict[fn] not in config:
            sys.stderr.write(
                cyan(
                    f'Error: Cannot find {fn} in datadir. Please supply a datadir with required files or specify an alternative analysis mode.\nPlease see https://cov-lineages.org/pangolin.html for full installation and updating instructions.'
                ))
            sys.exit(-1)

    print(green("****\nData files found:"))
    for key, path in datafiles.items():
        print(f"{key}:\t{path}")
    print(green("****"))
def set_up_analysis_mode(analysis_arg, default_mode):
    """
    Resolve the --analysis-mode flag to one of the two engines.

    - takes the default mode set in the config dict (accurate)
    - treats `usher` as a synonym of `accurate` and `pangolearn` of `fast`
    - rejects any other value
    - returns the resolved analysis mode ("usher" or "pangolearn")
    """
    if not analysis_arg:
        return default_mode
    if analysis_arg not in ("usher", "pangolearn", "fast", "accurate"):
        sys.stderr.write(cyan(f"Invalid `--analysis-mode` option specified: please select one of `fast`,`accurate`,`pangolearn` or`usher`\n"))
        sys.exit(-1)
    return "pangolearn" if analysis_arg in ("pangolearn", "fast") else "usher"
def quick_check_query_file(cwd, query_arg, query):
    """Sanity-check that the query file is parseable as fasta.

    Parameters:
        cwd: directory against which query_arg[0] is resolved.
        query_arg: positional query arguments; only checked when the first
            resolves to an existing file.
        query: path to the query file (possibly gz/xz compressed).

    Attempts to parse the first record only; exits with -1 on a
    UnicodeDecodeError (e.g. compressed data piped as stdin).
    """
    if not os.path.exists(os.path.join(cwd, query_arg[0])):
        return

    # transparently open compressed queries; plain paths go straight to SeqIO
    file_ending = query.split(".")[-1]
    if file_ending in ("gz", "gzip", "tgz"):
        handle = gzip.open(query, 'rt')
    elif file_ending in ("xz", "lzma"):
        handle = lzma.open(query, 'rt')
    else:
        handle = None

    try:
        source = handle if handle is not None else query
        # reading a single record is enough to prove the file parses
        for record in SeqIO.parse(source, "fasta"):
            break
    except UnicodeDecodeError:
        sys.stderr.write(cyan(
            f'Error: the input query fasta could not be parsed.\n' +
            'Double check your query fasta and that compressed stdin was not passed.\n' +
            'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html' +
            ' for detailed instructions.\n'))
        sys.exit(-1)
    finally:
        # the original leaked the gzip/lzma handle; always close it
        if handle is not None:
            handle.close()
def main(sysargs=sys.argv[1:]):
    """pangolin v4 command-line entry point.

    Parses arguments, handles update/install-only invocations, resolves the
    analysis mode and data files, then runs the preprocessing and inference
    snakemake workflows and collates the final lineage report.

    Returns 0 on success, 1 when either snakemake run fails; several argument
    paths exit the process directly.
    """
    parser = argparse.ArgumentParser(
        prog=_program,
        description=
        'pangolin: Phylogenetic Assignment of Named Global Outbreak LINeages',
        usage='''pangolin <query> [options]''')

    # --- Input/output options ---
    io_group = parser.add_argument_group('Input-Output options')
    io_group.add_argument('query',
                          nargs="*",
                          help='Query fasta file of sequences to analyse.')
    io_group.add_argument(
        '-o',
        '--outdir',
        action="store",
        help="Output directory. Default: current working directory")
    io_group.add_argument(
        '--outfile',
        action="store",
        help="Optional output file name. Default: lineage_report.csv")
    io_group.add_argument(
        '--tempdir',
        action="store",
        help="Specify where you want the temp stuff to go. Default: $TMPDIR")
    io_group.add_argument(
        "--no-temp",
        action="store_true",
        help="Output all intermediate files, for dev purposes.")
    io_group.add_argument('--alignment',
                          action="store_true",
                          help="Output multiple sequence alignment.")
    io_group.add_argument('--alignment-file',
                          action="store",
                          help="Multiple sequence alignment file name.")
    io_group.add_argument(
        '--expanded-lineage',
        action="store_true",
        default=False,
        help="Optional expanded lineage from alias.json in report.")

    # --- Analysis options ---
    a_group = parser.add_argument_group('Analysis options')
    a_group.add_argument(
        '--analysis-mode',
        action="store",
        help=
        "Specify which inference engine to use. Options: accurate (UShER), fast (pangoLEARN), pangolearn, usher. Default: UShER inference."
    )
    a_group.add_argument(
        "--skip-designation-cache",
        action='store_true',
        default=False,
        help=
        "Developer option - do not use designation cache to assign lineages.",
        dest="skip_designation_cache")
    a_group.add_argument(
        "--skip-scorpio",
        action='store_true',
        default=False,
        help=
        "Developer option - do not use scorpio to check VOC/VUI lineage assignments.",
        dest="skip_scorpio")
    a_group.add_argument(
        '--max-ambig',
        action="store",
        default=0.3,
        type=float,
        help=
        "Maximum proportion of Ns allowed for pangolin to attempt assignment. Default: 0.3",
        dest="maxambig")
    a_group.add_argument(
        '--min-length',
        action="store",
        default=25000,
        type=int,
        help=
        "Minimum query length allowed for pangolin to attempt assignment. Default: 25000",
        dest="minlen")
    # hidden: kept only so old v3 invocations can be detected and warned about
    a_group.add_argument('--usher',
                         action='store_true',
                         default=False,
                         help=argparse.SUPPRESS)

    # --- Data options ---
    d_group = parser.add_argument_group('Data options')
    d_group.add_argument(
        "--update",
        action='store_true',
        default=False,
        help=
        "Automatically updates to latest release of pangolin, pangolin-data, scorpio and constellations (and pangolin-assignment if it has been installed using --add-assignment-cache), then exits."
    )
    d_group.add_argument(
        "--update-data",
        action='store_true',
        dest="update_data",
        default=False,
        help=
        "Automatically updates to latest release of constellations and pangolin-data, including the pangoLEARN model, UShER tree file and alias file (also pangolin-assignment if it has been installed using --add-assignment-cache), then exits."
    )
    d_group.add_argument(
        '--add-assignment-cache',
        action='store_true',
        dest="add_assignment_cache",
        default=False,
        help=
        "Install the pangolin-assignment repository for use with --use-assignment-cache. This makes updates slower and makes pangolin slower for small numbers of input sequences but much faster for large numbers of input sequences."
    )
    d_group.add_argument(
        '--use-assignment-cache',
        action='store_true',
        dest="use_assignment_cache",
        default=False,
        help=
        "Use assignment cache from optional pangolin-assignment repository. NOTE: the repository must be installed by --add-assignment-cache before using --use-assignment-cache."
    )
    d_group.add_argument(
        '-d',
        '--datadir',
        action='store',
        dest="datadir",
        help=
        "Data directory minimally containing the pangoLEARN model, header files and UShER tree. Default: Installed pangolin-data package."
    )
    d_group.add_argument(
        '--usher-tree',
        action='store',
        dest='usher_protobuf',
        help=
        "UShER Mutation Annotated Tree protobuf file to use instead of default from pangolin-data repository or --datadir."
    )
    d_group.add_argument(
        '--assignment-cache',
        action='store',
        dest='assignment_cache',
        help=
        "Cached precomputed assignment file to use instead of default from pangolin-assignment repository. Does not require installation of pangolin-assignment."
    )

    # --- Misc options ---
    m_group = parser.add_argument_group('Misc options')
    m_group.add_argument("--aliases",
                         action='store_true',
                         default=False,
                         help="Print Pango alias_key.json and exit.")
    m_group.add_argument("-v",
                         "--version",
                         action='version',
                         version=f"pangolin {__version__}")
    m_group.add_argument(
        "-pv",
        "--pangolin-data-version",
        action='version',
        version=f"pangolin-data {pangolin_data.__version__}",
        help=
        "show version number of pangolin data files (UShER tree and pangoLEARN model files) and exit."
    )
    m_group.add_argument(
        "--all-versions",
        action='store_true',
        dest="all_versions",
        default=False,
        help="Print all tool, dependency, and data versions then exit.")
    m_group.add_argument("--verbose",
                         action="store_true",
                         help="Print lots of stuff to screen")
    m_group.add_argument("-t",
                         "--threads",
                         action="store",
                         default=1,
                         type=int,
                         help="Number of threads")

    # no arguments at all: show help rather than attempting a run
    if len(sysargs) < 1:
        parser.print_help()
        sys.exit(-1)
    else:
        args = parser.parse_args(sysargs)

    # Initialise config dict
    config = setup_config_dict(cwd)
    data_checks.check_install(config)
    set_up_verbosity(config)

    # warn (but continue) when the deprecated v3 --usher flag is used
    if args.usher:
        sys.stderr.write(
            cyan(
                f"--usher is a pangolin v3 option and is deprecated in pangolin v4. UShER is now the default analysis mode. Use --analysis-mode to explicitly set mode.\n"
            ))

    setup_data(args.datadir, config[KEY_ANALYSIS_MODE], config)

    if args.add_assignment_cache:
        update.install_pangolin_assignment()

    # --update / --update-data both delegate to the update module and exit there
    if args.update:
        version_dictionary = {
            'pangolin': __version__,
            'pangolin-data': config[KEY_PANGOLIN_DATA_VERSION],
            'constellations': config[KEY_CONSTELLATIONS_VERSION],
            'scorpio': config[KEY_SCORPIO_VERSION]
        }
        update.add_pangolin_assignment_if_installed(version_dictionary)
        update.update(version_dictionary)

    if args.update_data:
        version_dictionary = {
            'pangolin-data': config[KEY_PANGOLIN_DATA_VERSION],
            'constellations': config[KEY_CONSTELLATIONS_VERSION]
        }
        update.add_pangolin_assignment_if_installed(version_dictionary)
        update.update(version_dictionary, args.datadir)

    # install_pangolin_assignment doesn't exit so that --update/--update-data can be given at the
    # same time (or a query file). If --add-assignment-cache is the only arg, exit without error.
    if args.add_assignment_cache and not args.query:
        sys.exit(0)

    # add flag to config for whether to run scorpio
    if args.skip_scorpio:
        print(green(f"****\nPangolin skipping scorpio steps.\n****"))
        config[KEY_SKIP_SCORPIO] = True

    if args.expanded_lineage:
        print(green(f"****\nAdding expanded lineage column to output.\n****"))
        config[KEY_EXPANDED_LINEAGE] = True

    # Parsing analysis mode flags to return one of 'usher' or 'pangolearn'
    config[KEY_ANALYSIS_MODE] = set_up_analysis_mode(
        args.analysis_mode, config[KEY_ANALYSIS_MODE])
    snakefile = get_snakefile(thisdir, config[KEY_ANALYSIS_MODE])

    config[KEY_DESIGNATION_CACHE], config[
        KEY_ALIAS_FILE] = data_checks.find_designation_cache_and_alias(
            config[KEY_DATADIR], DESIGNATION_CACHE_FILE, ALIAS_FILE)

    # informational modes that print then exit
    if args.aliases:
        print_alias_file_exit(config[KEY_ALIAS_FILE])
    if args.all_versions:
        print_versions_exit(config)

    # to enable not having to pass a query if running update
    # by allowing query to accept 0 to many arguments
    print(
        green(
            f"****\nPangolin running in {config[KEY_ANALYSIS_MODE]} mode.\n****"
        ))
    print_ram_warning(config[KEY_ANALYSIS_MODE])

    # setup outdir and outfiles
    config[KEY_OUTDIR] = io.set_up_outdir(args.outdir, cwd,
                                          config[KEY_OUTDIR])
    config[KEY_OUTFILE] = io.set_up_outfile(args.outfile,
                                            config[KEY_OUTFILE],
                                            config[KEY_OUTDIR])
    io.set_up_tempdir(args.tempdir, args.no_temp, cwd, config[KEY_OUTDIR],
                      config)
    config[KEY_ALIGNMENT_FILE], config[
        KEY_ALIGNMENT_OUT] = io.parse_alignment_options(
            args.alignment, config[KEY_OUTDIR], config[KEY_TEMPDIR],
            args.alignment_file, config[KEY_ALIGNMENT_FILE])

    config[KEY_QUERY_FASTA] = io.find_query_file(cwd, config[KEY_TEMPDIR],
                                                 args.query)
    io.quick_check_query_file(cwd, args.query, config[KEY_QUERY_FASTA])

    if config[KEY_ANALYSIS_MODE] == "usher":
        # Find usher protobuf file (and if specified, assignment cache file too)
        data_checks.get_datafiles(config[KEY_DATADIR], usher_files, config)
        if args.usher_protobuf:
            config[KEY_USHER_PB] = data_checks.check_file_arg(
                args.usher_protobuf, cwd, '--usher-tree')
            print(green(f"Using usher tree file {args.usher_protobuf}"))
        if args.assignment_cache:
            config[KEY_ASSIGNMENT_CACHE] = data_checks.check_file_arg(
                args.assignment_cache, cwd, '--assignment-cache')
            print(
                green(f"Using assignment cache file {args.assignment_cache}"))
        elif args.use_assignment_cache:
            config[KEY_ASSIGNMENT_CACHE] = data_checks.get_assignment_cache(
                USHER_ASSIGNMENT_CACHE_FILE, config)
            print(green("Using pangolin-assignment cache"))
        else:
            config[KEY_ASSIGNMENT_CACHE] = ""
    elif config[KEY_ANALYSIS_MODE] == "pangolearn":
        # find designation cache and the model files
        data_checks.get_datafiles(config[KEY_DATADIR], pangolearn_files,
                                  config)
        if args.use_assignment_cache or args.assignment_cache:
            sys.stderr.write(
                cyan(
                    f"Warning: --use-assignment-cache and --assignment-cache are ignored when --analysis-mode is 'fast' or 'pangolearn'.\n"
                ))

    # first snakemake pass: preprocessing (QC, designation-cache assignment)
    preprocessing_snakefile = get_snakefile(thisdir, "preprocessing")

    if args.verbose:
        print(green("\n**** CONFIG ****"))
        for k in sorted(config):
            print(green(k), config[k])
        status = snakemake.snakemake(preprocessing_snakefile,
                                     printshellcmds=True,
                                     forceall=True,
                                     force_incomplete=True,
                                     workdir=config[KEY_TEMPDIR],
                                     config=config,
                                     cores=args.threads,
                                     lock=False)
    else:
        logger = custom_logger.Logger()
        status = snakemake.snakemake(preprocessing_snakefile,
                                     printshellcmds=False,
                                     forceall=True,
                                     force_incomplete=True,
                                     workdir=config[KEY_TEMPDIR],
                                     config=config,
                                     cores=args.threads,
                                     lock=False,
                                     quiet=True,
                                     log_handler=logger.log_handler)

    if status:  # translate "success" into shell exit code of 0
        # second snakemake pass: the actual inference workflow
        if config[KEY_VERBOSE]:
            print(green("\n**** CONFIG ****"))
            for k in sorted(config):
                print(green(k), config[k])
            status = snakemake.snakemake(snakefile,
                                         printshellcmds=True,
                                         forceall=True,
                                         force_incomplete=True,
                                         workdir=config[KEY_TEMPDIR],
                                         config=config,
                                         cores=args.threads,
                                         lock=False)
        else:
            logger = custom_logger.Logger()
            status = snakemake.snakemake(snakefile,
                                         printshellcmds=False,
                                         forceall=True,
                                         force_incomplete=True,
                                         workdir=config[KEY_TEMPDIR],
                                         config=config,
                                         cores=args.threads,
                                         lock=False,
                                         quiet=True,
                                         log_handler=logger.log_handler)

        if status:
            ## Collate the report here
            preprocessing_csv = os.path.join(config[KEY_TEMPDIR],
                                             "preprocessing.csv")
            inference_csv = os.path.join(config[KEY_TEMPDIR],
                                         "inference_report.csv")
            cached_csv = os.path.join(config[KEY_TEMPDIR],
                                      "cache_assigned.csv")
            constellation_list = get_voc_list(
                os.path.join(config[KEY_TEMPDIR], "get_constellations.txt"),
                config[KEY_ALIAS_FILE])
            generate_final_report(preprocessing_csv, inference_csv,
                                  cached_csv, config[KEY_ALIAS_FILE],
                                  constellation_list,
                                  config[KEY_PANGOLIN_DATA_VERSION],
                                  config[KEY_ANALYSIS_MODE],
                                  args.skip_designation_cache,
                                  config[KEY_OUTFILE], config)

            print(
                green(f"****\nOutput file written to: ") +
                config[KEY_OUTFILE])

            if config[KEY_ALIGNMENT_OUT]:
                print(
                    green(f"****\nOutput alignment written to: ") +
                    config[KEY_ALIGNMENT_FILE])

            return 0
        return 1
    return 1
try: import constellations except: data_checks.install_error( "constellations", "https://github.com/cov-lineages/constellations.git") import os import sys import argparse try: import snakemake except: sys.stderr.write( cyan( f'Error: package `{snakemake}` not found, please install snakemake or update pangolin environment.\n' )) sys.exit(-1) from pangolin.utils.log_colours import green, cyan from pangolin.utils import dependency_checks from pangolin.utils import update from pangolin.utils.config import * from pangolin.utils.initialising import * import pangolin.utils.io_parsing as io from pangolin.utils.report_collation import generate_final_report, get_voc_list thisdir = os.path.abspath(os.path.dirname(__file__))
def main(sysargs=sys.argv[1:]):
    """pangolin v3 command-line entry point (legacy).

    Parses arguments, performs fasta QC in-process, resolves pangoLEARN /
    UShER data files, then runs the snakemake workflow.

    Returns 0 on success, 1 if snakemake fails; several argument paths exit
    the process directly.
    """
    parser = argparse.ArgumentParser(
        prog=_program,
        description=
        'pangolin: Phylogenetic Assignment of Named Global Outbreak LINeages',
        usage='''pangolin <query> [options]''')

    parser.add_argument('query',
                        nargs="*",
                        help='Query fasta file of sequences to analyse.')
    parser.add_argument('--alignment',
                        action="store_true",
                        help="Optional alignment output.")
    parser.add_argument('--usher',
                        action="store_true",
                        help="Use UShER model instead of default pangoLEARN")
    parser.add_argument(
        '--usher-tree',
        action='store',
        dest='usher_protobuf',
        help=
        "UShER Mutation Annotated Tree protobuf file to use instead of --usher default from pangoLEARN repository or --datadir"
    )
    parser.add_argument(
        '--max-ambig',
        action="store",
        default=0.3,
        type=float,
        help=
        "Maximum proportion of Ns allowed for pangolin to attempt assignment. Default: 0.3",
        dest="maxambig")
    parser.add_argument(
        '--min-length',
        action="store",
        default=25000,
        type=int,
        help=
        "Minimum query length allowed for pangolin to attempt assignment. Default: 25000",
        dest="minlen")
    parser.add_argument(
        '-o',
        '--outdir',
        action="store",
        help="Output directory. Default: current working directory")
    parser.add_argument(
        '--outfile',
        action="store",
        help="Optional output file name. Default: lineage_report.csv")
    parser.add_argument(
        '--tempdir',
        action="store",
        help="Specify where you want the temp stuff to go. Default: $TMPDIR")
    parser.add_argument(
        "--no-temp",
        action="store_true",
        help="Output all intermediate files, for dev purposes.")
    parser.add_argument(
        '-d',
        '--datadir',
        action='store',
        dest="datadir",
        help=
        "Data directory minimally containing a fasta alignment and guide tree")
    parser.add_argument(
        '--decompress-model',
        action="store_true",
        dest="decompress",
        help=
        "Permanently decompress the model file to save time running pangolin.")
    parser.add_argument("--verbose",
                        action="store_true",
                        help="Print lots of stuff to screen")
    parser.add_argument("-t",
                        "--threads",
                        action="store",
                        default=1,
                        type=int,
                        help="Number of threads")
    parser.add_argument("-v",
                        "--version",
                        action='version',
                        version=f"pangolin {__version__}")
    parser.add_argument("-pv",
                        "--pangoLEARN-version",
                        action='version',
                        version=f"pangoLEARN {pangoLEARN.__version__}",
                        help="show pangoLEARN's version number and exit")
    parser.add_argument(
        "-dv",
        "--pango-designation-version",
        action='version',
        version=
        f"pango-designation {PANGO_VERSION} used for pangoLEARN and UShER training",
        help="show pango-designation version number used for training and exit"
    )
    parser.add_argument("--aliases",
                        action='store_true',
                        default=False,
                        help="print pango-designation alias_key.json and exit")
    parser.add_argument(
        "--update",
        action='store_true',
        default=False,
        help=
        "Automatically updates to latest release of pangolin, pangoLEARN and constellations, then exits"
    )
    parser.add_argument(
        "--update-data",
        action='store_true',
        dest="update_data",
        default=False,
        help=
        "Automatically updates to latest release of pangoLEARN and constellations, then exits"
    )

    if len(sysargs) < 1:
        parser.print_help()
        sys.exit(-1)
    else:
        args = parser.parse_args(sysargs)
    # NOTE(review): this second call re-parses from sys.argv, discarding the
    # sysargs result above — looks redundant/buggy; preserved as-is here.
    args = parser.parse_args()

    # --update / --update-data delegate to update() which exits the process
    if args.update:
        update({
            'pangolin': __version__,
            'pangolearn': pangoLEARN.__version__,
            'constellations': constellations.__version__,
            'scorpio': scorpio.__version__,
            'pango-designation': pango_designation.__version__
        })

    if args.update_data:
        update({
            'pangolearn': pangoLEARN.__version__,
            'constellations': constellations.__version__,
            'pango-designation': pango_designation.__version__
        })

    # locate alias_key.json inside the installed pango-designation package
    alias_file = None
    pango_designation_dir = pango_designation.__path__[0]
    for r, d, f in os.walk(pango_designation_dir):
        for fn in f:
            if fn == "alias_key.json":
                alias_file = os.path.join(r, fn)
    if not alias_file:
        sys.stderr.write(
            cyan(
                'Could not find alias file: please update pango-designation with \n'
            ) +
            "pip install git+https://github.com/cov-lineages/pango-designation.git"
        )
        sys.exit(-1)

    # --aliases: dump the alias file and exit
    if args.aliases:
        with open(alias_file, 'r') as handle:
            for line in handle:
                print(line.rstrip())
        sys.exit(0)

    dependency_checks.check_dependencies()

    # to enable not having to pass a query if running update
    # by allowing query to accept 0 to many arguments
    if len(args.query) > 1:
        print(
            cyan(
                f"Error: Too many query (input) fasta files supplied: {args.query}\nPlease supply one only"
            ))
        parser.print_help()
        sys.exit(-1)
    else:
        # find the query fasta
        query = os.path.join(cwd, args.query[0])
        if not os.path.exists(query):
            sys.stderr.write(
                cyan(f'Error: cannot find query (input) fasta file at:') +
                f'{query}\n' +
                'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html'
                + ' for detailed instructions.\n')
            sys.exit(-1)
        else:
            print(green(f"The query file is:") + f"{query}")

    # default output dir
    if args.outdir:
        outdir = os.path.join(cwd, args.outdir)
        if not os.path.exists(outdir):
            try:
                os.mkdir(outdir)
            except:
                sys.stderr.write(
                    cyan(f'Error: cannot create directory:') + f"{outdir}")
                sys.exit(-1)
    else:
        outdir = cwd

    if args.outfile:
        outfile = os.path.join(outdir, args.outfile)
    else:
        outfile = os.path.join(outdir, "lineage_report.csv")

    # temp directory: user-specified parent or system default
    if args.tempdir:
        to_be_dir = os.path.join(cwd, args.tempdir)
        if not os.path.exists(to_be_dir):
            os.mkdir(to_be_dir)
        temporary_directory = tempfile.TemporaryDirectory(suffix=None,
                                                          prefix=None,
                                                          dir=to_be_dir)
        tempdir = temporary_directory.name
    else:
        temporary_directory = tempfile.TemporaryDirectory(suffix=None,
                                                          prefix=None,
                                                          dir=None)
        tempdir = temporary_directory.name

    if args.no_temp:
        print(
            green(f"\n--no-temp: ") +
            f"all intermediate files will be written to {outdir}\n")
        tempdir = outdir

    if args.alignment:
        align_dir = outdir
        alignment_out = True
    else:
        align_dir = tempdir
        alignment_out = False

    """
    QC steps:
    1) check no empty seqs
    2) check N content
    3) write a file that contains just the seqs to run
    """

    do_not_run = []
    run = []
    total_input = 0
    print(green("** Sequence QC **"))
    fmt = "{:<30}\t{:>25}\t{:<10}\n"
    print("{:<30}\t{:>25}\t{:<10}\n".format("Sequence name", "Reason",
                                            "Value"))

    # transparently open compressed input for the QC pass
    file_ending = query.split(".")[-1]
    if file_ending in ["gz", "gzip", "tgz"]:
        query = gzip.open(query, 'rt')
    elif file_ending in ["xz", "lzma"]:
        query = lzma.open(query, 'rt')

    for record in SeqIO.parse(query, "fasta"):
        total_input += 1
        # replace spaces in sequence headers with underscores
        record.description = record.description.replace(' ', '_')
        record.id = record.description
        if "," in record.id:
            record.id = record.id.replace(",", "_")
        if len(record) < args.minlen:
            record.description = record.description + f" fail=seq_len:{len(record)}"
            do_not_run.append(record)
            print(fmt.format(record.id, "Seq too short", len(record)))
            # print(record.id, "\t\tsequence too short")
        else:
            num_N = str(record.seq).upper().count("N")
            prop_N = round((num_N) / len(record.seq), 2)
            if prop_N > args.maxambig:
                record.description = record.description + f" fail=N_content:{prop_N}"
                do_not_run.append(record)
                print(fmt.format(record.id, "N content too high", prop_N))
                # print("{record.id} | has an N content of {prop_N}")
            else:
                run.append(record)

    print(green("\nNumber of sequences detected: ") + f"{total_input}")
    print(green("Total passing QC: ") + f"{len(run)}")

    # nothing passed QC: write an all-fail report and exit cleanly
    if run == []:
        with open(outfile, "w") as fw:
            fw.write(
                "taxon,lineage,conflict,ambiguity_score,scorpio_call,scorpio_support,scorpio_conflict,version,pangolin_version,pangoLEARN_version,pango_version,status,note\n"
            )
            for record in do_not_run:
                desc = record.description.split(" ")
                reason = ""
                for item in desc:
                    if item.startswith("fail="):
                        reason = item.split("=")[1]
                fw.write(
                    f"{record.id},None,,,,,,PANGO-{PANGO_VERSION},{__version__},{pangoLEARN.__version__},{PANGO_VERSION},fail,{reason}\n"
                )
        print(cyan(f'Note: no query sequences have passed the qc\n'))
        sys.exit(0)

    # write passing and failing records to separate fastas for the workflow
    post_qc_query = os.path.join(tempdir, 'query.post_qc.fasta')
    with open(post_qc_query, "w") as fw:
        SeqIO.write(run, fw, "fasta")
    qc_fail = os.path.join(tempdir, 'query.failed_qc.fasta')
    with open(qc_fail, "w") as fw:
        SeqIO.write(do_not_run, fw, "fasta")

    config = {
        "query_fasta": post_qc_query,
        "outdir": outdir,
        "outfile": outfile,
        "tempdir": tempdir,
        "aligndir": align_dir,
        "alignment_out": alignment_out,
        "trim_start": 265,  # where to pad to using datafunk
        "trim_end": 29674,  # where to pad after using datafunk
        "qc_fail": qc_fail,
        "alias_file": alias_file,
        "verbose": args.verbose,
        "pangoLEARN_version": pangoLEARN.__version__,
        "pangolin_version": __version__,
        "pango_version": PANGO_VERSION,
        "threads": args.threads
    }

    data_install_checks.check_install(config)
    snakefile = data_install_checks.get_snakefile(thisdir)
    dependency_checks.set_up_verbosity(config)

    # find the data
    if args.datadir:
        data_dir = os.path.join(cwd, args.datadir)
        # scrape the pangoLEARN version string out of the package __init__.py
        version = "Unknown"
        for r, d, f in os.walk(data_dir):
            for fn in f:
                if fn == "__init__.py":
                    print("Found __init__.py")
                    with open(os.path.join(r, fn), "r") as fr:
                        for l in fr:
                            if l.startswith("__version__"):
                                l = l.rstrip("\n")
                                version = l.split('=')[1]
                                version = version.replace('"',
                                                          "").replace(" ", "")
                                print("pangoLEARN version", version)
        config["pangoLEARN_version"] = version
    else:
        pangoLEARN_dir = pangoLEARN.__path__[0]
        data_dir = os.path.join(pangoLEARN_dir, "data")
    # print(f"Looking in {data_dir} for data files...")
    trained_model = ""
    header_file = ""
    designated_hash = ""
    use_usher = args.usher
    if args.usher_protobuf:
        usher_protobuf = os.path.join(cwd, args.usher_protobuf)
        if not os.path.exists(usher_protobuf):
            sys.stderr.write('Error: cannot find --usher-tree file at {}\n'.
                             format(usher_protobuf))
            sys.exit(-1)
        # an explicit tree implies usher mode
        use_usher = True
    else:
        usher_protobuf = ""

    # locate model/header/hash/tree files within the data directory
    for r, d, f in os.walk(data_dir):
        for fn in f:
            if fn == "decisionTreeHeaders_v1.joblib":
                header_file = os.path.join(r, fn)
            elif fn == "decisionTree_v1.joblib":
                trained_model = os.path.join(r, fn)
            elif fn == "lineages.hash.csv":
                designated_hash = os.path.join(r, fn)
            elif fn == "lineageTree.pb" and usher_protobuf == "":
                usher_protobuf = os.path.join(r, fn)

    # each mode has its own set of required files
    if ((use_usher and (usher_protobuf == "" or designated_hash == "") or
         (not use_usher and
          (trained_model == "" or header_file == "" or designated_hash == "")))):
        print(
            cyan("""pangoLEARN version should be >= 2021-05-27. \n Appropriate data files not found from the installed pangoLEARN repo. Please see https://cov-lineages.org/pangolin.html for installation and updating instructions."""
                 ))
        exit(1)
    else:
        if args.decompress:
            # --decompress-model: rewrite the joblib files uncompressed, then exit
            prev_size = os.path.getsize(trained_model)
            print("Decompressing model and header files.")
            model = joblib.load(trained_model)
            joblib.dump(model, trained_model, compress=0)
            headers = joblib.load(header_file)
            joblib.dump(headers, header_file, compress=0)
            if os.path.getsize(trained_model) >= prev_size:
                print(
                    green(f'Success! Decompressed the model file. Exiting\n'))
                sys.exit(0)
            else:
                print(cyan(f'Error: failed to decompress model. Exiting\n'))
                sys.exit(0)

        print(green("\nData files found:"))
        if use_usher:
            print(f"UShER tree:\t{usher_protobuf}")
            print(f"Designated hash:\t{designated_hash}")
        else:
            print(f"Trained model:\t{trained_model}")
            print(f"Header file:\t{header_file}")
            print(f"Designated hash:\t{designated_hash}")

        config["trained_model"] = trained_model
        config["header_file"] = header_file
        config["designated_hash"] = designated_hash

    if use_usher:
        config["usher_protobuf"] = usher_protobuf

    if config['verbose']:
        print(green("\n**** CONFIG ****"))
        for k in sorted(config):
            print(green(k), config[k])
        status = snakemake.snakemake(snakefile,
                                     printshellcmds=True,
                                     forceall=True,
                                     force_incomplete=True,
                                     workdir=tempdir,
                                     config=config,
                                     cores=args.threads,
                                     lock=False)
    else:
        logger = custom_logger.Logger()
        status = snakemake.snakemake(snakefile,
                                     printshellcmds=False,
                                     forceall=True,
                                     force_incomplete=True,
                                     workdir=tempdir,
                                     config=config,
                                     cores=args.threads,
                                     lock=False,
                                     quiet=True,
                                     log_handler=config["log_api"])

    if status:  # translate "success" into shell exit code of 0
        return 0
    return 1
def update(version_dictionary):
    """
    Using the github releases API check for the latest current release
    of the set of dependencies provided e.g., pangolin, scorpio, pangolearn and
    constellations for complete --update and just pangolearn and constellations
    for --update_data. Dictionary keys must be one of pangolin, scorpio,
    pangolearn, pango-designation or constellations.

    Compare these to the currently running versions and if newer releases
    exist update to them accordingly (or do nothing if current).
    Afterwards, exit program safely with a 0 exit code.

    version_dictionary: dictionary keyed with dependency names and the version
        string for that dependency, e.g.
        {pangolin: __version__ of the currently running pangolin module,
         pangolearn: __version__ of the imported pangoLEARN data module,
         scorpio: __version__ of the imported scorpio module,
         constellations: __version__ of the imported constellations data module,
         pango-designation: __version__ of the imported pango_designation module}

    Raises ValueError for an unrecognised dependency name; exits with -1 when
    the github API cannot be reached.
    """
    for dependency, version in version_dictionary.items():
        try:
            latest_release = request.urlopen(
                f"https://api.github.com/repos/cov-lineages/{dependency}/releases")
        # to catch and give a useful error message when people try to run
        # either update option on clusters without external connectivity
        # or have exceeded the github API limit temporarily
        # this may also catch genuine bugs when version and release tags diverge
        # so if this is thrown and there is definitely connectivity then
        # double check the version labels
        except Exception as e:
            sys.stderr.write(cyan("Unable to connect to reach github API "
                                  "--update/--data_update requires internet "
                                  "connectivity so may not work on certain "
                                  "systems or if your IP has exceeded the "
                                  f"5,000 request per hour limit\n{e}\n"))
            sys.exit(-1)

        latest_release = json.load(latest_release)
        # releases are returned newest-first by the API; take the latest tag
        latest_release = LooseVersion(latest_release[0]['tag_name'])

        # to match the tag names add a v to the pangolin internal version
        if dependency in ['pangolin', 'scorpio', 'pango-designation']:
            version = "v" + version
        # to match the tag names for the pangoLEARN and constellations data
        # releases (the two original branches were identical, merged here)
        elif dependency in ['pangolearn', 'constellations']:
            version = version.replace(' ', ' data release ')
        else:
            # fixed: message previously missed the opening quote on 'scorpio'
            raise ValueError("Dependency name for auto-update must be one "
                             "of: 'pangolin', 'pangolearn', 'scorpio', "
                             "'constellations', 'pango-designation'")

        # convert to LooseVersion to have proper ordering of versions
        # this prevents someone using the latest commit/HEAD from being
        # downgraded to the last stable release
        version = LooseVersion(version)

        if version < latest_release:
            subprocess.run([sys.executable, '-m', 'pip', 'install', '--upgrade',
                            f"git+https://github.com/cov-lineages/{dependency}.git@{latest_release}"],
                           check=True,
                           stdout=subprocess.DEVNULL,
                           stderr=subprocess.DEVNULL)
            print(f"{dependency} updated to {latest_release}", file=sys.stderr)
        elif version > latest_release:
            print(
                f"{dependency} ({version}) is newer than latest stable "
                f"release ({latest_release}), not updating.", file=sys.stderr)
        else:
            print(f"{dependency} already latest release ({latest_release})",
                  file=sys.stderr)

    sys.exit(0)
import json from tempfile import gettempdir import tempfile import pprint import json import gzip import lzma import os import joblib from pangolin.utils.log_colours import green, cyan, red try: import pangoLEARN except: sys.stderr.write( cyan('Error: please install `pangoLEARN` with \n') + "pip install git+https://github.com/cov-lineages/pangoLEARN.git") sys.exit(-1) try: import scorpio except: sys.stderr.write( cyan('Error: please install `scorpio` with \n') + "pip install git+https://github.com/cov-lineages/scorpio.git") sys.exit(-1) try: from pangoLEARN import PANGO_VERSION except: sys.stderr.write(
def install_error(package, url):
    """Report that *package* is missing (with a pip install hint for *url*) and abort with exit code -1."""
    message = cyan(f'Error: please install `{package}` with \n') + f"pip install git+{url}\n"
    sys.stderr.write(message)
    sys.exit(-1)
def setup_data(datadir_arg,analysis_mode, config):
    """
    Locate the pangolin-data and constellations data files and record their
    paths and versions in `config` (mutated in place).

    datadir_arg: optional user-supplied data directory; used instead of the
        installed pangolin_data package only when it contains an __init__.py
        reporting a version >= the installed pangolin_data version.
    analysis_mode: analysis mode string (not used by this function).
    config: dict updated with KEY_PANGOLIN_DATA_VERSION,
        KEY_CONSTELLATIONS_VERSION, KEY_DATADIR and KEY_CONSTELLATION_FILES.
    """
    datadir = check_datadir(datadir_arg)

    pangolin_data_dir = pangolin_data.__path__[0]
    constellations_dir = constellations.__path__[0]
    constellation_files = []
    # fixed: ensure this name is always bound — previously a NameError was
    # possible at the config assignment below if no constellations
    # __init__.py was found during the walk
    constellations_version = None

    data_locations = [os.walk(constellations_dir)]
    if datadir:
        data_locations.append(os.walk(datadir))

    # the logic of this is to search the "built-in" constellations
    # path first and then if a custom datadir is passed, follow up with those,
    # so that any files found in the datadir supercede the "built-in" modules.
    # The assumption here is that the datadir contains newer (user updated) data
    # NOTE(review): the '/constellations' suffix checks assume POSIX path
    # separators — confirm behaviour on Windows.
    for r, _, f in itertools.chain.from_iterable(data_locations):
        if r.endswith('/constellations') or r.endswith('/constellations/definitions'):
            constellation_files = []  # only collect the constellations from the last directory found
            for fn in f:
                if r.endswith('/constellations') and fn == '__init__.py':
                    constellations_version = version_from_init(os.path.join(r, fn))
                elif (r.endswith('/constellations') or r.endswith('/constellations/definitions')) and fn.endswith('.json'):
                    constellation_files.append(os.path.join(r, fn))

    pangolin_data_version = pangolin_data.__version__
    use_datadir = False
    datadir_too_old = False
    if datadir:
        version = "Unknown"
        for r, d, f in os.walk(datadir):
            for fn in f:
                # pangolin-data/__init__.py not constellations/__init__.py:
                if r.endswith('data') and fn == "__init__.py":
                    version = version_from_init(os.path.join(r, fn))
                    if not version:
                        continue
                    if LooseVersion(version) >= LooseVersion(pangolin_data.__version__):
                        # only use this if the version is >= than what we already have
                        pangolin_data_version = version
                        use_datadir = True
                    else:
                        datadir_too_old = True
                        sys.stderr.write(cyan(f"Warning: Ignoring specified datadir {datadir} - it contains pangoLEARN model files older ({version}) than those installed ({pangolin_data.__version__})\n"))

    if not use_datadir:
        # we haven't got a viable datadir from searching args.datadir
        if datadir and not datadir_too_old:
            sys.stderr.write(cyan(
                f"Warning: Ignoring specified datadir {datadir} - could not find __init__.py file to check versions \n"))
        # fall back to the data bundled with the installed pangolin_data package
        pangolin_data_dir = pangolin_data.__path__[0]
        datadir = os.path.join(pangolin_data_dir,"data")

    config[KEY_PANGOLIN_DATA_VERSION] = pangolin_data_version
    config[KEY_CONSTELLATIONS_VERSION] = constellations_version
    config[KEY_DATADIR] = datadir
    config[KEY_CONSTELLATION_FILES] = constellation_files
# Standard-library imports plus guarded imports of the cov-lineages
# dependencies; a missing dependency aborts with an install hint.
import sys  # added: used below but was not imported in this block
import subprocess
import json
from tempfile import gettempdir
import tempfile
import pprint
import gzip
import lzma
import os

import joblib
from pangolin.utils.log_colours import green,cyan,red

# fixed: narrowed the bare `except:` clauses to `except ImportError:` so
# KeyboardInterrupt/SystemExit are not swallowed; removed duplicate
# `import json`.
try:
    import pangoLEARN
except ImportError:
    sys.stderr.write(cyan('Error: please install `pangoLEARN` with \n') + "pip install git+https://github.com/cov-lineages/pangoLEARN.git")
    sys.exit(-1)

try:
    import scorpio
except ImportError:
    sys.stderr.write(cyan('Error: please install `scorpio` with \n') + "pip install git+https://github.com/cov-lineages/scorpio.git")
    sys.exit(-1)

try:
    from pangoLEARN import PANGO_VERSION
except ImportError:
    sys.stderr.write(cyan('Error: please update to pangoLEARN version >= 2021-05-27\n'))
    sys.exit(-1)
def print_ram_warning(analysis_mode):
    """Warn on stdout about high RAM usage when the pangoLEARN analysis mode is selected; no-op otherwise."""
    if analysis_mode != "pangolearn":
        return
    print(cyan("Warning: pangoLEARN mode may use a significant amount of RAM, be aware that it will not suit every system."))