Пример #1
0
def set_up_tempdir(tempdir_arg, no_temp_arg, cwd, outdir, config):
    """Decide where intermediate files go and record it in config[KEY_TEMPDIR].

    Args:
        tempdir_arg: value of --tempdir (parent dir for the temp dir), or None.
        no_temp_arg: if truthy (--no-temp), intermediates go straight to outdir.
        cwd: directory used to resolve a relative tempdir_arg.
        outdir: the run's output directory.
        config: run configuration dict, updated in place.

    Exits with status -1 on any failure to create or write to the directory.
    """
    if no_temp_arg:
        tempdir = outdir
        config[KEY_TEMPDIR] = tempdir
        print(green(f"\n--no-temp: ") + f"all intermediate files will be written to {outdir}\n")
    elif tempdir_arg:
        to_be_dir = os.path.join(cwd, tempdir_arg)
        try:
            if not os.path.exists(to_be_dir):
                os.mkdir(to_be_dir)
        except OSError:
            # Narrowed from a bare `except`, which also swallowed
            # SystemExit/KeyboardInterrupt; mkdir failures raise OSError.
            sys.stderr.write(cyan(f'Error: cannot create temp directory {to_be_dir}.\n'))
            sys.exit(-1)
        tempdir = tempfile.mkdtemp(dir=to_be_dir)
        config[KEY_TEMPDIR] = tempdir
    else:
        tempdir = tempfile.mkdtemp()
        config[KEY_TEMPDIR] = tempdir
        try:
            # mkdtemp already created the directory; this is a belt-and-braces
            # re-check kept from the original logic.
            if not os.path.exists(tempdir):
                os.mkdir(tempdir)
        except OSError:
            sys.stderr.write(cyan(f'Error: cannot create temp directory {tempdir}.\n'))
            sys.exit(-1)

        # Probe that the temp dir is writable before the pipeline relies on it.
        try:
            with open(os.path.join(tempdir, "test.txt"), "w") as fw:
                fw.write("Test")
        except OSError:
            sys.stderr.write(cyan(f'Error: cannot write to temp directory {tempdir}.\n'))
            sys.exit(-1)
Пример #2
0
def check_dependencies(dependency_list, module_list, usher_arg, cache_arg):
    """Verify required external binaries and Python modules are installed.

    Args:
        dependency_list: names of external binaries to check.
        module_list: names of importable Python modules to check.
        usher_arg: if truthy, additionally require the `usher` binary.
        cache_arg: if truthy, additionally require `pangolin-assignment`.

    Exits with status -1 and a message on stderr if anything is missing;
    otherwise prints a confirmation.
    """
    missing = []

    # Work on a copy: the original appended to the caller's list, mutating
    # it as a side effect on every call.
    dependency_list = list(dependency_list)

    if usher_arg:
        dependency_list.append("usher")

    if cache_arg:
        dependency_list.append("pangolin-assignment")

    for dependency in dependency_list:
        check_this_dependency(dependency, missing)

    for module in module_list:
        check_module(module, missing)

    if missing:
        if len(missing) == 1:
            sys.stderr.write(
                cyan(f'Error: Missing dependency `{missing[0]}`.') +
                '\nPlease update your pangolin environment.\n')
            sys.exit(-1)
        else:
            dependencies = ""
            for i in missing:
                dependencies += f"\t- {i}\n"

            sys.stderr.write(
                cyan(f'Error: Missing dependencies.') +
                f'\n{dependencies}Please update your pangolin environment.\n')
            sys.exit(-1)
    else:
        print(green("All dependencies satisfied."))
Пример #3
0
def check_dependencies():
    """Verify that all required external tools and Python modules are present.

    Exits with status -1 and a message on stderr if anything is missing;
    otherwise prints a confirmation.
    """
    missing = []

    binaries = ["gofasta", "minimap2", "snakemake", "usher"]
    modules = [
        "Bio", "sklearn", "pandas", "joblib", "pysam", "pangoLEARN",
        "constellations"
    ]

    for binary in binaries:
        check_this_dependency(binary, missing)
    for module_name in modules:
        check_module(module_name, missing)

    if not missing:
        print(green("All dependencies satisfied."))
        return

    if len(missing) == 1:
        sys.stderr.write(
            cyan(f'Error: Missing dependency `{missing[0]}`.') +
            '\nPlease update your pangolin environment.\n')
        sys.exit(-1)

    dependencies = "".join(f"\t- {item}\n" for item in missing)
    sys.stderr.write(
        cyan(f'Error: Missing dependencies.') +
        f'\n{dependencies}Please update your pangolin environment.\n')
    sys.exit(-1)
Пример #4
0
def get_assignment_cache(cache_file, config):
    """Locate the precomputed assignment cache in the pangolin_assignment package.

    Args:
        cache_file: filename of the cache to look for inside the package.
        config: run configuration dict (read for pangolin-data version).

    Returns:
        Path to the cache file.

    Exits with an explanatory error if the package is not installed, the
    file is missing, its version does not match pangolin-data, or it is an
    unfetched Git LFS pointer file.
    """
    try:
        import pangolin_assignment
    except ImportError:
        # Narrowed from a bare `except`: the original wrapped the whole file
        # search in this try, so the SystemExit raised by the missing-file
        # branch below was caught here and replaced with this unrelated
        # "--add-assignment-cache required" message.
        sys.stderr.write(
            cyan(
                '\nError: "pangolin --add-assignment-cache" is required before '
                '"pangolin --use-assignment-cache", in order to install optional '
                'pangolin-assignment repository (that will make future data updates slower).\n'
            ))
        sys.exit(-1)

    # Take the first matching file found while walking the package directory.
    cache = ""
    pangolin_assignment_dir = pangolin_assignment.__path__[0]
    for r, d, f in os.walk(pangolin_assignment_dir):
        for fn in f:
            if fn == cache_file and cache == "":
                cache = os.path.join(r, fn)
    if not os.path.exists(cache):
        sys.stderr.write(
            cyan(
                f'Error: cannot find assignment cache file {cache_file} in pangolin_assignment\n'
            ))
        sys.exit(-1)

    # Check versions of pangolin-data and pangolin-assignment to make sure they are consistent.
    if pangolin_assignment.__version__.lstrip(
            'v') != config[KEY_PANGOLIN_DATA_VERSION].lstrip('v'):
        print(
            cyan(
                f'Error: pangolin_assignment cache version {pangolin_assignment.__version__} '
                f'does not match pangolin_data version {config[KEY_PANGOLIN_DATA_VERSION]}. '
                'Run "pangolin --update-data" to fetch latest versions of both.'
            ))
        sys.exit(-1)

    try:
        with gzip.open(cache, 'rt') as f:
            line = f.readline()
    except OSError:
        # Not readable as gzip (gzip.BadGzipFile is an OSError subclass):
        # check whether it is an unfetched Git LFS pointer file instead.
        with open(cache, 'r') as f:
            line = f.readline()
            if "git-lfs.github.com" in line:
                sys.stderr.write(
                    cyan(
                        'Error: Git LFS file not pulled successfully. Please install git-lfs \nusing conda or an alternative (not pip) then re-install pangolin-assignment \nwith pip install git+https://github.com/cov-lineages/pangolin-assignment.git\n'
                    ))
                sys.exit(-1)
    return cache
Пример #5
0
def get_snakefile(thisdir, analysis_mode):
    """Return the path of the snakemake workflow for the requested mode.

    The snakefile is named after the analysis mode (i.e. pangolearn, usher
    or preprocessing) and lives in the package's scripts directory. Exits
    with an error if it cannot be found.
    """
    snakefile = os.path.join(thisdir, 'scripts', f'{analysis_mode}.smk')
    if os.path.exists(snakefile):
        return snakefile
    sys.stderr.write(cyan(f'Error: cannot find Snakefile at {snakefile}. Check installation\n'))
    sys.exit(-1)
Пример #6
0
def get_latest_release(dependency):
    """
    Using the github releases API check for the latest release of dependency and its tarball.

    Returns a (tag_name, tarball_url) tuple for the newest release.
    """
    url = f"https://api.github.com/repos/cov-lineages/{dependency}/releases"
    try:
        response = request.urlopen(url)
    except Exception as e:
        # Gives a useful error message when run on clusters without external
        # connectivity, or when the GitHub API hourly limit has been hit.
        # This may also catch genuine bugs when version and release tags
        # diverge, so if it fires despite working connectivity, double check
        # the version labels.
        sys.stderr.write(
            cyan("Unable to connect to reach github API "
                 "--update/--data_update requires internet "
                 "connectivity so may not work on certain "
                 "systems or if your IP has exceeded the "
                 f"5,000 request per hour limit\n{e}\n"))
        sys.exit(-1)

    releases = json.load(response)
    newest = releases[0]
    return newest['tag_name'], newest['tarball_url']
Пример #7
0
def get_snakefile(thisdir):
    """Locate the pangolearn snakemake workflow under scripts/, exiting on failure."""
    snakefile = os.path.join(thisdir, 'scripts', 'pangolearn.smk')
    if not os.path.exists(snakefile):
        message = f'Error: cannot find Snakefile at {snakefile}\n Check installation\n'
        sys.stderr.write(cyan(message))
        sys.exit(-1)
    return snakefile
Пример #8
0
def set_up_outdir(outdir_arg, cwd, outdir):
    """Resolve and, if necessary, create the output directory.

    Args:
        outdir_arg: value of -o/--outdir, or None to keep the default.
        cwd: directory against which a relative outdir_arg is resolved.
        outdir: default output directory, returned unchanged when no
            outdir_arg was supplied.

    Returns:
        The output directory path to use.
    """
    if outdir_arg:
        outdir = os.path.join(cwd, outdir_arg)
        if not os.path.exists(outdir):
            try:
                os.mkdir(outdir)
            except OSError:
                # Narrowed from a bare `except`; also terminate the message
                # with a newline (it previously ran into the next output).
                sys.stderr.write(cyan(f'Error: cannot create directory:') + f"{outdir}\n")
                sys.exit(-1)
    return outdir
Пример #9
0
def check_datadir(datadir_arg):
    """Resolve the --datadir argument to an absolute path, or None if unset.

    The path is made absolute because it is later passed to scorpio. Exits
    with an error if the supplied directory does not exist.
    """
    if not datadir_arg:
        return None
    datadir = os.path.abspath(datadir_arg)
    if not os.path.exists(datadir):
        sys.stderr.write(cyan(f"Cannot find data directory specified: {datadir}\n"))
        sys.exit(-1)
    return datadir
Пример #10
0
def package_data_check(filename, directory, key, config):
    """Locate *filename* in the pangolin package data *directory* and store
    its resolved path under config[key].

    Exits with an error naming the missing file if it cannot be resolved.
    """
    try:
        package_datafile = os.path.join(directory, filename)
        data = pkg_resources.resource_filename('pangolin', package_datafile)
        config[key] = data
    except Exception:
        # Narrowed from a bare `except` (which also caught SystemExit /
        # KeyboardInterrupt). The message now names the missing file
        # instead of the unhelpful "(unknown)" placeholder.
        sys.stderr.write(
            cyan(f'Error: Missing package data.') +
            f'\n\t- {filename}\nPlease install the latest pangolin version with `pangolin --update`.\n'
        )
        sys.exit(-1)
Пример #11
0
def find_query_file(cwd, tempdir, query_arg):
    """Resolve the query fasta: a file path on disk or data piped on stdin.

    Args:
        cwd: directory against which a relative query path is resolved.
        tempdir: temp directory; stdin input is spooled to a file here.
        query_arg: list of positional query arguments (must be exactly one).

    Returns:
        Path to the query fasta file to analyse.

    Exits with an error if multiple files are given, the path does not
    exist, or nothing usable arrives on stdin.
    """
    if len(query_arg) > 1:
        print(cyan(f"Error: Too many query (input) fasta files supplied: {query_arg}\nPlease supply one only."))
        sys.exit(-1)

    # find the query fasta
    try:
        if not os.path.exists(os.path.join(cwd, query_arg[0])):
            # Non-blocking poll (timeout 0.0): is there data waiting on stdin?
            if select.select([sys.stdin,],[],[],0.0)[0]:
                # Spool stdin to a file in tempdir so downstream tools can re-read it.
                query = os.path.join(tempdir, "stdin_query.fasta")
                with open(query,"w") as fw:
                    for l in sys.stdin:
                        l= l.rstrip("\n")
                        fw.write(l + '\n')
                
                print(green("Query:\t") + "reading from stdin.")
            elif not select.select([sys.stdin,],[],[],0.0)[0]:
                tried_path = os.path.join(cwd, query_arg[0])
                if tried_path.endswith("-"):
                    # "-" conventionally means "read stdin", but nothing was piped in.
                    sys.stderr.write(cyan(
                        f'Error: cannot find query (input) fasta file using stdin.\n' +
                                    'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html' +
                                    ' for detailed instructions.\n'))
                    sys.exit(-1)
                else:
                    sys.stderr.write(cyan(f'Error: cannot find query (input) fasta file at:') + f'{tried_path}\n' +
                                    'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html' +
                                    ' for detailed instructions.\n')
                    sys.exit(-1)
        else:
            query = os.path.join(cwd, query_arg[0])
            print(green(f"Query file:\t") + f"{query}")
    except IndexError:
        # query_arg was empty, so query_arg[0] above raised.
        sys.stderr.write(cyan(
            f'Error: input query fasta could not be detected from a filepath or through stdin.\n' +
            'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html' +
            ' for detailed instructions.\n'))
        sys.exit(-1)

    return query
Пример #12
0
def git_lfs_install():
    """
    'git-lfs install' must be run after installing git-lfs and before cloning a repo
    that uses Git LFS.

    Exits with status -1 if git-lfs is missing or the command fails.
    """
    try:
        subprocess.run(['git-lfs', 'install'],
                       check=True,
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)
    except FileNotFoundError:
        # git-lfs is not on PATH at all; previously this raised an
        # unhandled traceback instead of a clean error message.
        sys.stderr.write(cyan('Error: "git-lfs install" failed: git-lfs executable not found\n'))
        sys.exit(-1)
    except CalledProcessError as e:
        sys.stderr.write(cyan(f'Error: "git-lfs install" failed: {e}\n'))
        sys.exit(-1)
Пример #13
0
def find_designation_cache_and_alias(datadir, designation_cache_file,
                                     alias_file):
    """Walk *datadir* looking for the designation cache and alias files.

    Returns:
        A (designation_cache_path, alias_path) tuple.

    Exits with an error if either file is absent from the directory tree.
    """
    designation_cache = ""
    alias = ""
    for root, _, filenames in os.walk(datadir):
        for name in filenames:
            if name == designation_cache_file:
                designation_cache = os.path.join(root, name)
            elif name == alias_file:
                alias = os.path.join(root, name)

    if not designation_cache:
        sys.stderr.write(
            cyan(
                f'Error: Missing designation cache file. Either supply a datadir with a {designation_cache_file} file, or specify `--skip-designation-cache`\n'
            ))
        sys.exit(-1)
    if not alias:
        sys.stderr.write(
            cyan(
                f'Error: Missing alias file. Please supply a datadir with a {alias_file} file or check installation of pangolin-data dependency.\n'
            ))
        sys.exit(-1)
    return designation_cache, alias
Пример #14
0
def get_datafiles(datadir, file_dict, config):
    """Find required data files under *datadir* and record them in config.

    Args:
        datadir: directory tree to search.
        file_dict: maps filename -> config key it should be stored under.
        config: run configuration dict, updated in place.

    Exits with an error if any file in file_dict cannot be found.
    """
    datafiles = {}
    for r, d, f in os.walk(datadir):
        for fn in f:
            if fn in file_dict:
                datafiles[file_dict[fn]] = os.path.join(r, fn)

    # Record the found files first; the completeness check below reads config.
    for key in datafiles:
        config[key] = datafiles[key]

    for fn in file_dict:
        if file_dict[fn] not in config:
            sys.stderr.write(
                cyan(
                    f'Error: Cannot find {fn} in datadir. Please supply a datadir with required files or specify an alternative analysis mode.\nPlease see https://cov-lineages.org/pangolin.html for full installation and updating instructions.'
                ))
            sys.exit(-1)

    print(green("****\nData files found:"))
    for key in datafiles:
        # config[key] was already assigned above; the original re-assigned it
        # here redundantly.
        print(f"{key}:\t{datafiles[key]}")
    print(green("****"))
Пример #15
0
def set_up_analysis_mode(analysis_arg, default_mode):
    """
    Resolve the --analysis-mode flag to one of the two engine names.

    - starts from the default mode held in the config dict (accurate)
    - 'usher'/'accurate' map to usher, 'pangolearn'/'fast' map to pangolearn
    - exits with an error if an unrecognised mode string was supplied
    - returns the resolved analysis mode
    """
    if not analysis_arg:
        return default_mode

    if analysis_arg not in ["usher", "pangolearn", "fast", "accurate"]:
        sys.stderr.write(cyan(f"Invalid `--analysis-mode` option specified: please select one of `fast`,`accurate`,`pangolearn` or`usher`\n"))
        sys.exit(-1)

    return "pangolearn" if analysis_arg in ['pangolearn', 'fast'] else "usher"
Пример #16
0
def quick_check_query_file(cwd, query_arg, query):
    """Sanity-check that the query file is parseable fasta.

    Transparently opens gzip/xz-compressed files by extension, then probes
    the file with Bio.SeqIO; exits with an error if the content cannot be
    decoded (e.g. compressed data arrived on stdin).
    """

    if os.path.exists(os.path.join(cwd, query_arg[0])):
        file_ending = query.split(".")[-1]
        if file_ending in ["gz","gzip","tgz"]:
            query = gzip.open(query, 'rt')
        elif file_ending in ["xz","lzma"]:
            query = lzma.open(query, 'rt')
    try:
        parse= True
        c = 0
        
        # NOTE(review): the flag flips to False on the first record, so this
        # loop stops after at most two records — presumably just probing that
        # the file parses rather than reading it all; confirm this is intended.
        # The handle opened above is also never explicitly closed here.
        for record in SeqIO.parse(query, "fasta"):
            if parse == False:
                break
            parse = False
    except UnicodeDecodeError:
        sys.stderr.write(cyan(
            f'Error: the input query fasta could not be parsed.\n' +
            'Double check your query fasta and that compressed stdin was not passed.\n' +
            'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html' +
            ' for detailed instructions.\n'))
        sys.exit(-1)
Пример #17
0
def main(sysargs=sys.argv[1:]):
    """Command line entry point for pangolin (v4).

    Parses arguments, initialises the run config, resolves data/output/temp
    locations, runs the preprocessing snakemake workflow followed by the
    selected inference workflow (usher or pangolearn), and collates the
    final lineage report. Returns 0 on success and 1 on failure; several
    option paths (--update, --aliases, --all-versions, ...) exit directly.
    """
    parser = argparse.ArgumentParser(
        prog=_program,
        description=
        'pangolin: Phylogenetic Assignment of Named Global Outbreak LINeages',
        usage='''pangolin <query> [options]''')

    io_group = parser.add_argument_group('Input-Output options')
    io_group.add_argument('query',
                          nargs="*",
                          help='Query fasta file of sequences to analyse.')
    io_group.add_argument(
        '-o',
        '--outdir',
        action="store",
        help="Output directory. Default: current working directory")
    io_group.add_argument(
        '--outfile',
        action="store",
        help="Optional output file name. Default: lineage_report.csv")
    io_group.add_argument(
        '--tempdir',
        action="store",
        help="Specify where you want the temp stuff to go. Default: $TMPDIR")
    io_group.add_argument(
        "--no-temp",
        action="store_true",
        help="Output all intermediate files, for dev purposes.")
    io_group.add_argument('--alignment',
                          action="store_true",
                          help="Output multiple sequence alignment.")
    io_group.add_argument('--alignment-file',
                          action="store",
                          help="Multiple sequence alignment file name.")
    io_group.add_argument(
        '--expanded-lineage',
        action="store_true",
        default=False,
        help="Optional expanded lineage from alias.json in report.")

    a_group = parser.add_argument_group('Analysis options')
    a_group.add_argument(
        '--analysis-mode',
        action="store",
        help=
        "Specify which inference engine to use. Options: accurate (UShER), fast (pangoLEARN), pangolearn, usher. Default: UShER inference."
    )

    a_group.add_argument(
        "--skip-designation-cache",
        action='store_true',
        default=False,
        help=
        "Developer option - do not use designation cache to assign lineages.",
        dest="skip_designation_cache")
    a_group.add_argument(
        "--skip-scorpio",
        action='store_true',
        default=False,
        help=
        "Developer option - do not use scorpio to check VOC/VUI lineage assignments.",
        dest="skip_scorpio")

    a_group.add_argument(
        '--max-ambig',
        action="store",
        default=0.3,
        type=float,
        help=
        "Maximum proportion of Ns allowed for pangolin to attempt assignment. Default: 0.3",
        dest="maxambig")
    a_group.add_argument(
        '--min-length',
        action="store",
        default=25000,
        type=int,
        help=
        "Minimum query length allowed for pangolin to attempt assignment. Default: 25000",
        dest="minlen")
    # --usher is hidden (SUPPRESS): accepted for v3 compatibility, warned on below.
    a_group.add_argument('--usher',
                         action='store_true',
                         default=False,
                         help=argparse.SUPPRESS)

    d_group = parser.add_argument_group('Data options')
    d_group.add_argument(
        "--update",
        action='store_true',
        default=False,
        help=
        "Automatically updates to latest release of pangolin, pangolin-data, scorpio and constellations (and pangolin-assignment if it has been installed using --add-assignment-cache), then exits."
    )
    d_group.add_argument(
        "--update-data",
        action='store_true',
        dest="update_data",
        default=False,
        help=
        "Automatically updates to latest release of constellations and pangolin-data, including the pangoLEARN model, UShER tree file and alias file (also pangolin-assignment if it has been installed using --add-assignment-cache), then exits."
    )
    d_group.add_argument(
        '--add-assignment-cache',
        action='store_true',
        dest="add_assignment_cache",
        default=False,
        help=
        "Install the pangolin-assignment repository for use with --use-assignment-cache.  This makes updates slower and makes pangolin slower for small numbers of input sequences but much faster for large numbers of input sequences."
    )
    d_group.add_argument(
        '--use-assignment-cache',
        action='store_true',
        dest="use_assignment_cache",
        default=False,
        help=
        "Use assignment cache from optional pangolin-assignment repository. NOTE: the repository must be installed by --add-assignment-cache before using --use-assignment-cache."
    )
    d_group.add_argument(
        '-d',
        '--datadir',
        action='store',
        dest="datadir",
        help=
        "Data directory minimally containing the pangoLEARN model, header files and UShER tree. Default: Installed pangolin-data package."
    )
    d_group.add_argument(
        '--usher-tree',
        action='store',
        dest='usher_protobuf',
        help=
        "UShER Mutation Annotated Tree protobuf file to use instead of default from pangolin-data repository or --datadir."
    )
    d_group.add_argument(
        '--assignment-cache',
        action='store',
        dest='assignment_cache',
        help=
        "Cached precomputed assignment file to use instead of default from pangolin-assignment repository.  Does not require installation of pangolin-assignment."
    )

    m_group = parser.add_argument_group('Misc options')
    m_group.add_argument("--aliases",
                         action='store_true',
                         default=False,
                         help="Print Pango alias_key.json and exit.")
    m_group.add_argument("-v",
                         "--version",
                         action='version',
                         version=f"pangolin {__version__}")
    m_group.add_argument(
        "-pv",
        "--pangolin-data-version",
        action='version',
        version=f"pangolin-data {pangolin_data.__version__}",
        help=
        "show version number of pangolin data files (UShER tree and pangoLEARN model files) and exit."
    )
    m_group.add_argument(
        "--all-versions",
        action='store_true',
        dest="all_versions",
        default=False,
        help="Print all tool, dependency, and data versions then exit.")
    m_group.add_argument("--verbose",
                         action="store_true",
                         help="Print lots of stuff to screen")
    m_group.add_argument("-t",
                         "--threads",
                         action="store",
                         default=1,
                         type=int,
                         help="Number of threads")

    # No arguments at all: show help and exit non-zero.
    if len(sysargs) < 1:
        parser.print_help()
        sys.exit(-1)
    else:
        args = parser.parse_args(sysargs)

    # Initialise config dict
    config = setup_config_dict(cwd)
    data_checks.check_install(config)
    set_up_verbosity(config)

    if args.usher:
        sys.stderr.write(
            cyan(
                f"--usher is a pangolin v3 option and is deprecated in pangolin v4.  UShER is now the default analysis mode.  Use --analysis-mode to explicitly set mode.\n"
            ))

    setup_data(args.datadir, config[KEY_ANALYSIS_MODE], config)

    if args.add_assignment_cache:
        update.install_pangolin_assignment()

    # --update / --update-data both call update.update, which exits the process.
    if args.update:
        version_dictionary = {
            'pangolin': __version__,
            'pangolin-data': config[KEY_PANGOLIN_DATA_VERSION],
            'constellations': config[KEY_CONSTELLATIONS_VERSION],
            'scorpio': config[KEY_SCORPIO_VERSION]
        }
        update.add_pangolin_assignment_if_installed(version_dictionary)
        update.update(version_dictionary)

    if args.update_data:
        version_dictionary = {
            'pangolin-data': config[KEY_PANGOLIN_DATA_VERSION],
            'constellations': config[KEY_CONSTELLATIONS_VERSION]
        }
        update.add_pangolin_assignment_if_installed(version_dictionary)
        update.update(version_dictionary, args.datadir)

    # install_pangolin_assignment doesn't exit so that --update/--update-data can be given at the
    # same time (or a query file).  If --add-assignment-cache is the only arg, exit without error.
    if args.add_assignment_cache and not args.query:
        sys.exit(0)

    # add flag to config for whether to run scorpio
    if args.skip_scorpio:
        print(green(f"****\nPangolin skipping scorpio steps.\n****"))
        config[KEY_SKIP_SCORPIO] = True

    if args.expanded_lineage:
        print(green(f"****\nAdding expanded lineage column to output.\n****"))
        config[KEY_EXPANDED_LINEAGE] = True

    # Parsing analysis mode flags to return one of 'usher' or 'pangolearn'
    config[KEY_ANALYSIS_MODE] = set_up_analysis_mode(args.analysis_mode,
                                                     config[KEY_ANALYSIS_MODE])

    snakefile = get_snakefile(thisdir, config[KEY_ANALYSIS_MODE])

    config[KEY_DESIGNATION_CACHE], config[
        KEY_ALIAS_FILE] = data_checks.find_designation_cache_and_alias(
            config[KEY_DATADIR], DESIGNATION_CACHE_FILE, ALIAS_FILE)
    # Both of these print-and-exit helpers terminate the process.
    if args.aliases:
        print_alias_file_exit(config[KEY_ALIAS_FILE])

    if args.all_versions:
        print_versions_exit(config)

    # to enable not having to pass a query if running update
    # by allowing query to accept 0 to many arguments

    print(
        green(
            f"****\nPangolin running in {config[KEY_ANALYSIS_MODE]} mode.\n****"
        ))
    print_ram_warning(config[KEY_ANALYSIS_MODE])

    #   setup outdir and outfiles
    config[KEY_OUTDIR] = io.set_up_outdir(args.outdir, cwd, config[KEY_OUTDIR])
    config[KEY_OUTFILE] = io.set_up_outfile(args.outfile, config[KEY_OUTFILE],
                                            config[KEY_OUTDIR])
    io.set_up_tempdir(args.tempdir, args.no_temp, cwd, config[KEY_OUTDIR],
                      config)
    config[KEY_ALIGNMENT_FILE], config[
        KEY_ALIGNMENT_OUT] = io.parse_alignment_options(
            args.alignment, config[KEY_OUTDIR], config[KEY_TEMPDIR],
            args.alignment_file, config[KEY_ALIGNMENT_FILE])

    config[KEY_QUERY_FASTA] = io.find_query_file(cwd, config[KEY_TEMPDIR],
                                                 args.query)

    io.quick_check_query_file(cwd, args.query, config[KEY_QUERY_FASTA])

    # Mode-specific data file resolution.
    if config[KEY_ANALYSIS_MODE] == "usher":
        # Find usher protobuf file (and if specified, assignment cache file too)
        data_checks.get_datafiles(config[KEY_DATADIR], usher_files, config)
        if args.usher_protobuf:
            config[KEY_USHER_PB] = data_checks.check_file_arg(
                args.usher_protobuf, cwd, '--usher-tree')
            print(green(f"Using usher tree file {args.usher_protobuf}"))
        if args.assignment_cache:
            config[KEY_ASSIGNMENT_CACHE] = data_checks.check_file_arg(
                args.assignment_cache, cwd, '--assignment-cache')
            print(
                green(f"Using assignment cache file {args.assignment_cache}"))
        elif args.use_assignment_cache:
            config[KEY_ASSIGNMENT_CACHE] = data_checks.get_assignment_cache(
                USHER_ASSIGNMENT_CACHE_FILE, config)
            print(green("Using pangolin-assignment cache"))
        else:
            config[KEY_ASSIGNMENT_CACHE] = ""

    elif config[KEY_ANALYSIS_MODE] == "pangolearn":
        # find designation cache and the model files
        data_checks.get_datafiles(config[KEY_DATADIR], pangolearn_files,
                                  config)
        if args.use_assignment_cache or args.assignment_cache:
            sys.stderr.write(
                cyan(
                    f"Warning: --use-assignment-cache and --assignment-cache are ignored when --analysis-mode is 'fast' or 'pangolearn'.\n"
                ))

    # First snakemake pass: the preprocessing workflow.
    preprocessing_snakefile = get_snakefile(thisdir, "preprocessing")

    if args.verbose:
        print(green("\n**** CONFIG ****"))
        for k in sorted(config):
            print(green(k), config[k])

        status = snakemake.snakemake(preprocessing_snakefile,
                                     printshellcmds=True,
                                     forceall=True,
                                     force_incomplete=True,
                                     workdir=config[KEY_TEMPDIR],
                                     config=config,
                                     cores=args.threads,
                                     lock=False)
    else:
        logger = custom_logger.Logger()
        status = snakemake.snakemake(preprocessing_snakefile,
                                     printshellcmds=False,
                                     forceall=True,
                                     force_incomplete=True,
                                     workdir=config[KEY_TEMPDIR],
                                     config=config,
                                     cores=args.threads,
                                     lock=False,
                                     quiet=True,
                                     log_handler=logger.log_handler)
    if status:  # translate "success" into shell exit code of 0

        # Second snakemake pass: the selected inference workflow.
        if config[KEY_VERBOSE]:
            print(green("\n**** CONFIG ****"))
            for k in sorted(config):
                print(green(k), config[k])

            status = snakemake.snakemake(snakefile,
                                         printshellcmds=True,
                                         forceall=True,
                                         force_incomplete=True,
                                         workdir=config[KEY_TEMPDIR],
                                         config=config,
                                         cores=args.threads,
                                         lock=False)
        else:
            logger = custom_logger.Logger()
            status = snakemake.snakemake(snakefile,
                                         printshellcmds=False,
                                         forceall=True,
                                         force_incomplete=True,
                                         workdir=config[KEY_TEMPDIR],
                                         config=config,
                                         cores=args.threads,
                                         lock=False,
                                         quiet=True,
                                         log_handler=logger.log_handler)

        if status:

            ## Collate the report here

            preprocessing_csv = os.path.join(config[KEY_TEMPDIR],
                                             "preprocessing.csv")
            inference_csv = os.path.join(config[KEY_TEMPDIR],
                                         "inference_report.csv")
            cached_csv = os.path.join(config[KEY_TEMPDIR],
                                      "cache_assigned.csv")
            constellation_list = get_voc_list(
                os.path.join(config[KEY_TEMPDIR], "get_constellations.txt"),
                config[KEY_ALIAS_FILE])

            generate_final_report(preprocessing_csv, inference_csv, cached_csv,
                                  config[KEY_ALIAS_FILE], constellation_list,
                                  config[KEY_PANGOLIN_DATA_VERSION],
                                  config[KEY_ANALYSIS_MODE],
                                  args.skip_designation_cache,
                                  config[KEY_OUTFILE], config)

            print(
                green(f"****\nOutput file written to: ") + config[KEY_OUTFILE])

            if config[KEY_ALIGNMENT_OUT]:
                print(
                    green(f"****\nOutput alignment written to: ") +
                    config[KEY_ALIGNMENT_FILE])

            return 0

        return 1
    return 1
Пример #18
0
import os
import sys
import argparse

# Colour helpers are imported before the dependency checks below so the
# error paths can actually call cyan(); previously cyan was imported after
# its first use, which would have raised NameError on import failure.
from pangolin.utils.log_colours import green, cyan
# NOTE(review): data_checks is used by install_error below but had no
# visible import in this chunk; assumed to live in pangolin.utils — confirm.
from pangolin.utils import data_checks

try:
    import constellations
except ImportError:
    data_checks.install_error(
        "constellations", "https://github.com/cov-lineages/constellations.git")

try:
    import snakemake
except ImportError:
    # Was an f-string interpolating `snakemake` — unbound when the import
    # fails, so the handler itself raised NameError instead of this message.
    sys.stderr.write(
        cyan(
            'Error: package `snakemake` not found, please install snakemake or update pangolin environment.\n'
        ))
    sys.exit(-1)

from pangolin.utils import dependency_checks

from pangolin.utils import update

from pangolin.utils.config import *
from pangolin.utils.initialising import *
import pangolin.utils.io_parsing as io

from pangolin.utils.report_collation import generate_final_report, get_voc_list

thisdir = os.path.abspath(os.path.dirname(__file__))
Пример #19
0
def main(sysargs=sys.argv[1:]):
    """Entry point for the pangolin command-line interface.

    Parses command-line options, runs per-sequence QC on the query fasta,
    locates the pangoLEARN/UShER data files and hands assignment off to
    the snakemake pipeline.

    sysargs: list of command-line tokens (defaults to sys.argv[1:]) so the
             function can be driven programmatically.
    Returns 0 if the snakemake pipeline succeeds, 1 otherwise; exits early
    via sys.exit() for usage errors, --aliases, --decompress-model, or when
    no sequences pass QC.
    """

    parser = argparse.ArgumentParser(
        prog=_program,
        description=
        'pangolin: Phylogenetic Assignment of Named Global Outbreak LINeages',
        usage='''pangolin <query> [options]''')

    parser.add_argument('query',
                        nargs="*",
                        help='Query fasta file of sequences to analyse.')
    parser.add_argument('--alignment',
                        action="store_true",
                        help="Optional alignment output.")
    parser.add_argument('--usher',
                        action="store_true",
                        help="Use UShER model instead of default pangoLEARN")
    parser.add_argument(
        '--usher-tree',
        action='store',
        dest='usher_protobuf',
        help=
        "UShER Mutation Annotated Tree protobuf file to use instead of --usher default from pangoLEARN repository or --datadir"
    )
    parser.add_argument(
        '--max-ambig',
        action="store",
        default=0.3,
        type=float,
        help=
        "Maximum proportion of Ns allowed for pangolin to attempt assignment. Default: 0.3",
        dest="maxambig")
    parser.add_argument(
        '--min-length',
        action="store",
        default=25000,
        type=int,
        help=
        "Minimum query length allowed for pangolin to attempt assignment. Default: 25000",
        dest="minlen")
    parser.add_argument(
        '-o',
        '--outdir',
        action="store",
        help="Output directory. Default: current working directory")
    parser.add_argument(
        '--outfile',
        action="store",
        help="Optional output file name. Default: lineage_report.csv")
    parser.add_argument(
        '--tempdir',
        action="store",
        help="Specify where you want the temp stuff to go. Default: $TMPDIR")
    parser.add_argument(
        "--no-temp",
        action="store_true",
        help="Output all intermediate files, for dev purposes.")
    parser.add_argument(
        '-d',
        '--datadir',
        action='store',
        dest="datadir",
        help=
        "Data directory minimally containing a fasta alignment and guide tree")
    parser.add_argument(
        '--decompress-model',
        action="store_true",
        dest="decompress",
        help=
        "Permanently decompress the model file to save time running pangolin.")
    parser.add_argument("--verbose",
                        action="store_true",
                        help="Print lots of stuff to screen")
    parser.add_argument("-t",
                        "--threads",
                        action="store",
                        default=1,
                        type=int,
                        help="Number of threads")
    parser.add_argument("-v",
                        "--version",
                        action='version',
                        version=f"pangolin {__version__}")
    parser.add_argument("-pv",
                        "--pangoLEARN-version",
                        action='version',
                        version=f"pangoLEARN {pangoLEARN.__version__}",
                        help="show pangoLEARN's version number and exit")
    parser.add_argument(
        "-dv",
        "--pango-designation-version",
        action='version',
        version=
        f"pango-designation {PANGO_VERSION} used for pangoLEARN and UShER training",
        help="show pango-designation version number used for training and exit"
    )
    parser.add_argument("--aliases",
                        action='store_true',
                        default=False,
                        help="print pango-designation alias_key.json and exit")
    parser.add_argument(
        "--update",
        action='store_true',
        default=False,
        help=
        "Automatically updates to latest release of pangolin, pangoLEARN and constellations, then exits"
    )
    parser.add_argument(
        "--update-data",
        action='store_true',
        dest="update_data",
        default=False,
        help=
        "Automatically updates to latest release of pangoLEARN and constellations, then exits"
    )

    if len(sysargs) < 1:
        parser.print_help()
        sys.exit(-1)
    else:
        args = parser.parse_args(sysargs)
    # Fix: the original re-parsed here with `args = parser.parse_args()`,
    # which read sys.argv directly and silently discarded the sysargs
    # parameter; that redundant second parse has been removed.

    if args.update:
        update({
            'pangolin': __version__,
            'pangolearn': pangoLEARN.__version__,
            'constellations': constellations.__version__,
            'scorpio': scorpio.__version__,
            'pango-designation': pango_designation.__version__
        })

    if args.update_data:
        update({
            'pangolearn': pangoLEARN.__version__,
            'constellations': constellations.__version__,
            'pango-designation': pango_designation.__version__
        })

    # locate pango-designation's alias_key.json inside the installed package
    alias_file = None
    pango_designation_dir = pango_designation.__path__[0]
    for r, d, f in os.walk(pango_designation_dir):
        for fn in f:
            if fn == "alias_key.json":
                alias_file = os.path.join(r, fn)
    if not alias_file:
        sys.stderr.write(
            cyan(
                'Could not find alias file: please update pango-designation with \n'
            ) +
            "pip install git+https://github.com/cov-lineages/pango-designation.git"
        )
        sys.exit(-1)

    if args.aliases:
        with open(alias_file, 'r') as handle:
            for line in handle:
                print(line.rstrip())
        sys.exit(0)

    dependency_checks.check_dependencies()

    # to enable not having to pass a query if running update
    # by allowing query to accept 0 to many arguments
    if len(args.query) > 1:
        print(
            cyan(
                f"Error: Too many query (input) fasta files supplied: {args.query}\nPlease supply one only"
            ))
        parser.print_help()
        sys.exit(-1)
    elif len(args.query) == 0:
        # Fix: the original indexed args.query[0] unconditionally, raising a
        # bare IndexError when no query file was supplied; exit cleanly instead.
        print(cyan("Error: no query (input) fasta file supplied.\nPlease supply one"))
        parser.print_help()
        sys.exit(-1)
    else:
        # find the query fasta
        query = os.path.join(cwd, args.query[0])
        if not os.path.exists(query):
            sys.stderr.write(
                cyan(f'Error: cannot find query (input) fasta file at:') +
                f'{query}\n' +
                'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html'
                + ' for detailed instructions.\n')
            sys.exit(-1)
        else:
            print(green(f"The query file is:") + f"{query}")

        # default output dir

    if args.outdir:
        outdir = os.path.join(cwd, args.outdir)
        if not os.path.exists(outdir):
            try:
                os.mkdir(outdir)
            except:
                sys.stderr.write(
                    cyan(f'Error: cannot create directory:') + f"{outdir}")
                sys.exit(-1)
    else:
        outdir = cwd

    if args.outfile:
        outfile = os.path.join(outdir, args.outfile)
    else:
        outfile = os.path.join(outdir, "lineage_report.csv")

    # the TemporaryDirectory object is kept in a local so it is not cleaned
    # up (garbage-collected) until main() returns
    if args.tempdir:
        to_be_dir = os.path.join(cwd, args.tempdir)
        if not os.path.exists(to_be_dir):
            os.mkdir(to_be_dir)
        temporary_directory = tempfile.TemporaryDirectory(suffix=None,
                                                          prefix=None,
                                                          dir=to_be_dir)
        tempdir = temporary_directory.name
    else:
        temporary_directory = tempfile.TemporaryDirectory(suffix=None,
                                                          prefix=None,
                                                          dir=None)
        tempdir = temporary_directory.name

    if args.no_temp:
        print(
            green(f"\n--no-temp: ") +
            f"all intermediate files will be written to {outdir}\n")
        tempdir = outdir

    if args.alignment:
        align_dir = outdir
        alignment_out = True
    else:
        align_dir = tempdir
        alignment_out = False
    """
    QC steps:
    1) check no empty seqs
    2) check N content
    3) write a file that contains just the seqs to run
    """

    do_not_run = []
    run = []
    total_input = 0
    print(green("** Sequence QC **"))
    fmt = "{:<30}\t{:>25}\t{:<10}\n"

    print("{:<30}\t{:>25}\t{:<10}\n".format("Sequence name", "Reason",
                                            "Value"))

    # transparently support gzip/xz compressed query files
    file_ending = query.split(".")[-1]
    if file_ending in ["gz", "gzip", "tgz"]:
        query = gzip.open(query, 'rt')
    elif file_ending in ["xz", "lzma"]:
        query = lzma.open(query, 'rt')

    for record in SeqIO.parse(query, "fasta"):
        total_input += 1
        # replace spaces in sequence headers with underscores
        record.description = record.description.replace(' ', '_')
        record.id = record.description
        # commas would break the output csv
        if "," in record.id:
            record.id = record.id.replace(",", "_")

        if len(record) < args.minlen:
            record.description = record.description + f" fail=seq_len:{len(record)}"
            do_not_run.append(record)
            print(fmt.format(record.id, "Seq too short", len(record)))
        else:
            num_N = str(record.seq).upper().count("N")
            prop_N = round((num_N) / len(record.seq), 2)
            if prop_N > args.maxambig:
                record.description = record.description + f" fail=N_content:{prop_N}"
                do_not_run.append(record)
                print(fmt.format(record.id, "N content too high", prop_N))
            else:
                run.append(record)

    print(green("\nNumber of sequences detected: ") + f"{total_input}")
    print(green("Total passing QC: ") + f"{len(run)}")

    # nothing to assign: still write a report with a fail row per sequence
    if run == []:
        with open(outfile, "w") as fw:
            fw.write(
                "taxon,lineage,conflict,ambiguity_score,scorpio_call,scorpio_support,scorpio_conflict,version,pangolin_version,pangoLEARN_version,pango_version,status,note\n"
            )
            for record in do_not_run:
                desc = record.description.split(" ")
                reason = ""
                for item in desc:
                    if item.startswith("fail="):
                        reason = item.split("=")[1]
                fw.write(
                    f"{record.id},None,,,,,,PANGO-{PANGO_VERSION},{__version__},{pangoLEARN.__version__},{PANGO_VERSION},fail,{reason}\n"
                )
        print(cyan(f'Note: no query sequences have passed the qc\n'))
        sys.exit(0)

    post_qc_query = os.path.join(tempdir, 'query.post_qc.fasta')
    with open(post_qc_query, "w") as fw:
        SeqIO.write(run, fw, "fasta")
    qc_fail = os.path.join(tempdir, 'query.failed_qc.fasta')
    with open(qc_fail, "w") as fw:
        SeqIO.write(do_not_run, fw, "fasta")

    config = {
        "query_fasta": post_qc_query,
        "outdir": outdir,
        "outfile": outfile,
        "tempdir": tempdir,
        "aligndir": align_dir,
        "alignment_out": alignment_out,
        "trim_start": 265,  # where to pad to using datafunk
        "trim_end": 29674,  # where to pad after using datafunk
        "qc_fail": qc_fail,
        "alias_file": alias_file,
        "verbose": args.verbose,
        "pangoLEARN_version": pangoLEARN.__version__,
        "pangolin_version": __version__,
        "pango_version": PANGO_VERSION,
        "threads": args.threads
    }

    data_install_checks.check_install(config)
    snakefile = data_install_checks.get_snakefile(thisdir)

    dependency_checks.set_up_verbosity(config)

    # find the data: a user-supplied datadir overrides the installed package
    if args.datadir:
        data_dir = os.path.join(cwd, args.datadir)
        version = "Unknown"
        for r, d, f in os.walk(data_dir):
            for fn in f:
                if fn == "__init__.py":
                    print("Found __init__.py")
                    with open(os.path.join(r, fn), "r") as fr:
                        for l in fr:
                            if l.startswith("__version__"):
                                l = l.rstrip("\n")
                                version = l.split('=')[1]
                                version = version.replace('"',
                                                          "").replace(" ", "")
                                print("pangoLEARN version", version)
        config["pangoLEARN_version"] = version

    else:
        pangoLEARN_dir = pangoLEARN.__path__[0]
        data_dir = os.path.join(pangoLEARN_dir, "data")

    trained_model = ""
    header_file = ""
    designated_hash = ""
    use_usher = args.usher
    if args.usher_protobuf:
        usher_protobuf = os.path.join(cwd, args.usher_protobuf)
        if not os.path.exists(usher_protobuf):
            sys.stderr.write(
                'Error: cannot find --usher-tree file at {}\n'.format(
                    usher_protobuf))
            sys.exit(-1)
        use_usher = True
    else:
        usher_protobuf = ""

    for r, d, f in os.walk(data_dir):
        for fn in f:
            if fn == "decisionTreeHeaders_v1.joblib":
                header_file = os.path.join(r, fn)
            elif fn == "decisionTree_v1.joblib":
                trained_model = os.path.join(r, fn)
            elif fn == "lineages.hash.csv":
                designated_hash = os.path.join(r, fn)
            elif fn == "lineageTree.pb" and usher_protobuf == "":
                usher_protobuf = os.path.join(r, fn)
    if ((use_usher and (usher_protobuf == "" or designated_hash == "")
         or (not use_usher and (trained_model == "" or header_file == ""
                                or designated_hash == "")))):
        print(
            cyan("""pangoLEARN version should be >= 2021-05-27. \n
Appropriate data files not found from the installed pangoLEARN repo.
Please see https://cov-lineages.org/pangolin.html for installation and updating instructions."""
                 ))
        exit(1)
    else:
        if args.decompress:
            prev_size = os.path.getsize(trained_model)

            print("Decompressing model and header files.")
            model = joblib.load(trained_model)
            joblib.dump(model, trained_model, compress=0)
            headers = joblib.load(header_file)
            joblib.dump(headers, header_file, compress=0)

            # decompressed files should be at least as large as the originals
            if os.path.getsize(trained_model) >= prev_size:
                print(
                    green(f'Success! Decompressed the model file. Exiting\n'))
                sys.exit(0)
            else:
                print(cyan(f'Error: failed to decompress model. Exiting\n'))
                sys.exit(0)

        print(green("\nData files found:"))
        if use_usher:
            print(f"UShER tree:\t{usher_protobuf}")
            print(f"Designated hash:\t{designated_hash}")
        else:
            print(f"Trained model:\t{trained_model}")
            print(f"Header file:\t{header_file}")
            print(f"Designated hash:\t{designated_hash}")

        config["trained_model"] = trained_model
        config["header_file"] = header_file
        config["designated_hash"] = designated_hash

    if use_usher:
        config["usher_protobuf"] = usher_protobuf

    if config['verbose']:
        print(green("\n**** CONFIG ****"))
        for k in sorted(config):
            print(green(k), config[k])

        status = snakemake.snakemake(snakefile,
                                     printshellcmds=True,
                                     forceall=True,
                                     force_incomplete=True,
                                     workdir=tempdir,
                                     config=config,
                                     cores=args.threads,
                                     lock=False)
    else:
        logger = custom_logger.Logger()
        status = snakemake.snakemake(snakefile,
                                     printshellcmds=False,
                                     forceall=True,
                                     force_incomplete=True,
                                     workdir=tempdir,
                                     config=config,
                                     cores=args.threads,
                                     lock=False,
                                     quiet=True,
                                     log_handler=config["log_api"])

    if status:  # translate "success" into shell exit code of 0
        return 0

    return 1
Пример #20
0
def update(version_dictionary):
    """
    Using the github releases API check for the latest current release
    of the set of depdencies provided e.g., pangolin, scorpio, pangolearn and
    constellations for complete --update and just pangolearn and constellations
    for --update-data.

    Dictionary keys must be one of pangolin, scorpio, pangolearn, or constellations

    Compare these to the currently running versions and if newer releases
    exist update to them accordingly (or do nothing if current).
    Afterwards, exit program safely with a 0 exit code.

    version_dictionary: dictionary keyed with dependency names and version for
                        that dependency
                        e.g.
    {pangolin: string containing the __version__ data for the currently
                      running pangolin module
    pangolearn: string containing the __version__ data for the imported
                       pangoLEARN data module
    scorpio: string containing the __version__ data for the imported
                       scorpio module
    constellations: string containing the __version__ data for the imported
                       constellations data module
    pango-designation: string containing the __version__ data for the imported
                       pango_designation data module}

    """
    for dependency, version in version_dictionary.items():

        try:
            latest_release = request.urlopen(
                f"https://api.github.com/repos/cov-lineages/{dependency}/releases"
            )
        # to catch and give a useful error message when people try to run this
        # either update option on clusters without external connectivity
        # or have exceeded the github API limit temporarily
        # this may also catch genuine bugs when version and release tags diverge
        # so if this is thrown and there is definitely connectivity then
        # double check the version labels
        except Exception as e:
            # Fix: the message previously named a non-existent flag
            # "--data_update"; the actual flag is --update-data.
            sys.stderr.write(
                cyan("Unable to connect to reach github API "
                     "--update/--update-data requires internet "
                     "connectivity so may not work on certain "
                     "systems or if your IP has exceeded the "
                     f"5,000 request per hour limit\n{e}\n"))
            sys.exit(-1)

        latest_release = json.load(latest_release)
        latest_release = LooseVersion(latest_release[0]['tag_name'])

        # to match the tag names add a v to the pangolin internal version
        if dependency in ['pangolin', 'scorpio', 'pango-designation']:
            version = "v" + version
        # to match the tag names for pangoLEARN add data release
        elif dependency == 'pangolearn':
            version = version.replace(' ', ' data release ')
        # to match the tag names for the constellations data release
        elif dependency == 'constellations':
            version = version.replace(' ', ' data release ')
        else:
            # Fix: the original message had a broken quote (scorpio' with no
            # opening quote).
            raise ValueError("Dependency name for auto-update must be one "
                             "of: 'pangolin', 'pangolearn', 'scorpio', "
                             "'constellations', 'pango-designation'")

        # convert to LooseVersion to have proper ordering of versions
        # this prevents someone using the latest commit/HEAD from being
        # downgraded to the last stable release
        version = LooseVersion(version)

        if version < latest_release:
            subprocess.run([
                sys.executable, '-m', 'pip', 'install', '--upgrade',
                f"git+https://github.com/cov-lineages/{dependency}.git@{latest_release}"
            ],
                           check=True,
                           stdout=subprocess.DEVNULL,
                           stderr=subprocess.DEVNULL)
            print(f"{dependency} updated to {latest_release}", file=sys.stderr)
        elif version > latest_release:
            print(
                f"{dependency} ({version}) is newer than latest stable "
                f"release ({latest_release}), not updating.",
                file=sys.stderr)
        else:
            print(f"{dependency} already latest release ({latest_release})",
                  file=sys.stderr)

    sys.exit(0)
Пример #21
0
import json
from tempfile import gettempdir
import tempfile
import pprint
import json
import gzip
import lzma
import os
import joblib
from pangolin.utils.log_colours import green, cyan, red

try:
    import pangoLEARN
except:
    sys.stderr.write(
        cyan('Error: please install `pangoLEARN` with \n') +
        "pip install git+https://github.com/cov-lineages/pangoLEARN.git")
    sys.exit(-1)

try:
    import scorpio
except:
    sys.stderr.write(
        cyan('Error: please install `scorpio` with \n') +
        "pip install git+https://github.com/cov-lineages/scorpio.git")
    sys.exit(-1)

try:
    from pangoLEARN import PANGO_VERSION
except:
    sys.stderr.write(
Пример #22
0
def install_error(package, url):
    """Report that *package* is missing, show the pip install command for
    *url*, and abort the program with exit code -1."""
    message = cyan(f'Error: please install `{package}` with \n')
    message += f"pip install git+{url}\n"
    sys.stderr.write(message)
    sys.exit(-1)
Пример #23
0
def setup_data(datadir_arg, analysis_mode, config):
    """Locate constellation definition files and the pangolin data directory.

    Searches the installed constellations package, then (if supplied) a
    custom datadir, so that user-provided files supersede the built-in ones.
    A custom datadir is only honoured for model data if its pangolin-data
    version is at least as new as the installed pangolin_data package.

    datadir_arg: optional user-supplied data directory path (may be None).
    analysis_mode: unused here; kept for interface compatibility.
    config: dict updated in place with KEY_PANGOLIN_DATA_VERSION,
            KEY_CONSTELLATIONS_VERSION, KEY_DATADIR and KEY_CONSTELLATION_FILES.
    """
    datadir = check_datadir(datadir_arg)

    constellations_dir = constellations.__path__[0]
    constellation_files = []
    # Fix: previously left unbound when no constellations __init__.py was
    # found, causing a NameError at the config assignment below.
    constellations_version = None

    data_locations = [os.walk(constellations_dir)]

    if datadir:
        data_locations.append(os.walk(datadir))

    # the logic of this is to search the "built-in" constellations
    # path first and then if as custom datadir is passed, follow up with those, so that
    # any files found in the datadir supercede the "built-in" modules. The assumption
    # here is that the datadir contains newer (user updated) data
    for r, _, f in itertools.chain.from_iterable(data_locations):
        if r.endswith('/constellations') or r.endswith('/constellations/definitions'):
            constellation_files = []  # only collect the constellations from the last directory found
        for fn in f:
            if r.endswith('/constellations') and fn == '__init__.py':
                constellations_version = version_from_init(os.path.join(r, fn))
            elif (r.endswith('/constellations') or r.endswith('/constellations/definitions')) and fn.endswith('.json'):
                constellation_files.append(os.path.join(r, fn))

    pangolin_data_version = pangolin_data.__version__
    use_datadir = False
    datadir_too_old = False
    if datadir:
        version = "Unknown"
        for r, d, f in os.walk(datadir):
            for fn in f:
                # pangolin-data/__init__.py not constellations/__init__.py:
                if r.endswith('data') and fn == "__init__.py":
                    version = version_from_init(os.path.join(r, fn))
                    if not version:
                        continue

                    if LooseVersion(version) >= LooseVersion(pangolin_data.__version__):
                        # only use this if the version is >= than what we already have
                        pangolin_data_version = version
                        use_datadir = True
                    else:
                        datadir_too_old = True
                        sys.stderr.write(cyan(f"Warning: Ignoring specified datadir {datadir} - it contains pangoLEARN model files older ({version}) than those installed ({pangolin_data.__version__})\n"))

    if not use_datadir:
        # we haven't got a viable datadir from searching args.datadir
        if datadir and not datadir_too_old:
            sys.stderr.write(cyan(
                f"Warning: Ignoring specified datadir {datadir} - could not find __init__.py file to check versions \n"))

        pangolin_data_dir = pangolin_data.__path__[0]
        datadir = os.path.join(pangolin_data_dir, "data")

    config[KEY_PANGOLIN_DATA_VERSION] = pangolin_data_version
    config[KEY_CONSTELLATIONS_VERSION] = constellations_version
    config[KEY_DATADIR] = datadir
    config[KEY_CONSTELLATION_FILES] = constellation_files
Пример #24
0
# Fix: sys is used in the except branches below but was never imported in
# this module; the duplicate `import json` has also been removed.
import sys
import subprocess
import json
from tempfile import gettempdir
import tempfile
import pprint
import gzip
import lzma
import os
import joblib
from pangolin.utils.log_colours import green,cyan,red

try:
    import pangoLEARN
except:
    sys.stderr.write(cyan('Error: please install `pangoLEARN` with \n') + 
    "pip install git+https://github.com/cov-lineages/pangoLEARN.git")
    sys.exit(-1)
try:
    import scorpio
except:
    sys.stderr.write(cyan('Error: please install `scorpio` with \n') + 
    "pip install git+https://github.com/cov-lineages/scorpio.git")
    sys.exit(-1)

try:
    from pangoLEARN import PANGO_VERSION
except:
    sys.stderr.write(cyan('Error: please update to pangoLEARN version >= 2021-05-27\n'))
    sys.exit(-1)
Пример #25
0
def print_ram_warning(analysis_mode):
    """Emit a RAM-usage caution when running in pangoLEARN analysis mode."""
    if analysis_mode != "pangolearn":
        return
    print(cyan("Warning: pangoLEARN mode may use a significant amount of RAM, be aware that it will not suit every system."))