Example #1
def set_up_tempdir(tempdir_arg, no_temp_arg, cwd, outdir, config):

    if no_temp_arg:
        tempdir = outdir
        config[KEY_TEMPDIR] = tempdir
        print(green(f"\n--no-temp: ") + f"all intermediate files will be written to {outdir}\n")
    elif tempdir_arg:
        to_be_dir = os.path.join(cwd, tempdir_arg)
        try:
            if not os.path.exists(to_be_dir):
                os.mkdir(to_be_dir)
        except OSError:
            sys.stderr.write(cyan(f'Error: cannot create temp directory {to_be_dir}.\n'))
            sys.exit(-1)
        tempdir = tempfile.mkdtemp(dir=to_be_dir)
        config[KEY_TEMPDIR] = tempdir
    else:
        # tempfile.mkdtemp() creates the directory itself, so no separate
        # existence check or mkdir is needed
        tempdir = tempfile.mkdtemp()
        config[KEY_TEMPDIR] = tempdir
        
        try:
            with open(os.path.join(tempdir, "test.txt"),"w") as fw:
                fw.write("Test")
        except OSError:
            sys.stderr.write(cyan(f'Error: cannot write to temp directory {tempdir}.\n'))
            sys.exit(-1)
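
A minimal usage sketch (an assumed call site, not from the pangolin source); KEY_TEMPDIR and the green/cyan colour helpers would normally come from pangolin's constants and utility modules:

import os

KEY_TEMPDIR = "tempdir"  # assumed value; the real constant lives in pangolin's constants module

# hypothetical invocation: route intermediate files into ./analysis/tmp
config = {}
set_up_tempdir(tempdir_arg="analysis/tmp",
               no_temp_arg=False,
               cwd=os.getcwd(),
               outdir=os.path.join(os.getcwd(), "output"),
               config=config)
print(config[KEY_TEMPDIR])  # e.g. analysis/tmp/tmpab12cd34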
Example #2
def check_dependencies(dependency_list, module_list, usher_arg, cache_arg):

    missing = []

    if usher_arg:
        dependency_list.append("usher")

    if cache_arg:
        dependency_list.append("pangolin-assignment")

    for dependency in dependency_list:
        check_this_dependency(dependency, missing)

    for module in module_list:
        check_module(module, missing)

    if missing:
        if len(missing) == 1:
            sys.stderr.write(
                cyan(f'Error: Missing dependency `{missing[0]}`.') +
                '\nPlease update your pangolin environment.\n')
            sys.exit(-1)
        else:
            dependencies = ""
            for i in missing:
                dependencies += f"\t- {i}\n"

            sys.stderr.write(
                cyan('Error: Missing dependencies.') +
                f'\n{dependencies}Please update your pangolin environment.\n')
            sys.exit(-1)
    else:
        print(green("All dependencies satisfied."))
Example #3
def check_dependencies():

    missing = []

    dependency_list = ["gofasta", "minimap2", "snakemake", "usher"]
    module_list = [
        "Bio", "sklearn", "pandas", "joblib", "pysam", "pangoLEARN",
        "constellations"
    ]

    for dependency in dependency_list:
        check_this_dependency(dependency, missing)

    for module in module_list:
        check_module(module, missing)

    if missing:
        if len(missing) == 1:
            sys.stderr.write(
                cyan(f'Error: Missing dependency `{missing[0]}`.') +
                '\nPlease update your pangolin environment.\n')
            sys.exit(-1)
        else:
            dependencies = ""
            for i in missing:
                dependencies += f"\t- {i}\n"

            sys.stderr.write(
                cyan('Error: Missing dependencies.') +
                f'\n{dependencies}Please update your pangolin environment.\n')
            sys.exit(-1)
    else:
        print(green("All dependencies satisfied."))
Example #4
def get_datafiles(datadir, file_dict, config):
    datafiles = {}
    for r, d, f in os.walk(datadir):
        for fn in f:
            if fn in file_dict:
                datafiles[file_dict[fn]] = os.path.join(r, fn)
    for fn in datafiles:
        config[fn] = datafiles[fn]
    for fn in file_dict:
        if file_dict[fn] not in config:
            sys.stderr.write(
                cyan(
                    f'Error: Cannot find {fn} in datadir. Please supply a datadir with required files or specify an alternative analysis mode.\nPlease see https://cov-lineages.org/pangolin.html for full installation and updating instructions.\n'
                ))
            sys.exit(-1)

    print(green("****\nData files found:"))
    for fn in datafiles:
        print(f"{fn}:\t{datafiles[fn]}")
    print(green("****"))
Example #5
def find_query_file(cwd, tempdir, query_arg):
    if len(query_arg) > 1:
        sys.stderr.write(
            cyan(f"Error: Too many query (input) fasta files supplied: {query_arg}\nPlease supply one only.\n"))
        sys.exit(-1)

    # find the query fasta
    try:
        if not os.path.exists(os.path.join(cwd, query_arg[0])):
            if select.select([sys.stdin,],[],[],0.0)[0]:
                query = os.path.join(tempdir, "stdin_query.fasta")
                with open(query,"w") as fw:
                    for l in sys.stdin:
                        l= l.rstrip("\n")
                        fw.write(l + '\n')
                
                print(green("Query:\t") + "reading from stdin.")
            else:
                tried_path = os.path.join(cwd, query_arg[0])
                if tried_path.endswith("-"):
                    sys.stderr.write(cyan(
                        'Error: cannot find query (input) fasta file using stdin.\n' +
                        'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html' +
                        ' for detailed instructions.\n'))
                    sys.exit(-1)
                else:
                    sys.stderr.write(cyan('Error: cannot find query (input) fasta file at: ') + f'{tried_path}\n' +
                                     'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html' +
                                     ' for detailed instructions.\n')
                    sys.exit(-1)
        else:
            query = os.path.join(cwd, query_arg[0])
            print(green(f"Query file:\t") + f"{query}")
    except IndexError:
        sys.stderr.write(cyan(
            'Error: input query fasta could not be detected from a filepath or through stdin.\n' +
            'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html' +
            ' for detailed instructions.\n'))
        sys.exit(-1)

    return query
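
find_query_file falls back to reading the query from stdin when the supplied path does not exist (e.g. sequences piped to pangolin with "-" as the query). A sketch of both call patterns, with the file name assumed:

import os
import tempfile

tempdir = tempfile.mkdtemp()

# normal case: an existing fasta file on disk
query = find_query_file(os.getcwd(), tempdir, ["consensus.fasta"])

# stdin case: `cat consensus.fasta | pangolin -` arrives here with
# query_arg == ["-"]; the piped data is written to stdin_query.fasta
# inside tempdir and that path is returned instead.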
Example #6
def main(sysargs=sys.argv[1:]):
    parser = argparse.ArgumentParser(
        prog=_program,
        description=
        'pangolin: Phylogenetic Assignment of Named Global Outbreak LINeages',
        usage='''pangolin <query> [options]''')

    io_group = parser.add_argument_group('Input-Output options')
    io_group.add_argument('query',
                          nargs="*",
                          help='Query fasta file of sequences to analyse.')
    io_group.add_argument(
        '-o',
        '--outdir',
        action="store",
        help="Output directory. Default: current working directory")
    io_group.add_argument(
        '--outfile',
        action="store",
        help="Optional output file name. Default: lineage_report.csv")
    io_group.add_argument(
        '--tempdir',
        action="store",
        help="Specify where you want the temp stuff to go. Default: $TMPDIR")
    io_group.add_argument(
        "--no-temp",
        action="store_true",
        help="Output all intermediate files, for dev purposes.")
    io_group.add_argument('--alignment',
                          action="store_true",
                          help="Output multiple sequence alignment.")
    io_group.add_argument('--alignment-file',
                          action="store",
                          help="Multiple sequence alignment file name.")
    io_group.add_argument(
        '--expanded-lineage',
        action="store_true",
        default=False,
        help="Optional expanded lineage from alias.json in report.")

    a_group = parser.add_argument_group('Analysis options')
    a_group.add_argument(
        '--analysis-mode',
        action="store",
        help=
        "Specify which inference engine to use. Options: accurate (UShER), fast (pangoLEARN), pangolearn, usher. Default: UShER inference."
    )

    a_group.add_argument(
        "--skip-designation-cache",
        action='store_true',
        default=False,
        help=
        "Developer option - do not use designation cache to assign lineages.",
        dest="skip_designation_cache")
    a_group.add_argument(
        "--skip-scorpio",
        action='store_true',
        default=False,
        help=
        "Developer option - do not use scorpio to check VOC/VUI lineage assignments.",
        dest="skip_scorpio")

    a_group.add_argument(
        '--max-ambig',
        action="store",
        default=0.3,
        type=float,
        help=
        "Maximum proportion of Ns allowed for pangolin to attempt assignment. Default: 0.3",
        dest="maxambig")
    a_group.add_argument(
        '--min-length',
        action="store",
        default=25000,
        type=int,
        help=
        "Minimum query length allowed for pangolin to attempt assignment. Default: 25000",
        dest="minlen")
    a_group.add_argument('--usher',
                         action='store_true',
                         default=False,
                         help=argparse.SUPPRESS)

    d_group = parser.add_argument_group('Data options')
    d_group.add_argument(
        "--update",
        action='store_true',
        default=False,
        help=
        "Automatically updates to latest release of pangolin, pangolin-data, scorpio and constellations (and pangolin-assignment if it has been installed using --add-assignment-cache), then exits."
    )
    d_group.add_argument(
        "--update-data",
        action='store_true',
        dest="update_data",
        default=False,
        help=
        "Automatically updates to latest release of constellations and pangolin-data, including the pangoLEARN model, UShER tree file and alias file (also pangolin-assignment if it has been installed using --add-assignment-cache), then exits."
    )
    d_group.add_argument(
        '--add-assignment-cache',
        action='store_true',
        dest="add_assignment_cache",
        default=False,
        help=
        "Install the pangolin-assignment repository for use with --use-assignment-cache.  This makes updates slower and makes pangolin slower for small numbers of input sequences but much faster for large numbers of input sequences."
    )
    d_group.add_argument(
        '--use-assignment-cache',
        action='store_true',
        dest="use_assignment_cache",
        default=False,
        help=
        "Use assignment cache from optional pangolin-assignment repository. NOTE: the repository must be installed by --add-assignment-cache before using --use-assignment-cache."
    )
    d_group.add_argument(
        '-d',
        '--datadir',
        action='store',
        dest="datadir",
        help=
        "Data directory minimally containing the pangoLEARN model, header files and UShER tree. Default: Installed pangolin-data package."
    )
    d_group.add_argument(
        '--usher-tree',
        action='store',
        dest='usher_protobuf',
        help=
        "UShER Mutation Annotated Tree protobuf file to use instead of default from pangolin-data repository or --datadir."
    )
    d_group.add_argument(
        '--assignment-cache',
        action='store',
        dest='assignment_cache',
        help=
        "Cached precomputed assignment file to use instead of default from pangolin-assignment repository.  Does not require installation of pangolin-assignment."
    )

    m_group = parser.add_argument_group('Misc options')
    m_group.add_argument("--aliases",
                         action='store_true',
                         default=False,
                         help="Print Pango alias_key.json and exit.")
    m_group.add_argument("-v",
                         "--version",
                         action='version',
                         version=f"pangolin {__version__}")
    m_group.add_argument(
        "-pv",
        "--pangolin-data-version",
        action='version',
        version=f"pangolin-data {pangolin_data.__version__}",
        help=
        "show version number of pangolin data files (UShER tree and pangoLEARN model files) and exit."
    )
    m_group.add_argument(
        "--all-versions",
        action='store_true',
        dest="all_versions",
        default=False,
        help="Print all tool, dependency, and data versions then exit.")
    m_group.add_argument("--verbose",
                         action="store_true",
                         help="Print lots of stuff to screen")
    m_group.add_argument("-t",
                         "--threads",
                         action="store",
                         default=1,
                         type=int,
                         help="Number of threads")

    if len(sysargs) < 1:
        parser.print_help()
        sys.exit(-1)
    else:
        args = parser.parse_args(sysargs)

    # Initialise config dict
    config = setup_config_dict(cwd)
    data_checks.check_install(config)
    set_up_verbosity(config)

    if args.usher:
        sys.stderr.write(
            cyan(
                "--usher is a pangolin v3 option and is deprecated in pangolin v4.  UShER is now the default analysis mode.  Use --analysis-mode to explicitly set mode.\n"
            ))

    setup_data(args.datadir, config[KEY_ANALYSIS_MODE], config)

    if args.add_assignment_cache:
        update.install_pangolin_assignment()

    if args.update:
        version_dictionary = {
            'pangolin': __version__,
            'pangolin-data': config[KEY_PANGOLIN_DATA_VERSION],
            'constellations': config[KEY_CONSTELLATIONS_VERSION],
            'scorpio': config[KEY_SCORPIO_VERSION]
        }
        update.add_pangolin_assignment_if_installed(version_dictionary)
        update.update(version_dictionary)

    if args.update_data:
        version_dictionary = {
            'pangolin-data': config[KEY_PANGOLIN_DATA_VERSION],
            'constellations': config[KEY_CONSTELLATIONS_VERSION]
        }
        update.add_pangolin_assignment_if_installed(version_dictionary)
        update.update(version_dictionary, args.datadir)

    # install_pangolin_assignment doesn't exit so that --update/--update-data can be given at the
    # same time (or a query file).  If --add-assignment-cache is the only arg, exit without error.
    if args.add_assignment_cache and not args.query:
        sys.exit(0)

    # add flag to config for whether to run scorpio
    if args.skip_scorpio:
        print(green("****\nPangolin skipping scorpio steps.\n****"))
        config[KEY_SKIP_SCORPIO] = True

    if args.expanded_lineage:
        print(green("****\nAdding expanded lineage column to output.\n****"))
        config[KEY_EXPANDED_LINEAGE] = True

    # Parsing analysis mode flags to return one of 'usher' or 'pangolearn'
    config[KEY_ANALYSIS_MODE] = set_up_analysis_mode(args.analysis_mode,
                                                     config[KEY_ANALYSIS_MODE])

    snakefile = get_snakefile(thisdir, config[KEY_ANALYSIS_MODE])

    config[KEY_DESIGNATION_CACHE], config[
        KEY_ALIAS_FILE] = data_checks.find_designation_cache_and_alias(
            config[KEY_DATADIR], DESIGNATION_CACHE_FILE, ALIAS_FILE)
    if args.aliases:
        print_alias_file_exit(config[KEY_ALIAS_FILE])

    if args.all_versions:
        print_versions_exit(config)

    # to enable not having to pass a query if running update
    # by allowing query to accept 0 to many arguments

    print(
        green(
            f"****\nPangolin running in {config[KEY_ANALYSIS_MODE]} mode.\n****"
        ))
    print_ram_warning(config[KEY_ANALYSIS_MODE])

    #   setup outdir and outfiles
    config[KEY_OUTDIR] = io.set_up_outdir(args.outdir, cwd, config[KEY_OUTDIR])
    config[KEY_OUTFILE] = io.set_up_outfile(args.outfile, config[KEY_OUTFILE],
                                            config[KEY_OUTDIR])
    io.set_up_tempdir(args.tempdir, args.no_temp, cwd, config[KEY_OUTDIR],
                      config)
    config[KEY_ALIGNMENT_FILE], config[
        KEY_ALIGNMENT_OUT] = io.parse_alignment_options(
            args.alignment, config[KEY_OUTDIR], config[KEY_TEMPDIR],
            args.alignment_file, config[KEY_ALIGNMENT_FILE])

    config[KEY_QUERY_FASTA] = io.find_query_file(cwd, config[KEY_TEMPDIR],
                                                 args.query)

    io.quick_check_query_file(cwd, args.query, config[KEY_QUERY_FASTA])

    if config[KEY_ANALYSIS_MODE] == "usher":
        # Find usher protobuf file (and if specified, assignment cache file too)
        data_checks.get_datafiles(config[KEY_DATADIR], usher_files, config)
        if args.usher_protobuf:
            config[KEY_USHER_PB] = data_checks.check_file_arg(
                args.usher_protobuf, cwd, '--usher-tree')
            print(green(f"Using usher tree file {args.usher_protobuf}"))
        if args.assignment_cache:
            config[KEY_ASSIGNMENT_CACHE] = data_checks.check_file_arg(
                args.assignment_cache, cwd, '--assignment-cache')
            print(
                green(f"Using assignment cache file {args.assignment_cache}"))
        elif args.use_assignment_cache:
            config[KEY_ASSIGNMENT_CACHE] = data_checks.get_assignment_cache(
                USHER_ASSIGNMENT_CACHE_FILE, config)
            print(green("Using pangolin-assignment cache"))
        else:
            config[KEY_ASSIGNMENT_CACHE] = ""

    elif config[KEY_ANALYSIS_MODE] == "pangolearn":
        # find designation cache and the model files
        data_checks.get_datafiles(config[KEY_DATADIR], pangolearn_files,
                                  config)
        if args.use_assignment_cache or args.assignment_cache:
            sys.stderr.write(
                cyan(
                    "Warning: --use-assignment-cache and --assignment-cache are ignored when --analysis-mode is 'fast' or 'pangolearn'.\n"
                ))

    preprocessing_snakefile = get_snakefile(thisdir, "preprocessing")

    if args.verbose:
        print(green("\n**** CONFIG ****"))
        for k in sorted(config):
            print(green(k), config[k])

        status = snakemake.snakemake(preprocessing_snakefile,
                                     printshellcmds=True,
                                     forceall=True,
                                     force_incomplete=True,
                                     workdir=config[KEY_TEMPDIR],
                                     config=config,
                                     cores=args.threads,
                                     lock=False)
    else:
        logger = custom_logger.Logger()
        status = snakemake.snakemake(preprocessing_snakefile,
                                     printshellcmds=False,
                                     forceall=True,
                                     force_incomplete=True,
                                     workdir=config[KEY_TEMPDIR],
                                     config=config,
                                     cores=args.threads,
                                     lock=False,
                                     quiet=True,
                                     log_handler=logger.log_handler)
    if status:  # translate "success" into shell exit code of 0

        if config[KEY_VERBOSE]:
            print(green("\n**** CONFIG ****"))
            for k in sorted(config):
                print(green(k), config[k])

            status = snakemake.snakemake(snakefile,
                                         printshellcmds=True,
                                         forceall=True,
                                         force_incomplete=True,
                                         workdir=config[KEY_TEMPDIR],
                                         config=config,
                                         cores=args.threads,
                                         lock=False)
        else:
            logger = custom_logger.Logger()
            status = snakemake.snakemake(snakefile,
                                         printshellcmds=False,
                                         forceall=True,
                                         force_incomplete=True,
                                         workdir=config[KEY_TEMPDIR],
                                         config=config,
                                         cores=args.threads,
                                         lock=False,
                                         quiet=True,
                                         log_handler=logger.log_handler)

        if status:

            ## Collate the report here

            preprocessing_csv = os.path.join(config[KEY_TEMPDIR],
                                             "preprocessing.csv")
            inference_csv = os.path.join(config[KEY_TEMPDIR],
                                         "inference_report.csv")
            cached_csv = os.path.join(config[KEY_TEMPDIR],
                                      "cache_assigned.csv")
            constellation_list = get_voc_list(
                os.path.join(config[KEY_TEMPDIR], "get_constellations.txt"),
                config[KEY_ALIAS_FILE])

            generate_final_report(preprocessing_csv, inference_csv, cached_csv,
                                  config[KEY_ALIAS_FILE], constellation_list,
                                  config[KEY_PANGOLIN_DATA_VERSION],
                                  config[KEY_ANALYSIS_MODE],
                                  args.skip_designation_cache,
                                  config[KEY_OUTFILE], config)

            print(
                green("****\nOutput file written to: ") + config[KEY_OUTFILE])

            if config[KEY_ALIGNMENT_OUT]:
                print(
                    green("****\nOutput alignment written to: ") +
                    config[KEY_ALIGNMENT_FILE])

            return 0

        return 1
    return 1
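
custom_logger.Logger is not defined in these examples. snakemake log handlers receive a dict describing each log event (with keys such as "level" and "msg"), so a minimal stand-in might look like the sketch below; the error-only filtering is an assumption, not pangolin's actual implementation:

import sys

class Logger:
    def log_handler(self, msg):
        # snakemake calls this once per log event; surface errors only
        # and stay quiet otherwise (assumed behaviour)
        if msg.get("level") == "error":
            sys.stderr.write(str(msg.get("msg", "")) + "\n")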
Example #7
def main(sysargs=sys.argv[1:]):

    parser = argparse.ArgumentParser(
        prog=_program,
        description=
        'pangolin: Phylogenetic Assignment of Named Global Outbreak LINeages',
        usage='''pangolin <query> [options]''')

    parser.add_argument('query',
                        nargs="*",
                        help='Query fasta file of sequences to analyse.')
    parser.add_argument('--alignment',
                        action="store_true",
                        help="Optional alignment output.")
    parser.add_argument('--usher',
                        action="store_true",
                        help="Use UShER model instead of default pangoLEARN")
    parser.add_argument(
        '--usher-tree',
        action='store',
        dest='usher_protobuf',
        help=
        "UShER Mutation Annotated Tree protobuf file to use instead of --usher default from pangoLEARN repository or --datadir"
    )
    parser.add_argument(
        '--max-ambig',
        action="store",
        default=0.3,
        type=float,
        help=
        "Maximum proportion of Ns allowed for pangolin to attempt assignment. Default: 0.3",
        dest="maxambig")
    parser.add_argument(
        '--min-length',
        action="store",
        default=25000,
        type=int,
        help=
        "Minimum query length allowed for pangolin to attempt assignment. Default: 25000",
        dest="minlen")
    parser.add_argument(
        '-o',
        '--outdir',
        action="store",
        help="Output directory. Default: current working directory")
    parser.add_argument(
        '--outfile',
        action="store",
        help="Optional output file name. Default: lineage_report.csv")
    parser.add_argument(
        '--tempdir',
        action="store",
        help="Specify where you want the temp stuff to go. Default: $TMPDIR")
    parser.add_argument(
        "--no-temp",
        action="store_true",
        help="Output all intermediate files, for dev purposes.")
    parser.add_argument(
        '-d',
        '--datadir',
        action='store',
        dest="datadir",
        help=
        "Data directory minimally containing a fasta alignment and guide tree")
    parser.add_argument(
        '--decompress-model',
        action="store_true",
        dest="decompress",
        help=
        "Permanently decompress the model file to save time running pangolin.")
    parser.add_argument("--verbose",
                        action="store_true",
                        help="Print lots of stuff to screen")
    parser.add_argument("-t",
                        "--threads",
                        action="store",
                        default=1,
                        type=int,
                        help="Number of threads")
    parser.add_argument("-v",
                        "--version",
                        action='version',
                        version=f"pangolin {__version__}")
    parser.add_argument("-pv",
                        "--pangoLEARN-version",
                        action='version',
                        version=f"pangoLEARN {pangoLEARN.__version__}",
                        help="show pangoLEARN's version number and exit")
    parser.add_argument(
        "-dv",
        "--pango-designation-version",
        action='version',
        version=
        f"pango-designation {PANGO_VERSION} used for pangoLEARN and UShER training",
        help="show pango-designation version number used for training and exit"
    )
    parser.add_argument("--aliases",
                        action='store_true',
                        default=False,
                        help="print pango-designation alias_key.json and exit")
    parser.add_argument(
        "--update",
        action='store_true',
        default=False,
        help=
        "Automatically updates to latest release of pangolin, pangoLEARN and constellations, then exits"
    )
    parser.add_argument(
        "--update-data",
        action='store_true',
        dest="update_data",
        default=False,
        help=
        "Automatically updates to latest release of pangoLEARN and constellations, then exits"
    )

    if len(sysargs) < 1:
        parser.print_help()
        sys.exit(-1)
    else:
        args = parser.parse_args(sysargs)

    if args.update:
        update({
            'pangolin': __version__,
            'pangolearn': pangoLEARN.__version__,
            'constellations': constellations.__version__,
            'scorpio': scorpio.__version__,
            'pango-designation': pango_designation.__version__
        })

    if args.update_data:
        update({
            'pangolearn': pangoLEARN.__version__,
            'constellations': constellations.__version__,
            'pango-designation': pango_designation.__version__
        })

    alias_file = None
    pango_designation_dir = pango_designation.__path__[0]
    for r, d, f in os.walk(pango_designation_dir):
        for fn in f:
            if fn == "alias_key.json":
                alias_file = os.path.join(r, fn)
    if not alias_file:
        sys.stderr.write(
            cyan(
                'Could not find alias file: please update pango-designation with \n'
            ) +
            "pip install git+https://github.com/cov-lineages/pango-designation.git\n"
        )
        sys.exit(-1)

    if args.aliases:
        with open(alias_file, 'r') as handle:
            for line in handle:
                print(line.rstrip())
        sys.exit(0)

    dependency_checks.check_dependencies()

    # to enable not having to pass a query if running update
    # by allowing query to accept 0 to many arguments
    if len(args.query) > 1:
        print(
            cyan(
                f"Error: Too many query (input) fasta files supplied: {args.query}\nPlease supply one only"
            ))
        parser.print_help()
        sys.exit(-1)
    elif not args.query:
        # nothing to analyse: any --update/--update-data/--aliases action has
        # already exited above, so fail gracefully instead of crashing on
        # args.query[0]
        parser.print_help()
        sys.exit(-1)
    else:
        # find the query fasta
        query = os.path.join(cwd, args.query[0])
        if not os.path.exists(query):
            sys.stderr.write(
                cyan('Error: cannot find query (input) fasta file at: ') +
                f'{query}\n' +
                'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html'
                + ' for detailed instructions.\n')
            sys.exit(-1)
        else:
            print(green("The query file is: ") + query)

    # default output dir
    if args.outdir:
        outdir = os.path.join(cwd, args.outdir)
        if not os.path.exists(outdir):
            try:
                os.mkdir(outdir)
            except OSError:
                sys.stderr.write(
                    cyan('Error: cannot create directory: ') + f"{outdir}\n")
                sys.exit(-1)
    else:
        outdir = cwd

    if args.outfile:
        outfile = os.path.join(outdir, args.outfile)
    else:
        outfile = os.path.join(outdir, "lineage_report.csv")

    if args.tempdir:
        to_be_dir = os.path.join(cwd, args.tempdir)
        if not os.path.exists(to_be_dir):
            os.mkdir(to_be_dir)
        temporary_directory = tempfile.TemporaryDirectory(suffix=None,
                                                          prefix=None,
                                                          dir=to_be_dir)
        tempdir = temporary_directory.name
    else:
        temporary_directory = tempfile.TemporaryDirectory(suffix=None,
                                                          prefix=None,
                                                          dir=None)
        tempdir = temporary_directory.name

    if args.no_temp:
        print(
            green(f"\n--no-temp: ") +
            f"all intermediate files will be written to {outdir}\n")
        tempdir = outdir

    if args.alignment:
        align_dir = outdir
        alignment_out = True
    else:
        align_dir = tempdir
        alignment_out = False
    """
    QC steps:
    1) check no empty seqs
    2) check N content
    3) write a file that contains just the seqs to run
    """

    do_not_run = []
    run = []
    total_input = 0
    print(green("** Sequence QC **"))
    fmt = "{:<30}\t{:>25}\t{:<10}\n"

    print("{:<30}\t{:>25}\t{:<10}\n".format("Sequence name", "Reason",
                                            "Value"))

    file_ending = query.split(".")[-1]
    if file_ending in ["gz", "gzip", "tgz"]:
        query = gzip.open(query, 'rt')
    elif file_ending in ["xz", "lzma"]:
        query = lzma.open(query, 'rt')

    for record in SeqIO.parse(query, "fasta"):
        total_input += 1
        # replace spaces in sequence headers with underscores
        record.description = record.description.replace(' ', '_')
        record.id = record.description
        if "," in record.id:
            record.id = record.id.replace(",", "_")

        if len(record) < args.minlen:
            record.description = record.description + f" fail=seq_len:{len(record)}"
            do_not_run.append(record)
            print(fmt.format(record.id, "Seq too short", len(record)))
            # print(record.id, "\t\tsequence too short")
        else:
            num_N = str(record.seq).upper().count("N")
            prop_N = round((num_N) / len(record.seq), 2)
            if prop_N > args.maxambig:
                record.description = record.description + f" fail=N_content:{prop_N}"
                do_not_run.append(record)
                print(fmt.format(record.id, "N content too high", prop_N))
                # print("{record.id} | has an N content of {prop_N}")
            else:
                run.append(record)

    print(green("\nNumber of sequences detected: ") + f"{total_input}")
    print(green("Total passing QC: ") + f"{len(run)}")

    if not run:
        with open(outfile, "w") as fw:
            fw.write(
                "taxon,lineage,conflict,ambiguity_score,scorpio_call,scorpio_support,scorpio_conflict,version,pangolin_version,pangoLEARN_version,pango_version,status,note\n"
            )
            for record in do_not_run:
                desc = record.description.split(" ")
                reason = ""
                for item in desc:
                    if item.startswith("fail="):
                        reason = item.split("=")[1]
                fw.write(
                    f"{record.id},None,,,,,,PANGO-{PANGO_VERSION},{__version__},{pangoLEARN.__version__},{PANGO_VERSION},fail,{reason}\n"
                )
        print(cyan('Note: no query sequences have passed QC\n'))
        sys.exit(0)

    post_qc_query = os.path.join(tempdir, 'query.post_qc.fasta')
    with open(post_qc_query, "w") as fw:
        SeqIO.write(run, fw, "fasta")
    qc_fail = os.path.join(tempdir, 'query.failed_qc.fasta')
    with open(qc_fail, "w") as fw:
        SeqIO.write(do_not_run, fw, "fasta")

    config = {
        "query_fasta": post_qc_query,
        "outdir": outdir,
        "outfile": outfile,
        "tempdir": tempdir,
        "aligndir": align_dir,
        "alignment_out": alignment_out,
        "trim_start": 265,  # where to pad to using datafunk
        "trim_end": 29674,  # where to pad after using datafunk
        "qc_fail": qc_fail,
        "alias_file": alias_file,
        "verbose": args.verbose,
        "pangoLEARN_version": pangoLEARN.__version__,
        "pangolin_version": __version__,
        "pango_version": PANGO_VERSION,
        "threads": args.threads
    }

    data_install_checks.check_install(config)
    snakefile = data_install_checks.get_snakefile(thisdir)

    dependency_checks.set_up_verbosity(config)

    # find the data
    if args.datadir:
        data_dir = os.path.join(cwd, args.datadir)
        version = "Unknown"
        for r, d, f in os.walk(data_dir):
            for fn in f:
                if fn == "__init__.py":
                    print("Found __init__.py")
                    with open(os.path.join(r, fn), "r") as fr:
                        for l in fr:
                            if l.startswith("__version__"):
                                l = l.rstrip("\n")
                                version = l.split('=')[1]
                                version = version.replace('"',
                                                          "").replace(" ", "")
                                print("pangoLEARN version", version)
        config["pangoLEARN_version"] = version

    else:
        pangoLEARN_dir = pangoLEARN.__path__[0]
        data_dir = os.path.join(pangoLEARN_dir, "data")
    # print(f"Looking in {data_dir} for data files...")
    trained_model = ""
    header_file = ""
    designated_hash = ""
    use_usher = args.usher
    if args.usher_protobuf:
        usher_protobuf = os.path.join(cwd, args.usher_protobuf)
        if not os.path.exists(usher_protobuf):
            sys.stderr.write(
                'Error: cannot find --usher-tree file at {}\n'.format(
                    usher_protobuf))
            sys.exit(-1)
        use_usher = True
    else:
        usher_protobuf = ""

    for r, d, f in os.walk(data_dir):
        for fn in f:
            if fn == "decisionTreeHeaders_v1.joblib":
                header_file = os.path.join(r, fn)
            elif fn == "decisionTree_v1.joblib":
                trained_model = os.path.join(r, fn)
            elif fn == "lineages.hash.csv":
                designated_hash = os.path.join(r, fn)
            elif fn == "lineageTree.pb" and usher_protobuf == "":
                usher_protobuf = os.path.join(r, fn)
    if ((use_usher and (usher_protobuf == "" or designated_hash == ""))
            or (not use_usher and
                (trained_model == "" or header_file == ""
                 or designated_hash == ""))):
        print(
            cyan("""pangoLEARN version should be >= 2021-05-27. \n
Appropriate data files not found from the installed pangoLEARN repo.
Please see https://cov-lineages.org/pangolin.html for installation and updating instructions."""
                 ))
        exit(1)
    else:
        if args.decompress:
            prev_size = os.path.getsize(trained_model)

            print("Decompressing model and header files.")
            model = joblib.load(trained_model)
            joblib.dump(model, trained_model, compress=0)
            headers = joblib.load(header_file)
            joblib.dump(headers, header_file, compress=0)

            if os.path.getsize(trained_model) >= prev_size:
                print(
                    green('Success! Decompressed the model file. Exiting\n'))
                sys.exit(0)
            else:
                print(cyan('Error: failed to decompress model. Exiting\n'))
                sys.exit(-1)

        print(green("\nData files found:"))
        if use_usher:
            print(f"UShER tree:\t{usher_protobuf}")
            print(f"Designated hash:\t{designated_hash}")
        else:
            print(f"Trained model:\t{trained_model}")
            print(f"Header file:\t{header_file}")
            print(f"Designated hash:\t{designated_hash}")

        config["trained_model"] = trained_model
        config["header_file"] = header_file
        config["designated_hash"] = designated_hash

    if use_usher:
        config["usher_protobuf"] = usher_protobuf

    if config['verbose']:
        print(green("\n**** CONFIG ****"))
        for k in sorted(config):
            print(green(k), config[k])

        status = snakemake.snakemake(snakefile,
                                     printshellcmds=True,
                                     forceall=True,
                                     force_incomplete=True,
                                     workdir=tempdir,
                                     config=config,
                                     cores=args.threads,
                                     lock=False)
    else:
        # config["log_api"] was attached by dependency_checks.set_up_verbosity
        status = snakemake.snakemake(snakefile,
                                     printshellcmds=False,
                                     forceall=True,
                                     force_incomplete=True,
                                     workdir=tempdir,
                                     config=config,
                                     cores=args.threads,
                                     lock=False,
                                     quiet=True,
                                     log_handler=config["log_api"])

    if status:  # translate "success" into shell exit code of 0
        return 0

    return 1