Пример #1
0
def set_up_verbosity(config):
    """Set the logging-related entries of *config* in place.

    In verbose mode the quiet flag is cleared and both log hooks are
    blanked; otherwise quiet mode is enabled and a custom_logger.Logger
    log handler is attached under KEY_LOG_API.
    """
    verbose = config[KEY_VERBOSE]
    config["quiet"] = not verbose
    if verbose:
        config[KEY_LOG_API] = ""
        config["log_string"] = ""
        return
    # Quiet run: route log messages through the custom logger instead.
    config[KEY_LOG_API] = custom_logger.Logger().log_handler
Пример #2
0
def set_up_verbosity(config):
    """Set the logging-related entries of *config* in place.

    Verbose runs clear the quiet flag and blank both log hooks.  Quiet
    runs attach a custom_logger.Logger handler and build the snakemake
    CLI fragment that points at the log-handler script.
    """
    if not config["verbose"]:
        config["quiet"] = True
        config["log_api"] = custom_logger.Logger().log_handler

        # Resolve the on-disk path of the log-handler module so the
        # flag string can reference it.
        lh_path = os.path.realpath(lh.__file__)
        config["log_string"] = f"--quiet --log-handler-script {lh_path} "
    else:
        config["quiet"] = False
        config["log_api"] = ""
        config["log_string"] = ""
Пример #3
0
def main(sysargs=None):
    """Entry point for the pangolin command-line interface.

    Parses the command line, runs sequence QC on the query fasta,
    locates the pangoLEARN/UShER data files, builds the pipeline config
    and invokes the snakemake workflow.

    Args:
        sysargs: list of CLI arguments (defaults to ``sys.argv[1:]``,
            resolved at call time).

    Returns:
        0 if the snakemake pipeline succeeds, 1 otherwise.  Utility
        flags (--update, --aliases, --decompress-model, ...) and usage
        errors terminate early via ``sys.exit``.
    """
    # Resolve the default at call time rather than import time so that
    # callers (and tests) that mutate sys.argv are honoured.
    if sysargs is None:
        sysargs = sys.argv[1:]

    parser = argparse.ArgumentParser(
        prog=_program,
        description=
        'pangolin: Phylogenetic Assignment of Named Global Outbreak LINeages',
        usage='''pangolin <query> [options]''')

    parser.add_argument('query',
                        nargs="*",
                        help='Query fasta file of sequences to analyse.')
    parser.add_argument('--alignment',
                        action="store_true",
                        help="Optional alignment output.")
    parser.add_argument('--usher',
                        action="store_true",
                        help="Use UShER model instead of default pangoLEARN")
    parser.add_argument(
        '--usher-tree',
        action='store',
        dest='usher_protobuf',
        help=
        "UShER Mutation Annotated Tree protobuf file to use instead of --usher default from pangoLEARN repository or --datadir"
    )
    parser.add_argument(
        '--max-ambig',
        action="store",
        default=0.3,
        type=float,
        help=
        "Maximum proportion of Ns allowed for pangolin to attempt assignment. Default: 0.3",
        dest="maxambig")
    parser.add_argument(
        '--min-length',
        action="store",
        default=25000,
        type=int,
        help=
        "Minimum query length allowed for pangolin to attempt assignment. Default: 25000",
        dest="minlen")
    parser.add_argument(
        '-o',
        '--outdir',
        action="store",
        help="Output directory. Default: current working directory")
    parser.add_argument(
        '--outfile',
        action="store",
        help="Optional output file name. Default: lineage_report.csv")
    parser.add_argument(
        '--tempdir',
        action="store",
        help="Specify where you want the temp stuff to go. Default: $TMPDIR")
    parser.add_argument(
        "--no-temp",
        action="store_true",
        help="Output all intermediate files, for dev purposes.")
    parser.add_argument(
        '-d',
        '--datadir',
        action='store',
        dest="datadir",
        help=
        "Data directory minimally containing a fasta alignment and guide tree")
    parser.add_argument(
        '--decompress-model',
        action="store_true",
        dest="decompress",
        help=
        "Permanently decompress the model file to save time running pangolin.")
    parser.add_argument("--verbose",
                        action="store_true",
                        help="Print lots of stuff to screen")
    parser.add_argument("-t",
                        "--threads",
                        action="store",
                        default=1,
                        type=int,
                        help="Number of threads")
    parser.add_argument("-v",
                        "--version",
                        action='version',
                        version=f"pangolin {__version__}")
    parser.add_argument("-pv",
                        "--pangoLEARN-version",
                        action='version',
                        version=f"pangoLEARN {pangoLEARN.__version__}",
                        help="show pangoLEARN's version number and exit")
    parser.add_argument(
        "-dv",
        "--pango-designation-version",
        action='version',
        version=
        f"pango-designation {PANGO_VERSION} used for pangoLEARN and UShER training",
        help="show pango-designation version number used for training and exit"
    )
    parser.add_argument("--aliases",
                        action='store_true',
                        default=False,
                        help="print pango-designation alias_key.json and exit")
    parser.add_argument(
        "--update",
        action='store_true',
        default=False,
        help=
        "Automatically updates to latest release of pangolin, pangoLEARN and constellations, then exits"
    )
    parser.add_argument(
        "--update-data",
        action='store_true',
        dest="update_data",
        default=False,
        help=
        "Automatically updates to latest release of pangoLEARN and constellations, then exits"
    )

    if len(sysargs) < 1:
        parser.print_help()
        sys.exit(-1)
    # BUGFIX: previously a second, unconditional parse_args() call
    # re-parsed sys.argv and silently discarded the sysargs parameter.
    args = parser.parse_args(sysargs)

    if args.update:
        update({
            'pangolin': __version__,
            'pangolearn': pangoLEARN.__version__,
            'constellations': constellations.__version__,
            'scorpio': scorpio.__version__,
            'pango-designation': pango_designation.__version__
        })

    if args.update_data:
        update({
            'pangolearn': pangoLEARN.__version__,
            'constellations': constellations.__version__,
            'pango-designation': pango_designation.__version__
        })

    # Locate alias_key.json inside the installed pango-designation package.
    alias_file = None
    pango_designation_dir = pango_designation.__path__[0]
    for r, d, f in os.walk(pango_designation_dir):
        for fn in f:
            if fn == "alias_key.json":
                alias_file = os.path.join(r, fn)
    if not alias_file:
        sys.stderr.write(
            cyan(
                'Could not find alias file: please update pango-designation with \n'
            ) +
            "pip install git+https://github.com/cov-lineages/pango-designation.git"
        )
        sys.exit(-1)

    if args.aliases:
        with open(alias_file, 'r') as handle:
            for line in handle:
                print(line.rstrip())
        sys.exit(0)

    dependency_checks.check_dependencies()

    # to enable not having to pass a query if running update
    # by allowing query to accept 0 to many arguments
    if len(args.query) > 1:
        print(
            cyan(
                f"Error: Too many query (input) fasta files supplied: {args.query}\nPlease supply one only"
            ))
        parser.print_help()
        sys.exit(-1)
    elif not args.query:
        # BUGFIX: without this guard, args.query[0] below raised an
        # IndexError when no query file was supplied.
        print(cyan("Error: no query (input) fasta file supplied\n"))
        parser.print_help()
        sys.exit(-1)
    else:
        # find the query fasta
        query = os.path.join(cwd, args.query[0])
        if not os.path.exists(query):
            sys.stderr.write(
                cyan(f'Error: cannot find query (input) fasta file at:') +
                f'{query}\n' +
                'Please enter your fasta sequence file and refer to pangolin usage at: https://cov-lineages.org/pangolin.html'
                + ' for detailed instructions.\n')
            sys.exit(-1)
        else:
            print(green(f"The query file is:") + f"{query}")

        # default output dir

    if args.outdir:
        outdir = os.path.join(cwd, args.outdir)
        if not os.path.exists(outdir):
            try:
                os.mkdir(outdir)
            except OSError:
                sys.stderr.write(
                    cyan(f'Error: cannot create directory:') + f"{outdir}")
                sys.exit(-1)
    else:
        outdir = cwd

    if args.outfile:
        outfile = os.path.join(outdir, args.outfile)
    else:
        outfile = os.path.join(outdir, "lineage_report.csv")

    if args.tempdir:
        to_be_dir = os.path.join(cwd, args.tempdir)
        if not os.path.exists(to_be_dir):
            os.mkdir(to_be_dir)
        temporary_directory = tempfile.TemporaryDirectory(suffix=None,
                                                          prefix=None,
                                                          dir=to_be_dir)
        tempdir = temporary_directory.name
    else:
        temporary_directory = tempfile.TemporaryDirectory(suffix=None,
                                                          prefix=None,
                                                          dir=None)
        tempdir = temporary_directory.name

    if args.no_temp:
        print(
            green(f"\n--no-temp: ") +
            f"all intermediate files will be written to {outdir}\n")
        tempdir = outdir

    if args.alignment:
        align_dir = outdir
        alignment_out = True
    else:
        align_dir = tempdir
        alignment_out = False
    """
    QC steps:
    1) check no empty seqs
    2) check N content
    3) write a file that contains just the seqs to run
    """

    do_not_run = []
    run = []
    total_input = 0
    print(green("** Sequence QC **"))
    fmt = "{:<30}\t{:>25}\t{:<10}\n"

    # Use the shared row format for the header as well (was duplicated).
    print(fmt.format("Sequence name", "Reason", "Value"))

    # Transparently handle compressed query files.
    file_ending = query.split(".")[-1]
    if file_ending in ["gz", "gzip", "tgz"]:
        query = gzip.open(query, 'rt')
    elif file_ending in ["xz", "lzma"]:
        query = lzma.open(query, 'rt')

    for record in SeqIO.parse(query, "fasta"):
        total_input += 1
        # replace spaces in sequence headers with underscores
        record.description = record.description.replace(' ', '_')
        record.id = record.description
        # commas would corrupt the CSV report
        if "," in record.id:
            record.id = record.id.replace(",", "_")

        if len(record) < args.minlen:
            record.description = record.description + f" fail=seq_len:{len(record)}"
            do_not_run.append(record)
            print(fmt.format(record.id, "Seq too short", len(record)))
        else:
            num_N = str(record.seq).upper().count("N")
            prop_N = round((num_N) / len(record.seq), 2)
            if prop_N > args.maxambig:
                record.description = record.description + f" fail=N_content:{prop_N}"
                do_not_run.append(record)
                print(fmt.format(record.id, "N content too high", prop_N))
            else:
                run.append(record)

    print(green("\nNumber of sequences detected: ") + f"{total_input}")
    print(green("Total passing QC: ") + f"{len(run)}")

    # Nothing passed QC: emit a report marking every sequence as failed.
    if run == []:
        with open(outfile, "w") as fw:
            fw.write(
                "taxon,lineage,conflict,ambiguity_score,scorpio_call,scorpio_support,scorpio_conflict,version,pangolin_version,pangoLEARN_version,pango_version,status,note\n"
            )
            for record in do_not_run:
                desc = record.description.split(" ")
                reason = ""
                for item in desc:
                    if item.startswith("fail="):
                        reason = item.split("=")[1]
                fw.write(
                    f"{record.id},None,,,,,,PANGO-{PANGO_VERSION},{__version__},{pangoLEARN.__version__},{PANGO_VERSION},fail,{reason}\n"
                )
        print(cyan(f'Note: no query sequences have passed the qc\n'))
        sys.exit(0)

    post_qc_query = os.path.join(tempdir, 'query.post_qc.fasta')
    with open(post_qc_query, "w") as fw:
        SeqIO.write(run, fw, "fasta")
    qc_fail = os.path.join(tempdir, 'query.failed_qc.fasta')
    with open(qc_fail, "w") as fw:
        SeqIO.write(do_not_run, fw, "fasta")

    config = {
        "query_fasta": post_qc_query,
        "outdir": outdir,
        "outfile": outfile,
        "tempdir": tempdir,
        "aligndir": align_dir,
        "alignment_out": alignment_out,
        "trim_start": 265,  # where to pad to using datafunk
        "trim_end": 29674,  # where to pad after using datafunk
        "qc_fail": qc_fail,
        "alias_file": alias_file,
        "verbose": args.verbose,
        "pangoLEARN_version": pangoLEARN.__version__,
        "pangolin_version": __version__,
        "pango_version": PANGO_VERSION,
        "threads": args.threads
    }

    data_install_checks.check_install(config)
    snakefile = data_install_checks.get_snakefile(thisdir)

    dependency_checks.set_up_verbosity(config)

    # find the data
    if args.datadir:
        # A user-supplied data dir: scrape its __init__.py for a version.
        data_dir = os.path.join(cwd, args.datadir)
        version = "Unknown"
        for r, d, f in os.walk(data_dir):
            for fn in f:
                if fn == "__init__.py":
                    print("Found __init__.py")
                    with open(os.path.join(r, fn), "r") as fr:
                        for l in fr:
                            if l.startswith("__version__"):
                                l = l.rstrip("\n")
                                version = l.split('=')[1]
                                version = version.replace('"',
                                                          "").replace(" ", "")
                                print("pangoLEARN version", version)
        config["pangoLEARN_version"] = version

    else:
        pangoLEARN_dir = pangoLEARN.__path__[0]
        data_dir = os.path.join(pangoLEARN_dir, "data")
    trained_model = ""
    header_file = ""
    designated_hash = ""
    use_usher = args.usher
    if args.usher_protobuf:
        usher_protobuf = os.path.join(cwd, args.usher_protobuf)
        if not os.path.exists(usher_protobuf):
            sys.stderr.write(
                'Error: cannot find --usher-tree file at {}\n'.format(
                    usher_protobuf))
            sys.exit(-1)
        # an explicit protobuf implies the UShER mode
        use_usher = True
    else:
        usher_protobuf = ""

    for r, d, f in os.walk(data_dir):
        for fn in f:
            if fn == "decisionTreeHeaders_v1.joblib":
                header_file = os.path.join(r, fn)
            elif fn == "decisionTree_v1.joblib":
                trained_model = os.path.join(r, fn)
            elif fn == "lineages.hash.csv":
                designated_hash = os.path.join(r, fn)
            elif fn == "lineageTree.pb" and usher_protobuf == "":
                usher_protobuf = os.path.join(r, fn)
    # UShER mode needs the protobuf + hash; pangoLEARN mode needs the
    # model + headers + hash.
    if ((use_usher and (usher_protobuf == "" or designated_hash == "")
         or (not use_usher and (trained_model == "" or header_file == ""
                                or designated_hash == "")))):
        print(
            cyan("""pangoLEARN version should be >= 2021-05-27. \n
Appropriate data files not found from the installed pangoLEARN repo.
Please see https://cov-lineages.org/pangolin.html for installation and updating instructions."""
                 ))
        sys.exit(1)
    else:
        if args.decompress:
            prev_size = os.path.getsize(trained_model)

            print("Decompressing model and header files.")
            model = joblib.load(trained_model)
            joblib.dump(model, trained_model, compress=0)
            headers = joblib.load(header_file)
            joblib.dump(headers, header_file, compress=0)

            # Decompression succeeded iff the file on disk grew.
            if os.path.getsize(trained_model) >= prev_size:
                print(
                    green(f'Success! Decompressed the model file. Exiting\n'))
                sys.exit(0)
            else:
                print(cyan(f'Error: failed to decompress model. Exiting\n'))
                sys.exit(0)

        print(green("\nData files found:"))
        if use_usher:
            print(f"UShER tree:\t{usher_protobuf}")
            print(f"Designated hash:\t{designated_hash}")
        else:
            print(f"Trained model:\t{trained_model}")
            print(f"Header file:\t{header_file}")
            print(f"Designated hash:\t{designated_hash}")

        config["trained_model"] = trained_model
        config["header_file"] = header_file
        config["designated_hash"] = designated_hash

    if use_usher:
        config["usher_protobuf"] = usher_protobuf

    if config['verbose']:
        print(green("\n**** CONFIG ****"))
        for k in sorted(config):
            print(green(k), config[k])

        status = snakemake.snakemake(snakefile,
                                     printshellcmds=True,
                                     forceall=True,
                                     force_incomplete=True,
                                     workdir=tempdir,
                                     config=config,
                                     cores=args.threads,
                                     lock=False)
    else:
        logger = custom_logger.Logger()
        status = snakemake.snakemake(snakefile,
                                     printshellcmds=False,
                                     forceall=True,
                                     force_incomplete=True,
                                     workdir=tempdir,
                                     config=config,
                                     cores=args.threads,
                                     lock=False,
                                     quiet=True,
                                     log_handler=config["log_api"])

    if status:  # translate "success" into shell exit code of 0
        return 0

    return 1
Пример #4
0
def log_handler(msg):
    """Return the log-handler callable of a freshly created Logger.

    The *msg* argument is accepted (to match the handler signature) but
    not used.
    """
    return custom_logger.Logger().log_handler