Пример #1
0
def run_main(parser, args):
    """Genotype a single sample against one probe set.

    The ``args`` value passed in is discarded: the arguments are re-read with
    ``parser.parse_args()``.  Coverage is collected with ``CoverageParser``,
    a ``Genotyper`` is run on it, and the genotyper's ``out_json`` is
    returned.  Temporary files are removed unless ``args.keep_tmp`` is set.
    """
    args = parser.parse_args()

    if args.ont:
        # The ONT switch overrides the error rate, the filters and the model.
        args.expected_error_rate = 0.15
        args.filters = ["LOW_GT_CONF"]
        args.model = "kmer_count"
        logger.debug("Setting expected error rate to %s (--ont)" %
                     args.expected_error_rate)
        logger.debug(
            "Removing LOW_PERCENT_COVERAGE filter (increases sensitivity - in particular for ONT data)")

    if args.min_variant_conf is None:
        args.min_variant_conf = 100

    coverage = CoverageParser(
        sample=args.sample,
        panel_file_paths=[args.probe_set],
        seq=args.seq,
        ctx=args.ctx,
        kmer=args.kmer,
        force=args.force,
        verbose=True,
        tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
        threads=args.threads,
        memory=args.memory,
        mccortex31_path=args.mccortex31_path)
    coverage.run()

    # Only estimate the depth when the caller did not provide one.
    if args.expected_depth is None:
        args.expected_depth = coverage.estimate_depth()

    # Per-sample metadata; "files" records the sequence files if given,
    # otherwise the ctx input.
    base_json = {
        args.sample: {
            "probe_set": args.probe_set,
            "files": args.seq if args.seq else args.ctx,
            "kmer": args.kmer,
            "version": __version__,
        }
    }

    genotyper = Genotyper(
        sample=args.sample,
        expected_error_rate=args.expected_error_rate,
        expected_depths=[args.expected_depth],
        variant_covgs=coverage.variant_covgs,
        gene_presence_covgs=coverage.covgs["presence"],
        base_json=base_json,
        contamination_depths=[],
        ignore_filtered=args.ignore_filtered,
        filters=args.filters,
        model=args.model,
        report_all_calls=args.report_all_calls,
        variant_confidence_threshold=args.min_variant_conf,
        sequence_confidence_threshold=args.min_gene_conf,
        min_gene_percent_covg_threshold=args.min_gene_percent_covg_threshold)
    genotyper.run()

    if not args.keep_tmp:
        coverage.remove_temporary_files()
    return genotyper.out_json
Пример #2
0
def run(parser, args):
    """Run the ``mykrobe predict`` workflow for one sample and write the results.

    Steps, in order:

    1. Resolve the reference data for the requested species from ``args``.
    2. Apply the ONT overrides (error rate and ploidy) when ``args.ont`` is set.
    3. Run ``CoverageParser`` over the reference probe sets.
    4. Work out the expected depth(s), via species detection when the
       reference data names a species phylo group.
    5. Genotype; when ``args.conf_percent_cutoff`` is below 100, derive a
       confidence threshold by simulation and genotype a second time.
    6. Make AMR predictions if a variant-to-resistance mapping is available.
    7. Assemble the per-sample JSON and hand it to ``write_outputs``.

    Args:
        parser: Not used by this function.
        args: Parsed command-line namespace.  Several attributes are mutated
            in place (``report_all_calls``, ``tmp``, ``expected_error_rate``,
            ``ploidy``).

    Returns:
        None.  The result is passed to ``write_outputs``.
    """
    logger.info(
        f"Start runnning mykrobe predict. Command line: {' '.join(sys.argv)}")
    base_json = {args.sample: {}}
    ref_data = ref_data_from_args(args)
    # A custom species with neither a resistance mapping nor a lineage file
    # gets every call reported.
    if (args.species == "custom" and ref_data["var_to_res_json"] is None
            and ref_data["lineage_json"] is None):
        logger.info(
            "Forcing --report_all_calls because species is 'custom' and options --custom_variant_to_resistance_json,--custom_lineage_json were not used"
        )
        args.report_all_calls = True
    logger.info(
        f"Running mykrobe predict using species {args.species}, and panel version {ref_data['version']}"
    )

    # Fall back to a freshly created temporary directory (with trailing slash).
    if args.tmp is None:
        args.tmp = tempfile.mkdtemp() + "/"

    if args.ont:
        args.expected_error_rate = ONT_E_RATE
        logger.info(
            f"Set expected error rate to {args.expected_error_rate} because --ont flag was used"
        )
        args.ploidy = ONT_PLOIDY
        logger.info(f"Set ploidy to {args.ploidy} because --ont flag used")

    # Run Cortex
    cp = CoverageParser(
        sample=args.sample,
        panel_file_paths=ref_data["fasta_files"],
        seq=args.seq,
        kmer=ref_data["kmer"],
        force=args.force,
        threads=args.threads,
        verbose=False,
        tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
        memory=args.memory,
    )
    cp.run()
    logger.debug("CoverageParser complete")

    # Without a species phylo group there is no species detection: use the
    # coverage parser's own depth estimate and leave phylogenetics empty.
    if ref_data["species_phylo_group"] is None:
        phylogenetics = {}
        depths = [cp.estimate_depth()]
    else:
        phylogenetics, depths = detect_species_and_get_depths(
            cp, ref_data["hierarchy_json"], ref_data["species_phylo_group"])

    # Genotype
    variant_calls_dict = {}
    sequence_calls_dict = {}
    lineage_calls_dict = {}
    lineage_predict_dict = {}
    # When forced, make sure there is a depth to genotype with even if species
    # detection returned none.
    if args.force and len(depths) == 0:
        depths = [cp.estimate_depth()]
    gt = None

    if len(depths) > 0 or args.force:
        # Running the genotyper changes the contents of cp.covgs["presence"].
        # The changes can cause a later second run of the genotyper (if it
        # happens) to crash.
        # Store the original cp.covgs["presence"] so we can use it again later.
        original_covgs_presence = copy.deepcopy(cp.covgs["presence"])
        gt = Genotyper(
            sample=args.sample,
            expected_depths=depths,
            expected_error_rate=args.expected_error_rate,
            variant_covgs=cp.variant_covgs,
            gene_presence_covgs=cp.covgs["presence"],
            base_json=base_json,
            contamination_depths=[],
            report_all_calls=True,
            ignore_filtered=True,
            filters=args.filters,
            variant_confidence_threshold=args.min_variant_conf,
            sequence_confidence_threshold=args.min_gene_conf,
            model=args.model,
            kmer_size=ref_data["kmer"],
            min_proportion_expected_depth=args.min_proportion_expected_depth,
            ploidy=args.ploidy,
            lineage_variants=ref_data["lineage_dict"],
        )
        gt.run()
        (
            kmer_count_error_rate,
            incorrect_kmer_to_pc_cov,
        ) = gt.estimate_kmer_count_error_rate_and_incorrect_kmer_to_percent_cov(
        )
        logger.debug("Estimated error rate for kmer count model: " +
                     str(round(100 * kmer_count_error_rate, 2)) + "%")
        # An estimated error rate above 0.001 is taken to mean ONT data when
        # guessing is enabled.  These overrides only affect what runs after
        # this point; the first genotyper run above has already completed.
        if args.guess_sequence_method and kmer_count_error_rate > 0.001:
            logger.warning(
                "Guess sequence method is on, and we've guessed ONT")
            # this is duplicated from the if args.ont statement above
            args.expected_error_rate = ONT_E_RATE
            logger.info(
                f"Set expected error rate to {args.expected_error_rate}")
            args.ploidy = ONT_PLOIDY
            logger.info(f"Set ploidy to {args.ploidy}")

        # conf_percent_cutoff == 100 means that we want to keep all variant calls,
        # in which case there is no need to run the simulations
        if args.conf_percent_cutoff < 100:
            logger.debug("Expected depth: " + str(depths[0]))
            conf_thresholder = ConfThresholder(
                kmer_count_error_rate,
                depths[0],
                ref_data["kmer"],
                incorrect_kmer_to_pc_cov,
            )
            # Time the threshold simulation for the debug log.
            time_start = time.time()
            conf_threshold = conf_thresholder.get_conf_threshold(
                percent_to_keep=args.conf_percent_cutoff)
            time_end = time.time()
            time_to_sim = time_end - time_start
            logger.debug("Simulation time: " + str(time_to_sim))
            logger.debug("Confidence cutoff (using percent cutoff " +
                         str(args.conf_percent_cutoff) + "%): " +
                         str(conf_threshold))
            # Restore the presence coverages that the first run modified.
            cp.covgs["presence"] = copy.deepcopy(original_covgs_presence)
            # Second pass: uses the estimated error rate and the simulated
            # confidence threshold instead of the command-line values.
            gt = Genotyper(
                sample=args.sample,
                expected_depths=depths,
                expected_error_rate=kmer_count_error_rate,
                variant_covgs=cp.variant_covgs,
                gene_presence_covgs=cp.covgs["presence"],
                base_json=base_json,
                contamination_depths=[],
                report_all_calls=True,
                ignore_filtered=True,
                filters=args.filters,
                variant_confidence_threshold=conf_threshold,
                sequence_confidence_threshold=args.min_gene_conf,
                model=args.model,
                kmer_size=ref_data["kmer"],
                min_proportion_expected_depth=args.
                min_proportion_expected_depth,
                ploidy=args.ploidy,
                lineage_variants=ref_data["lineage_dict"],
            )
            gt.run()

        # Results come from whichever genotyper ran last.
        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
        lineage_predict_dict, lineage_calls_dict = gt.predict_lineage()
    else:
        # No genotyping happened; still provide a depth estimate.
        depths = [cp.estimate_depth()]

    # Default (empty) susceptibility result, replaced below if prediction runs.
    mykrobe_predictor_susceptibility_result = MykrobePredictorSusceptibilityResult(
    )
    # Predict AMR only if genotyping ran, the depth is above the minimum (or
    # the run is forced) and a variant-to-resistance mapping is available.
    if (gt is not None and (max(depths) > args.min_depth or args.force)
            and ref_data["var_to_res_json"] is not None):
        predictor = BasePredictor(
            variant_calls=gt.variant_calls,
            called_genes=gt.sequence_calls_dict,
            base_json=base_json[args.sample],
            depth_threshold=args.min_depth,
            ignore_filtered=True,
            ignore_minor_calls=args.ignore_minor_calls,
            variant_to_resistance_json_fp=ref_data["var_to_res_json"],
        )
        mykrobe_predictor_susceptibility_result = predictor.run()
        logger.info("Progress: finished making AMR predictions")

    # Replace the per-sample entry wholesale with the final report structure.
    # For susceptibility and phylogenetics the first value of the respective
    # to_dict() result is used.
    base_json[args.sample] = {
        "susceptibility":
        list(mykrobe_predictor_susceptibility_result.to_dict().values())[0],
        "phylogenetics": {}
        if phylogenetics == {} else list(phylogenetics.to_dict().values())[0],
        "variant_calls":
        variant_calls_dict,
        "sequence_calls":
        sequence_calls_dict,
        "lineage_calls":
        lineage_calls_dict,
        "kmer":
        ref_data["kmer"],
        "probe_sets":
        ref_data["fasta_files"],
        "files":
        args.seq,
        "version": {
            "mykrobe-predictor": predictor_version,
            "mykrobe-atlas": atlas_version,
            "panel": ref_data["version"],
        },
        "genotype_model":
        args.model,
    }
    # Attach the lineage prediction only when there is one.
    if len(lineage_predict_dict) > 0:
        base_json[
            args.sample]["phylogenetics"]["lineage"] = lineage_predict_dict

    if not args.keep_tmp:
        cp.remove_temporary_files()

    logger.info("Progress: writing output")
    fix_X_amino_acid_variants(base_json[args.sample])
    write_outputs(args, base_json)
    logger.info("Progress: finished")
Пример #3
0
def run(parser, args):
    """Run AMR prediction for one sample and emit the result as JSON or CSV.

    The workflow is: choose the probe panels for the requested species and
    panel, run ``CoverageParser``, detect the species, genotype (optionally a
    second time with a simulated confidence threshold), run the
    species-specific predictor and finally write the report to
    ``args.output`` or print it.

    Args:
        parser: Argument parser; ``parser.parse_args()`` is called to obtain
            the arguments actually used.
        args: Only ``args.sample`` is read from this value (as the key of the
            report) before it is replaced by the re-parsed arguments.

    Raises:
        ValueError: If the custom panel is requested without
            ``custom_probe_set_path``, if the species is not recognised, or
            if no TB probe set is defined for the requested species/panel
            combination.  (The last two cases previously surfaced as an
            ``UnboundLocalError``.)
    """
    base_json = {args.sample: {}}
    args = parser.parse_args()
    hierarchy_json_file = None
    # Both stay None unless a branch below selects something; this keeps
    # every later use well-defined.
    tb_panels = None
    variant_to_resistance_json_fp = None
    if args.panel is not None:
        if args.species == "tb" and args.panel == "bradley-2015":
            tb_panels = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-bradley-probe-set-jan-2019.fasta.gz",
            ]
        elif args.species == "tb" and args.panel == "walker-2015":
            tb_panels = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-walker-probe-set-jan-2019.fasta.gz",
            ]
        elif args.species == "tb" and args.panel == "201901":
            tb_panels = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-hunt-probe-set-jan-03-2019.fasta.gz",
            ]
            # Only this panel ships its own variant-to-resistance mapping.
            data_dir = os.path.abspath(
                os.path.join(os.path.dirname(__file__), "../data/predict/tb/"))
            variant_to_resistance_json_fp = os.path.join(
                data_dir, "variant_to_resistance_drug-jan-03-2019.json")
        elif args.species == "tb" and args.panel == "atlas":
            tb_panels = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-walker-probe-set-jan-2019.fasta.gz",
                "data/panels/tb-k21-probe-set-feb-09-2017.fasta.gz",
            ]
        elif args.panel == "custom":
            if not args.custom_probe_set_path:
                raise ValueError("Custom panel requires custom_probe_set_path")
            tb_panels = [
                args.custom_probe_set_path,
                "data/panels/tb-species-170421.fasta.gz",
            ]
            variant_to_resistance_json_fp = args.custom_variant_to_resistance_json

    Predictor = None
    if not args.species:
        if tb_panels is None:
            raise ValueError(
                "No TB probe set is defined for species=%r, panel=%r" %
                (args.species, args.panel))
        panels = tb_panels + GN_PANELS + STAPH_PANELS
    elif args.species == "staph":
        panels = STAPH_PANELS
        Predictor = StaphPredictor
        args.kmer = 15  # Forced
        variant_to_resistance_json_fp = None
    elif args.species == "tb":
        if tb_panels is None:
            raise ValueError(
                "No TB probe set is defined for species=%r, panel=%r" %
                (args.species, args.panel))
        panels = tb_panels
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
        Predictor = TBPredictor
    else:
        raise ValueError("Unrecognised species %s" % args.species)
    logger.info("Running AMR prediction with panels %s" % ", ".join(panels))
    version = {}
    version["mykrobe-predictor"] = predictor_version
    version["mykrobe-atlas"] = atlas_version
    # Get real paths for panels (resolved relative to this file's parent dir)
    panels = [
        os.path.realpath(os.path.join(os.path.dirname(__file__), "..", f))
        for f in panels
    ]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(os.path.dirname(__file__), "..", hierarchy_json_file))
    # Run Cortex
    cp = CoverageParser(
        sample=args.sample,
        panel_file_paths=panels,
        seq=args.seq,
        kmer=args.kmer,
        force=args.force,
        threads=1,
        verbose=False,
        tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
        mccortex31_path=args.mccortex31_path,
    )
    cp.run()
    logger.debug("CoverageParser complete")

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get("complex",
                                       cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file,
    )
    phylogenetics = species_predictor.run()

    # ## AMR prediction

    # The expected depth is the median depth of the detected phylo group.
    depths = []
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Staphaureus"]["median_depth"]
        ]
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Mycobacterium_tuberculosis_complex"]["median_depth"]
        ]
    # Genotype
    # args.quiet is forced on while genotyping and restored afterwards.
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    # When forced without a detected species, genotype with a depth of 1.
    if args.force and not depths:
        depths = [1]
    gt = None

    if depths or args.force:
        gt = Genotyper(
            sample=args.sample,
            expected_depths=depths,
            expected_error_rate=args.expected_error_rate,
            variant_covgs=cp.variant_covgs,
            gene_presence_covgs=cp.covgs["presence"],
            base_json=base_json,
            contamination_depths=[],
            report_all_calls=True,
            ignore_filtered=True,
            filters=args.filters,
            variant_confidence_threshold=args.min_variant_conf,
            sequence_confidence_threshold=args.min_gene_conf,
            model=args.model,
            kmer_size=args.kmer,
            min_proportion_expected_depth=args.min_proportion_expected_depth,
            ploidy=args.ploidy,
        )
        gt.run()
        kmer_count_error_rate, incorrect_kmer_to_pc_cov = (
            gt.
            estimate_kmer_count_error_rate_and_incorrect_kmer_to_percent_cov())
        logger.info("Estimated error rate for kmer count model: " +
                    str(round(100 * kmer_count_error_rate, 2)) + "%")
        # An estimated error rate above 0.001 is treated as ONT data when
        # guessing is enabled.
        if args.guess_sequence_method and kmer_count_error_rate > 0.001:
            logger.warning(
                "Guess sequence method is on, and we've guessed ONT")
            args.ont = True

        if args.ont:
            args.expected_error_rate = 0.15
            args.ploidy = "haploid"
            args.ignore_minor_calls = True
            logger.warning("Setting ploidy to haploid")
            logger.warning("Setting ignore_minor_calls to True")
            logger.warning("Setting expected error rate to %s (--ont)" %
                           args.expected_error_rate)
            args.model = "kmer_count"

        # If the user didn't specify the conf_percent_cutoff, then set it
        # depending on whether or not the --ont flag was used
        if args.conf_percent_cutoff == -1:
            args.conf_percent_cutoff = 90 if args.ont else 100

        # conf_percent_cutoff == 100 means that we want to keep all variant calls,
        # in which case there is no need to run the simulations
        if args.conf_percent_cutoff < 100:
            logger.info("Expected depth: " + str(depths[0]))
            conf_thresholder = ConfThresholder(kmer_count_error_rate,
                                               depths[0], args.kmer,
                                               incorrect_kmer_to_pc_cov)
            time_start = time.time()
            conf_threshold = conf_thresholder.get_conf_threshold(
                percent_to_keep=args.conf_percent_cutoff)
            time_end = time.time()
            time_to_sim = time_end - time_start
            logger.info("Simulation time: " + str(time_to_sim))
            logger.info("Confidence cutoff (using percent cutoff " +
                        str(args.conf_percent_cutoff) + "%): " +
                        str(conf_threshold))
            # Second pass: uses the estimated error rate and the simulated
            # confidence threshold instead of the command-line values.
            gt = Genotyper(
                sample=args.sample,
                expected_depths=depths,
                expected_error_rate=kmer_count_error_rate,
                variant_covgs=cp.variant_covgs,
                gene_presence_covgs=cp.covgs["presence"],
                base_json=base_json,
                contamination_depths=[],
                report_all_calls=True,
                ignore_filtered=True,
                filters=args.filters,
                variant_confidence_threshold=conf_threshold,
                sequence_confidence_threshold=args.min_gene_conf,
                model=args.model,
                kmer_size=args.kmer,
                min_proportion_expected_depth=args.
                min_proportion_expected_depth,
                ploidy=args.ploidy,
            )
            gt.run()

        # Results come from whichever genotyper ran last.
        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        depths = [cp.estimate_depth()]
    args.quiet = q
    # Default (empty) susceptibility result, replaced below if prediction runs.
    mykrobe_predictor_susceptibility_result = MykrobePredictorSusceptibilityResult(
    )
    if gt is not None and (max(depths) > args.min_depth or args.force):
        # NOTE(review): Predictor is still None when no species was given, so
        # this call would fail in that case - confirm whether that
        # combination is meant to be supported.
        predictor = Predictor(
            variant_calls=gt.variant_calls,
            called_genes=gt.sequence_calls_dict,
            base_json=base_json[args.sample],
            depth_threshold=args.min_depth,
            ignore_filtered=True,
            ignore_minor_calls=args.ignore_minor_calls,
            variant_to_resistance_json_fp=variant_to_resistance_json_fp,
        )
        mykrobe_predictor_susceptibility_result = predictor.run()
    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version,
        model=args.model,
    ).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()

    # Write to file if specified by the user, otherwise send to stdout
    if args.output_format == "csv":
        output = json_to_csv(base_json)
    else:
        # Verbose json output requires --report_all_calls
        if not args.report_all_calls:
            del base_json[args.sample]["variant_calls"]
            del base_json[args.sample]["sequence_calls"]
        output = json.dumps(base_json, indent=4)

    if args.output:
        with open(args.output, "w") as outfile:
            outfile.write(output)
    else:
        print(output)
Пример #4
0
def run_main(parser, args):
    """Genotype a single sample against one probe set and return the result.

    The ``args`` value passed in is discarded: the arguments are re-read with
    ``parser.parse_args()``.  Coverage is collected with ``CoverageParser``,
    a ``Genotyper`` is run on it (with lineage variants loaded from
    ``args.lineage`` when given), and the genotyper's ``out_json`` is written
    to ``args.output`` if set and returned.  Temporary files are removed
    unless ``args.keep_tmp`` is set.
    """
    args = parser.parse_args()

    if args.ont:
        args.expected_error_rate = ONT_E_RATE
        logger.debug("Setting expected error rate to %s (--ont)" %
                     args.expected_error_rate)

    # Fill in defaults the command line left unset.
    if args.min_variant_conf is None:
        args.min_variant_conf = 100
    if args.tmp is None:
        args.tmp = tempfile.mkdtemp() + "/"

    coverage = CoverageParser(
        sample=args.sample,
        panel_file_paths=[args.probe_set],
        seq=args.seq,
        ctx=args.ctx,
        kmer=args.kmer,
        force=args.force,
        verbose=True,
        tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
        threads=args.threads,
        memory=args.memory,
    )
    coverage.run()

    # Only estimate the depth when the caller did not provide one.
    if args.expected_depth is None:
        args.expected_depth = coverage.estimate_depth()

    # Per-sample metadata; "files" records the sequence files if given,
    # otherwise the ctx input.
    base_json = {
        args.sample: {
            "probe_set": args.probe_set,
            "files": args.seq if args.seq else args.ctx,
            "kmer": args.kmer,
            "version": __version__,
        }
    }

    lineage_dict = None if args.lineage is None else load_json(args.lineage)

    genotyper = Genotyper(
        sample=args.sample,
        expected_error_rate=args.expected_error_rate,
        expected_depths=[args.expected_depth],
        variant_covgs=coverage.variant_covgs,
        gene_presence_covgs=coverage.covgs["presence"],
        base_json=base_json,
        contamination_depths=[],
        ignore_filtered=args.ignore_filtered,
        filters=args.filters,
        model=args.model,
        report_all_calls=args.report_all_calls,
        variant_confidence_threshold=args.min_variant_conf,
        sequence_confidence_threshold=args.min_gene_conf,
        min_gene_percent_covg_threshold=args.min_gene_percent_covg_threshold,
        kmer_size=args.kmer,
        min_proportion_expected_depth=args.min_proportion_expected_depth,
        ploidy=args.ploidy,
        lineage_variants=lineage_dict,
    )
    genotyper.run()

    if args.output:
        with open(args.output, "w") as outfile:
            json.dump(genotyper.out_json, outfile, indent=4)

    if not args.keep_tmp:
        coverage.remove_temporary_files()
    return genotyper.out_json
Пример #5
0
def run(parser, args):
    """Run AMR prediction for one sample and emit JSON and/or CSV output.

    The workflow is: resolve the probe panel for the requested species, run
    ``CoverageParser``, detect the species, genotype (optionally a second time
    with a simulated confidence threshold), run the species-specific predictor
    and write the report to ``args.output`` or print it.

    Args:
        parser: Argument parser; ``parser.parse_args()`` is called to obtain
            the arguments actually used.
        args: Only ``args.sample`` is read from this value (as the key of the
            report) before it is replaced by the re-parsed arguments.

    Raises:
        ValueError: If a custom panel is requested without
            ``custom_probe_set_path``, if the species is not recognised, or
            if ``args.output_format`` is not one of csv, json, json_and_csv.
        FileNotFoundError: If the custom probe set, or a supplied custom
            variant-to-resistance JSON, does not exist.
    """
    base_json = {args.sample: {}}
    args = parser.parse_args()
    hierarchy_json_file = None
    variant_to_resistance_json_fp: Optional[PathLike] = None
    species = Species(args.species)
    # Named panels only exist for TB; every other species uses "default"
    # unless a custom panel was requested.
    if species is not Species.TB and args.panel != "custom":
        args.panel = "default"
    panel = Panel.from_species_and_name(species, args.panel)

    if species is Species.TB and panel.name is TbPanel.NEJM_WALKER:
        data_dir = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "../data/predict/tb/"))
        variant_to_resistance_json_fp = os.path.join(
            data_dir, "variant_to_resistance_drug-jan-03-2019.json")
    if panel.name in (TbPanel.CUSTOM, StaphPanel.CUSTOM):
        if not args.custom_probe_set_path:
            raise ValueError("Custom panel requires custom_probe_set_path")

        if not os.path.exists(args.custom_probe_set_path):
            raise FileNotFoundError(
                f"Custom probe path {args.custom_probe_set_path} does not exist!"
            )
        panel.add_path(args.custom_probe_set_path)

        # The mapping is validated only when one was supplied:
        # os.path.exists(None) raises TypeError, and the predictor is handed
        # None for other panels in this function as well.
        if args.custom_variant_to_resistance_json is not None:
            if not os.path.exists(args.custom_variant_to_resistance_json):
                raise FileNotFoundError(
                    ("Custom variant to resistance json "
                     f"{args.custom_variant_to_resistance_json} does not exist!"
                     ))
            variant_to_resistance_json_fp = (
                args.custom_variant_to_resistance_json)

    if species is Species.STAPH:
        Predictor = StaphPredictor
        args.kmer = 15  # Forced
    elif species is Species.TB:
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
        Predictor = TBPredictor
    else:
        raise ValueError(f"Unrecognised species {species}")

    logger.info("Running AMR prediction with panels %s" %
                ", ".join(panel.paths))
    version = {
        "mykrobe-predictor": predictor_version,
        "mykrobe-atlas": atlas_version,
    }
    # Get real paths for panels (resolved relative to this file's parent dir)
    panel_paths = [
        os.path.realpath(os.path.join(os.path.dirname(__file__), "..", f))
        for f in panel.paths
    ]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(os.path.dirname(__file__), "..", hierarchy_json_file))
    # Run Cortex
    cp = CoverageParser(
        sample=args.sample,
        panel_file_paths=panel_paths,
        seq=args.seq,
        kmer=args.kmer,
        force=args.force,
        threads=1,
        verbose=False,
        tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
    )
    cp.run()
    logger.debug("CoverageParser complete")

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get("complex",
                                       cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file,
    )
    phylogenetics = species_predictor.run()

    # ## AMR prediction

    # The expected depth is the median depth of the detected phylo group.
    depths = []
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Staphaureus"]["median_depth"]
        ]
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Mycobacterium_tuberculosis_complex"]["median_depth"]
        ]
    # Genotype
    # args.quiet is forced on while genotyping and restored afterwards.
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    # When forced without a detected species, genotype with a depth of 1.
    if args.force and not depths:
        depths = [1]
    gt = None

    if depths or args.force:
        gt = Genotyper(
            sample=args.sample,
            expected_depths=depths,
            expected_error_rate=args.expected_error_rate,
            variant_covgs=cp.variant_covgs,
            gene_presence_covgs=cp.covgs["presence"],
            base_json=base_json,
            contamination_depths=[],
            report_all_calls=True,
            ignore_filtered=True,
            filters=args.filters,
            variant_confidence_threshold=args.min_variant_conf,
            sequence_confidence_threshold=args.min_gene_conf,
            model=args.model,
            kmer_size=args.kmer,
            min_proportion_expected_depth=args.min_proportion_expected_depth,
            ploidy=args.ploidy,
        )
        gt.run()
        (
            kmer_count_error_rate,
            incorrect_kmer_to_pc_cov,
        ) = gt.estimate_kmer_count_error_rate_and_incorrect_kmer_to_percent_cov(
        )
        logger.info("Estimated error rate for kmer count model: " +
                    str(round(100 * kmer_count_error_rate, 2)) + "%")
        # An estimated error rate above 0.001 is treated as ONT data when
        # guessing is enabled.
        if args.guess_sequence_method and kmer_count_error_rate > 0.001:
            logger.warning(
                "Guess sequence method is on, and we've guessed ONT")
            args.ont = True

        if args.ont:
            args.expected_error_rate = 0.15
            args.ploidy = "haploid"
            args.ignore_minor_calls = True
            logger.warning("Setting ploidy to haploid")
            logger.warning("Setting ignore_minor_calls to True")
            logger.warning("Setting expected error rate to %s (--ont)" %
                           args.expected_error_rate)
            args.model = "kmer_count"

        # If the user didn't specify the conf_percent_cutoff, then set it
        # depending on whether or not the --ont flag was used
        if args.conf_percent_cutoff == -1:
            args.conf_percent_cutoff = 90 if args.ont else 100

        # conf_percent_cutoff == 100 means that we want to keep all variant calls,
        # in which case there is no need to run the simulations
        if args.conf_percent_cutoff < 100:
            logger.info("Expected depth: " + str(depths[0]))
            conf_thresholder = ConfThresholder(kmer_count_error_rate,
                                               depths[0], args.kmer,
                                               incorrect_kmer_to_pc_cov)
            time_start = time.time()
            conf_threshold = conf_thresholder.get_conf_threshold(
                percent_to_keep=args.conf_percent_cutoff)
            time_end = time.time()
            time_to_sim = time_end - time_start
            logger.info("Simulation time: " + str(time_to_sim))
            logger.info("Confidence cutoff (using percent cutoff " +
                        str(args.conf_percent_cutoff) + "%): " +
                        str(conf_threshold))
            # Second pass: uses the estimated error rate and the simulated
            # confidence threshold instead of the command-line values.
            gt = Genotyper(
                sample=args.sample,
                expected_depths=depths,
                expected_error_rate=kmer_count_error_rate,
                variant_covgs=cp.variant_covgs,
                gene_presence_covgs=cp.covgs["presence"],
                base_json=base_json,
                contamination_depths=[],
                report_all_calls=True,
                ignore_filtered=True,
                filters=args.filters,
                variant_confidence_threshold=conf_threshold,
                sequence_confidence_threshold=args.min_gene_conf,
                model=args.model,
                kmer_size=args.kmer,
                min_proportion_expected_depth=args.
                min_proportion_expected_depth,
                ploidy=args.ploidy,
            )
            gt.run()

        # Results come from whichever genotyper ran last.
        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        depths = [cp.estimate_depth()]
    args.quiet = q
    # Default (empty) susceptibility result, replaced below if prediction runs.
    mykrobe_predictor_susceptibility_result = MykrobePredictorSusceptibilityResult(
    )
    if gt is not None and (max(depths) > args.min_depth or args.force):
        predictor = Predictor(
            variant_calls=gt.variant_calls,
            called_genes=gt.sequence_calls_dict,
            base_json=base_json[args.sample],
            depth_threshold=args.min_depth,
            ignore_filtered=True,
            ignore_minor_calls=args.ignore_minor_calls,
            variant_to_resistance_json_fp=variant_to_resistance_json_fp,
        )
        mykrobe_predictor_susceptibility_result = predictor.run()
    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panel_paths,
        files=args.seq,
        kmer=args.kmer,
        version=version,
        model=args.model,
    ).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()

    outputs = {}

    # The CSV is rendered first, before the call details are stripped from
    # base_json for the JSON output below.
    if args.output_format in ["csv", "json_and_csv"]:
        outputs["csv"] = json_to_csv(base_json)
    if args.output_format in ["json", "json_and_csv"]:
        # Verbose json output requires --report_all_calls
        if not args.report_all_calls:
            del base_json[args.sample]["variant_calls"]
            del base_json[args.sample]["sequence_calls"]
        outputs["json"] = json.dumps(base_json, indent=4)

    if not outputs:
        raise ValueError(
            (f"Output format must be one of: csv,json,json_and_csv. Got "
             f"'{args.output_format}'"))

    for output_type, output in outputs.items():
        # Write to file if specified by the user, otherwise send to stdout
        if args.output:
            # With both formats requested, each gets its own file suffix.
            if args.output_format == "json_and_csv":
                outfile = args.output + "." + output_type
            else:
                outfile = args.output
            with open(outfile, "w") as f:
                f.write(output)
        else:
            print(output)
Пример #6
0
def _select_tb_panels(args):
    """Return the TB probe-set paths selected by ``args.panel``.

    Returns None when ``args.panel`` is None or is not one of
    "bradley-2015", "walker-2015" or "custom".

    Raises:
        ValueError: if the "custom" panel is requested without
            ``args.custom_probe_set_path``.
    """
    if args.panel == "bradley-2015":
        return [
            "data/panels/tb-species-170421.fasta.gz",
            "data/panels/tb-bradley-probe-set-feb-09-2017.fasta.gz",
        ]
    if args.panel == "walker-2015":
        return [
            "data/panels/tb-species-170421.fasta.gz",
            "data/panels/tb-walker-probe-set-feb-09-2017.fasta.gz",
        ]
    if args.panel == "custom":
        if not args.custom_probe_set_path:
            raise ValueError("Custom panel requires custom_probe_set_path")
        return [
            args.custom_probe_set_path,
            "data/panels/tb-species-170421.fasta.gz",
        ]
    return None


def run(parser, args):
    """Run the AMR prediction pipeline and emit the result as JSON.

    Steps, in order: choose the probe panels, run ``CoverageParser``,
    detect the species with ``AMRSpeciesPredictor``, genotype with
    ``Genotyper``, run the species-specific predictor, then write the JSON
    document to ``args.output`` (or print it to stdout when no output path
    is given).

    Args:
        parser: object whose ``parse_args()`` returns the options namespace.
        args: ignored; the options are always re-read from
            ``parser.parse_args()``. Kept for interface compatibility.

    Raises:
        ValueError: if ``args.species`` is set to something other than
            "staph" or "tb", if TB panels are needed but ``args.panel`` does
            not select any, or if the "custom" panel is requested without a
            probe-set path.
    """
    # Parse first so that every use of ``args.sample`` below (including the
    # key of ``base_json``) comes from the same namespace.
    args = parser.parse_args()
    base_json = {args.sample: {}}
    hierarchy_json_file = None

    # Validate the panel/species combination up front, so that a bad
    # combination fails with a clear message before any expensive work.
    tb_panels = _select_tb_panels(args)
    if args.species and args.species not in ("staph", "tb"):
        raise ValueError(
            "Unsupported species '%s': expected 'staph' or 'tb'" %
            args.species)
    if args.species != "staph" and tb_panels is None:
        raise ValueError(
            "TB panels are required but panel '%s' does not select any: "
            "expected 'bradley-2015', 'walker-2015' or 'custom'" % args.panel)

    predictor_cls = None
    if not args.species:
        panels = tb_panels + GN_PANELS + STAPH_PANELS
    elif args.species == "staph":
        panels = STAPH_PANELS
        predictor_cls = StaphPredictor
        args.kmer = 15  # Forced
    else:  # "tb" (validated above)
        panels = tb_panels
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
        predictor_cls = TBPredictor
    logger.info("Running AMR prediction with panels %s", ", ".join(panels))

    version = {
        "mykrobe-predictor": predictor_version,
        "mykrobe-atlas": atlas_version,
    }

    # Resolve panel (and hierarchy) paths relative to the parent of this
    # module's directory; absolute paths are left as they are by join().
    package_root = os.path.join(os.path.dirname(__file__), "..")
    panels = [os.path.realpath(os.path.join(package_root, f)) for f in panels]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(package_root, hierarchy_json_file))

    if args.ont:
        args.expected_error_rate = 0.15
        logger.debug("Setting expected error rate to %s (--ont)" %
                     args.expected_error_rate)
        args.filters = ["LOW_GT_CONF"]
        args.model = "kmer_count"

    # Run Cortex
    cp = CoverageParser(sample=args.sample,
                        panel_file_paths=panels,
                        seq=args.seq,
                        kmer=args.kmer,
                        force=args.force,
                        threads=1,
                        verbose=False,
                        tmp_dir=args.tmp,
                        skeleton_dir=args.skeleton_dir,
                        mccortex31_path=args.mccortex31_path)
    cp.run()
    logger.debug('CoverageParser complete')

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get("complex",
                                       cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file)
    phylogenetics = species_predictor.run()

    # Expected depth is the median depth of the detected phylo group.
    depths = []
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Staphaureus"]["median_depth"]
        ]
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Mycobacterium_tuberculosis_complex"]["median_depth"]
        ]

    # Genotype. ``args.quiet`` is forced on for the duration and restored
    # afterwards, even if genotyping raises.
    variant_calls_dict = {}
    sequence_calls_dict = {}
    gt = None
    previous_quiet = args.quiet
    args.quiet = True
    try:
        if args.force and not depths:
            depths = [1]
        if depths or args.force:
            gt = Genotyper(sample=args.sample,
                           expected_depths=depths,
                           expected_error_rate=args.expected_error_rate,
                           variant_covgs=cp.variant_covgs,
                           gene_presence_covgs=cp.covgs["presence"],
                           base_json=base_json,
                           contamination_depths=[],
                           report_all_calls=True,
                           ignore_filtered=True,
                           filters=args.filters,
                           variant_confidence_threshold=args.min_variant_conf,
                           sequence_confidence_threshold=args.min_gene_conf,
                           model=args.model)
            gt.run()
            variant_calls_dict = gt.variant_calls_dict
            sequence_calls_dict = gt.sequence_calls_dict
        else:
            depths = [cp.estimate_depth()]
    finally:
        args.quiet = previous_quiet

    # Starts empty; only replaced when a predictor actually runs.
    mykrobe_predictor_susceptibility_result = MykrobePredictorSusceptibilityResult(
    )
    if gt is not None and (max(depths) > args.min_depth or args.force):
        if predictor_cls is None:
            # No species was given, so no predictor class was chosen.
            logger.warning(
                "No species specified: skipping susceptibility prediction")
        else:
            predictor = predictor_cls(
                variant_calls=gt.variant_calls,
                called_genes=gt.sequence_calls_dict,
                base_json=base_json[args.sample],
                depth_threshold=args.min_depth,
                ignore_filtered=True,
                ignore_minor_calls=args.ont,
                variant_to_resistance_json_fp=args.
                custom_variant_to_resistance_json)
            mykrobe_predictor_susceptibility_result = predictor.run()

    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version,
        model=args.model).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()

    # Write to the file specified by the user, otherwise send to stdout.
    if args.output:
        with open(args.output, 'w') as outfile:
            json.dump(base_json, outfile, indent=4)
    else:
        print(json.dumps(base_json, indent=4))