Python CoverageParser примеры использования

Язык программирования: Python

Пространство имен/Пакет: mykrobe.typing

Класс/Тип: CoverageParser

Примеров на hotexamples.com: 6

Python CoverageParser - 6 примеров найдено. Это лучшие примеры Python кода для mykrobe.typing.CoverageParser, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

CoverageParser(6)

estimate_depth(6)

remove_temporary_files(6)

run(6)

Пример #1

Показать файл

Файл: genotype.py Проект: Phelimb/mykrobe-atlas-cli

def run_main(parser, args):
    """Genotype one sample against a single probe set.

    The ``args`` argument is replaced straight away by the result of
    ``parser.parse_args()``, so only ``parser`` determines the settings.

    The function applies the ``--ont`` overrides, defaults
    ``min_variant_conf`` to 100, runs ``CoverageParser``, fills in
    ``expected_depth`` from ``estimate_depth()`` when none was supplied,
    runs ``Genotyper`` and finally removes the parser's temporary files
    unless ``keep_tmp`` is set.

    Returns:
        The ``out_json`` attribute of the ``Genotyper`` after it has run.
    """
    args = parser.parse_args()

    if args.ont:
        # --ont replaces the error rate, the filter list and the model.
        args.expected_error_rate = 0.15
        args.filters = ["LOW_GT_CONF"]
        args.model = "kmer_count"
        error_rate_message = ("Setting expected error rate to %s (--ont)" %
                              args.expected_error_rate)
        logger.debug(error_rate_message)
        logger.debug(
            "Removing LOW_PERCENT_COVERAGE filter (increases sensitivity - in particular for ONT data)")
    if args.min_variant_conf is None:
        args.min_variant_conf = 100

    coverage = CoverageParser(
        sample=args.sample,
        panel_file_paths=[args.probe_set],
        seq=args.seq,
        ctx=args.ctx,
        kmer=args.kmer,
        force=args.force,
        verbose=True,
        tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
        threads=args.threads,
        memory=args.memory,
        mccortex31_path=args.mccortex31_path)
    coverage.run()

    # Only estimate the depth when the caller did not provide one.
    if args.expected_depth is None:
        args.expected_depth = coverage.estimate_depth()

    # Report stub handed to the genotyper; "files" is the sequence input
    # when there is one, otherwise the ctx input.
    base_json = {
        args.sample: {
            "probe_set": args.probe_set,
            "files": args.seq if args.seq else args.ctx,
            "kmer": args.kmer,
            "version": __version__,
        }
    }

    genotyper = Genotyper(
        sample=args.sample,
        expected_error_rate=args.expected_error_rate,
        expected_depths=[args.expected_depth],
        variant_covgs=coverage.variant_covgs,
        gene_presence_covgs=coverage.covgs["presence"],
        base_json=base_json,
        contamination_depths=[],
        ignore_filtered=args.ignore_filtered,
        filters=args.filters,
        model=args.model,
        report_all_calls=args.report_all_calls,
        variant_confidence_threshold=args.min_variant_conf,
        sequence_confidence_threshold=args.min_gene_conf,
        min_gene_percent_covg_threshold=args.min_gene_percent_covg_threshold)
    genotyper.run()

    if not args.keep_tmp:
        coverage.remove_temporary_files()
    return genotyper.out_json

Пример #2

Показать файл

def run(parser, args):
    """Run ``mykrobe predict`` for one sample and write its outputs.

    ``parser`` is accepted but not used; every setting is read from ``args``.
    Some ``args`` attributes are updated in place along the way
    (``report_all_calls``, ``tmp``, ``expected_error_rate``, ``ploidy``).

    Order of work:
      1. resolve the reference data with ``ref_data_from_args``;
      2. run ``CoverageParser`` over the reference FASTA files;
      3. obtain expected depths, from species detection when a phylo group
         is configured and from ``estimate_depth()`` otherwise;
      4. genotype, and genotype a second time with a simulated confidence
         threshold when ``conf_percent_cutoff`` is below 100;
      5. predict susceptibility when a genotyper ran, the depth (or
         ``force``) allows it and a variant-to-resistance JSON is available;
      6. assemble the per-sample report, remove temporary files unless
         ``keep_tmp`` is set, and call ``write_outputs``.
    """
    command_line = " ".join(sys.argv)
    logger.info(
        f"Start runnning mykrobe predict. Command line: {command_line}")
    base_json = {args.sample: {}}
    ref_data = ref_data_from_args(args)

    if (args.species == "custom" and ref_data["var_to_res_json"] is None
            and ref_data["lineage_json"] is None):
        logger.info(
            "Forcing --report_all_calls because species is 'custom' and options --custom_variant_to_resistance_json,--custom_lineage_json were not used"
        )
        args.report_all_calls = True

    panel_version = ref_data["version"]
    logger.info(
        f"Running mykrobe predict using species {args.species}, and panel version {panel_version}"
    )

    if args.tmp is None:
        args.tmp = tempfile.mkdtemp() + "/"

    if args.ont:
        args.expected_error_rate = ONT_E_RATE
        logger.info(
            f"Set expected error rate to {args.expected_error_rate} because --ont flag was used"
        )
        args.ploidy = ONT_PLOIDY
        logger.info(f"Set ploidy to {args.ploidy} because --ont flag used")

    # Run Cortex
    coverage = CoverageParser(
        sample=args.sample,
        panel_file_paths=ref_data["fasta_files"],
        seq=args.seq,
        kmer=ref_data["kmer"],
        force=args.force,
        threads=args.threads,
        verbose=False,
        tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
        memory=args.memory,
    )
    coverage.run()
    logger.debug("CoverageParser complete")

    if ref_data["species_phylo_group"] is not None:
        phylogenetics, depths = detect_species_and_get_depths(
            coverage, ref_data["hierarchy_json"],
            ref_data["species_phylo_group"])
    else:
        phylogenetics = {}
        depths = [coverage.estimate_depth()]

    # Genotype
    variant_calls_dict = {}
    sequence_calls_dict = {}
    lineage_calls_dict = {}
    lineage_predict_dict = {}
    if args.force and len(depths) == 0:
        depths = [coverage.estimate_depth()]
    gt = None

    def new_genotyper(error_rate, variant_conf):
        # Everything except the two parameters is looked up when this is
        # called, not when it is defined: args.ploidy and
        # coverage.covgs["presence"] can both be replaced between passes.
        return Genotyper(
            sample=args.sample,
            expected_depths=depths,
            expected_error_rate=error_rate,
            variant_covgs=coverage.variant_covgs,
            gene_presence_covgs=coverage.covgs["presence"],
            base_json=base_json,
            contamination_depths=[],
            report_all_calls=True,
            ignore_filtered=True,
            filters=args.filters,
            variant_confidence_threshold=variant_conf,
            sequence_confidence_threshold=args.min_gene_conf,
            model=args.model,
            kmer_size=ref_data["kmer"],
            min_proportion_expected_depth=args.min_proportion_expected_depth,
            ploidy=args.ploidy,
            lineage_variants=ref_data["lineage_dict"],
        )

    if len(depths) == 0 and not args.force:
        # Nothing to genotype against; still record a depth estimate.
        depths = [coverage.estimate_depth()]
    else:
        # Running the genotyper changes the contents of
        # coverage.covgs["presence"], and those changes can make a second
        # genotyper run crash. Keep a pristine copy to restore from.
        pristine_presence = copy.deepcopy(coverage.covgs["presence"])
        gt = new_genotyper(args.expected_error_rate, args.min_variant_conf)
        gt.run()
        estimates = (
            gt.estimate_kmer_count_error_rate_and_incorrect_kmer_to_percent_cov()
        )
        kmer_count_error_rate, incorrect_kmer_to_pc_cov = estimates
        error_percent = round(100 * kmer_count_error_rate, 2)
        logger.debug(
            f"Estimated error rate for kmer count model: {error_percent!s}%")

        if args.guess_sequence_method and kmer_count_error_rate > 0.001:
            logger.warning(
                "Guess sequence method is on, and we've guessed ONT")
            # Same settings as the explicit --ont branch near the top.
            args.expected_error_rate = ONT_E_RATE
            logger.info(
                f"Set expected error rate to {args.expected_error_rate}")
            args.ploidy = ONT_PLOIDY
            logger.info(f"Set ploidy to {args.ploidy}")

        # A cutoff of 100 keeps every variant call, so the simulations
        # (and the second pass) are only needed below that.
        if args.conf_percent_cutoff < 100:
            logger.debug(f"Expected depth: {depths[0]!s}")
            conf_thresholder = ConfThresholder(
                kmer_count_error_rate,
                depths[0],
                ref_data["kmer"],
                incorrect_kmer_to_pc_cov,
            )
            time_start = time.time()
            conf_threshold = conf_thresholder.get_conf_threshold(
                percent_to_keep=args.conf_percent_cutoff)
            time_to_sim = time.time() - time_start
            logger.debug(f"Simulation time: {time_to_sim!s}")
            logger.debug(
                f"Confidence cutoff (using percent cutoff {args.conf_percent_cutoff!s}%): {conf_threshold!s}"
            )
            coverage.covgs["presence"] = copy.deepcopy(pristine_presence)
            gt = new_genotyper(kmer_count_error_rate, conf_threshold)
            gt.run()

        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
        lineage_predict_dict, lineage_calls_dict = gt.predict_lineage()

    mykrobe_predictor_susceptibility_result = (
        MykrobePredictorSusceptibilityResult())
    if (gt is not None and (max(depths) > args.min_depth or args.force)
            and ref_data["var_to_res_json"] is not None):
        predictor = BasePredictor(
            variant_calls=gt.variant_calls,
            called_genes=gt.sequence_calls_dict,
            base_json=base_json[args.sample],
            depth_threshold=args.min_depth,
            ignore_filtered=True,
            ignore_minor_calls=args.ignore_minor_calls,
            variant_to_resistance_json_fp=ref_data["var_to_res_json"],
        )
        mykrobe_predictor_susceptibility_result = predictor.run()
        logger.info("Progress: finished making AMR predictions")

    # Both result objects serialise to a one-entry mapping; the report
    # stores that entry's value.
    susceptibility_report = list(
        mykrobe_predictor_susceptibility_result.to_dict().values())[0]
    if phylogenetics == {}:
        phylogenetics_report = {}
    else:
        phylogenetics_report = list(phylogenetics.to_dict().values())[0]

    base_json[args.sample] = {
        "susceptibility": susceptibility_report,
        "phylogenetics": phylogenetics_report,
        "variant_calls": variant_calls_dict,
        "sequence_calls": sequence_calls_dict,
        "lineage_calls": lineage_calls_dict,
        "kmer": ref_data["kmer"],
        "probe_sets": ref_data["fasta_files"],
        "files": args.seq,
        "version": {
            "mykrobe-predictor": predictor_version,
            "mykrobe-atlas": atlas_version,
            "panel": ref_data["version"],
        },
        "genotype_model": args.model,
    }
    if len(lineage_predict_dict) > 0:
        sample_report = base_json[args.sample]
        sample_report["phylogenetics"]["lineage"] = lineage_predict_dict

    if not args.keep_tmp:
        coverage.remove_temporary_files()

    logger.info("Progress: writing output")
    fix_X_amino_acid_variants(base_json[args.sample])
    write_outputs(args, base_json)
    logger.info("Progress: finished")

Пример #3

Показать файл

Файл: amr.py Проект: leoisl/mykrobe

def run(parser, args):
    """Run species detection, genotyping and AMR prediction for one sample.

    ``base_json`` is keyed by the ``sample`` of the ``args`` passed in;
    ``args`` is then replaced by ``parser.parse_args()`` and all further
    settings come from that re-parsed namespace.

    Order of work:
      1. choose the probe panels from ``args.panel`` / ``args.species``;
      2. run ``CoverageParser`` and ``AMRSpeciesPredictor``;
      3. genotype when a depth is available (or ``force`` is set), with a
         second pass when ``conf_percent_cutoff`` ends up below 100;
      4. run the species-specific predictor when one exists;
      5. emit CSV or JSON to ``args.output`` or to stdout.

    Raises:
        ValueError: if the custom panel is selected without
            ``custom_probe_set_path``, if TB probe panels are needed but
            ``args.panel`` selected none, or if ``args.species`` is neither
            empty, ``"staph"`` nor ``"tb"``. The last two situations used to
            surface as ``UnboundLocalError``.
    """
    # Function-scope import: only needed for the presence-coverage snapshot.
    import copy

    base_json = {args.sample: {}}
    args = parser.parse_args()
    hierarchy_json_file = None
    # Bound up front so that every later read is defined, whichever
    # panel/species branch ran.
    variant_to_resistance_json_fp = None
    tb_panels = None
    tb_species_panel = "data/panels/tb-species-170421.fasta.gz"
    if args.panel is not None:
        if args.species == "tb" and args.panel == "bradley-2015":
            tb_panels = [
                tb_species_panel,
                "data/panels/tb-bradley-probe-set-jan-2019.fasta.gz",
            ]
        elif args.species == "tb" and args.panel == "walker-2015":
            tb_panels = [
                tb_species_panel,
                "data/panels/tb-walker-probe-set-jan-2019.fasta.gz",
            ]
        elif args.species == "tb" and args.panel == "201901":
            tb_panels = [
                tb_species_panel,
                "data/panels/tb-hunt-probe-set-jan-03-2019.fasta.gz",
            ]
            data_dir = os.path.abspath(
                os.path.join(os.path.dirname(__file__), "../data/predict/tb/"))
            variant_to_resistance_json_fp = os.path.join(
                data_dir, "variant_to_resistance_drug-jan-03-2019.json")
        elif args.species == "tb" and args.panel == "atlas":
            tb_panels = [
                tb_species_panel,
                "data/panels/tb-walker-probe-set-jan-2019.fasta.gz",
                "data/panels/tb-k21-probe-set-feb-09-2017.fasta.gz",
            ]
        elif args.panel == "custom":
            if not args.custom_probe_set_path:
                raise ValueError("Custom panel requires custom_probe_set_path")
            tb_panels = [
                args.custom_probe_set_path,
                tb_species_panel,
            ]
            variant_to_resistance_json_fp = args.custom_variant_to_resistance_json

    # TB probes are needed both for species "tb" and when no species is given.
    if (not args.species or args.species == "tb") and tb_panels is None:
        raise ValueError(
            "No TB probe panel could be selected for species %r and panel %r"
            % (args.species, args.panel))

    Predictor = None
    if not args.species:
        panels = tb_panels + GN_PANELS + STAPH_PANELS
    elif args.species == "staph":
        panels = STAPH_PANELS
        Predictor = StaphPredictor
        args.kmer = 15  # Forced
        variant_to_resistance_json_fp = None
    elif args.species == "tb":
        panels = tb_panels
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
        Predictor = TBPredictor
    else:
        raise ValueError("Unrecognised species %s" % args.species)
    logger.info("Running AMR prediction with panels %s" % ", ".join(panels))
    version = {
        "mykrobe-predictor": predictor_version,
        "mykrobe-atlas": atlas_version,
    }
    # Panel paths are relative to the parent of this module's directory.
    package_root = os.path.join(os.path.dirname(__file__), "..")
    panels = [os.path.realpath(os.path.join(package_root, f)) for f in panels]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(package_root, hierarchy_json_file))
    # Run Cortex
    cp = CoverageParser(
        sample=args.sample,
        panel_file_paths=panels,
        seq=args.seq,
        kmer=args.kmer,
        force=args.force,
        threads=1,
        verbose=False,
        tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
        mccortex31_path=args.mccortex31_path,
    )
    cp.run()
    logger.debug("CoverageParser complete")

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get("complex",
                                       cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file,
    )
    phylogenetics = species_predictor.run()

    # ## AMR prediction

    depths = []
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Staphaureus"]["median_depth"]
        ]
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Mycobacterium_tuberculosis_complex"]["median_depth"]
        ]
    # Genotype
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    if args.force and not depths:
        depths = [1]
    gt = None

    def new_genotyper(error_rate, variant_conf):
        # args.model, args.ploidy and cp.covgs["presence"] are looked up at
        # call time because they may be replaced between the two passes.
        return Genotyper(
            sample=args.sample,
            expected_depths=depths,
            expected_error_rate=error_rate,
            variant_covgs=cp.variant_covgs,
            gene_presence_covgs=cp.covgs["presence"],
            base_json=base_json,
            contamination_depths=[],
            report_all_calls=True,
            ignore_filtered=True,
            filters=args.filters,
            variant_confidence_threshold=variant_conf,
            sequence_confidence_threshold=args.min_gene_conf,
            model=args.model,
            kmer_size=args.kmer,
            min_proportion_expected_depth=args.min_proportion_expected_depth,
            ploidy=args.ploidy,
        )

    if depths or args.force:
        # NOTE(review): the first genotyper run is believed to modify
        # cp.covgs["presence"] in place, which can break a second run -
        # confirm against Genotyper. A snapshot is restored before pass two.
        pristine_presence = copy.deepcopy(cp.covgs["presence"])
        gt = new_genotyper(args.expected_error_rate, args.min_variant_conf)
        gt.run()
        kmer_count_error_rate, incorrect_kmer_to_pc_cov = (
            gt.
            estimate_kmer_count_error_rate_and_incorrect_kmer_to_percent_cov())
        logger.info("Estimated error rate for kmer count model: " +
                    str(round(100 * kmer_count_error_rate, 2)) + "%")
        if args.guess_sequence_method and kmer_count_error_rate > 0.001:
            logger.warning(
                "Guess sequence method is on, and we've guessed ONT")
            args.ont = True

        if args.ont:
            args.expected_error_rate = 0.15
            args.ploidy = "haploid"
            args.ignore_minor_calls = True
            logger.warning("Setting ploidy to haploid")
            logger.warning("Setting ignore_minor_calls to True")
            logger.warning("Setting expected error rate to %s (--ont)" %
                           args.expected_error_rate)
            args.model = "kmer_count"

        # If the user didn't specify the conf_percent_cutoff, then set it
        # depending on whether or not the --ont flag was used
        if args.conf_percent_cutoff == -1:
            args.conf_percent_cutoff = 90 if args.ont else 100

        # conf_percent_cutoff == 100 means that we want to keep all variant
        # calls, in which case there is no need to run the simulations
        if args.conf_percent_cutoff < 100:
            logger.info("Expected depth: " + str(depths[0]))
            conf_thresholder = ConfThresholder(kmer_count_error_rate,
                                               depths[0], args.kmer,
                                               incorrect_kmer_to_pc_cov)
            time_start = time.time()
            conf_threshold = conf_thresholder.get_conf_threshold(
                percent_to_keep=args.conf_percent_cutoff)
            time_end = time.time()
            time_to_sim = time_end - time_start
            logger.info("Simulation time: " + str(time_to_sim))
            logger.info("Confidence cutoff (using percent cutoff " +
                        str(args.conf_percent_cutoff) + "%): " +
                        str(conf_threshold))
            cp.covgs["presence"] = copy.deepcopy(pristine_presence)
            gt = new_genotyper(kmer_count_error_rate, conf_threshold)
            gt.run()

        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        depths = [cp.estimate_depth()]
    args.quiet = q
    mykrobe_predictor_susceptibility_result = MykrobePredictorSusceptibilityResult(
    )
    prediction_wanted = gt is not None and (max(depths) > args.min_depth
                                            or args.force)
    if prediction_wanted and Predictor is None:
        # No species means no predictor class was chosen above; calling
        # None here used to raise TypeError.
        logger.warning(
            "No species was given, so no resistance predictor is available; "
            "skipping AMR prediction")
    elif prediction_wanted:
        predictor = Predictor(
            variant_calls=gt.variant_calls,
            called_genes=gt.sequence_calls_dict,
            base_json=base_json[args.sample],
            depth_threshold=args.min_depth,
            ignore_filtered=True,
            ignore_minor_calls=args.ignore_minor_calls,
            variant_to_resistance_json_fp=variant_to_resistance_json_fp,
        )
        mykrobe_predictor_susceptibility_result = predictor.run()
    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version,
        model=args.model,
    ).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()

    # Write to the file named by the user, otherwise send to stdout.
    if args.output_format == "csv":
        output = json_to_csv(base_json)
    else:
        # Verbose json output requires --report_all_calls
        if not args.report_all_calls:
            del base_json[args.sample]["variant_calls"]
            del base_json[args.sample]["sequence_calls"]
        output = json.dumps(base_json, indent=4)

    if args.output:
        with open(args.output, "w") as outfile:
            outfile.write(output)
    else:
        print(output)

Пример #4

Показать файл

Файл: genotype.py Проект: martinghunt/mykrobe

def run_main(parser, args):
    """Genotype one sample against a single probe set.

    The ``args`` argument is replaced straight away by the result of
    ``parser.parse_args()``, so only ``parser`` determines the settings.

    The function applies the ``--ont`` error rate, defaults
    ``min_variant_conf`` to 100 and ``tmp`` to a new temporary directory,
    runs ``CoverageParser``, fills in ``expected_depth`` from
    ``estimate_depth()`` when none was supplied and runs ``Genotyper``.
    The result is written as JSON to ``args.output`` when that is set, and
    the parser's temporary files are removed unless ``keep_tmp`` is set.

    Returns:
        The ``out_json`` attribute of the ``Genotyper`` after it has run.
    """
    args = parser.parse_args()

    if args.ont:
        args.expected_error_rate = ONT_E_RATE
        error_rate_message = ("Setting expected error rate to %s (--ont)" %
                              args.expected_error_rate)
        logger.debug(error_rate_message)
    if args.min_variant_conf is None:
        args.min_variant_conf = 100
    if args.tmp is None:
        args.tmp = tempfile.mkdtemp() + "/"

    coverage = CoverageParser(
        sample=args.sample,
        panel_file_paths=[args.probe_set],
        seq=args.seq,
        ctx=args.ctx,
        kmer=args.kmer,
        force=args.force,
        verbose=True,
        tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
        threads=args.threads,
        memory=args.memory,
    )
    coverage.run()

    # Only estimate the depth when the caller did not provide one.
    if args.expected_depth is None:
        args.expected_depth = coverage.estimate_depth()

    # Report stub handed to the genotyper; "files" is the sequence input
    # when there is one, otherwise the ctx input.
    base_json = {
        args.sample: {
            "probe_set": args.probe_set,
            "files": args.seq if args.seq else args.ctx,
            "kmer": args.kmer,
            "version": __version__,
        }
    }
    lineage_dict = None if args.lineage is None else load_json(args.lineage)

    genotyper = Genotyper(
        sample=args.sample,
        expected_error_rate=args.expected_error_rate,
        expected_depths=[args.expected_depth],
        variant_covgs=coverage.variant_covgs,
        gene_presence_covgs=coverage.covgs["presence"],
        base_json=base_json,
        contamination_depths=[],
        ignore_filtered=args.ignore_filtered,
        filters=args.filters,
        model=args.model,
        report_all_calls=args.report_all_calls,
        variant_confidence_threshold=args.min_variant_conf,
        sequence_confidence_threshold=args.min_gene_conf,
        min_gene_percent_covg_threshold=args.min_gene_percent_covg_threshold,
        kmer_size=args.kmer,
        min_proportion_expected_depth=args.min_proportion_expected_depth,
        ploidy=args.ploidy,
        lineage_variants=lineage_dict,
    )
    genotyper.run()

    if args.output:
        with open(args.output, "w") as outfile:
            json.dump(genotyper.out_json, outfile, indent=4)
    if not args.keep_tmp:
        coverage.remove_temporary_files()
    return genotyper.out_json

Пример #5

Показать файл

Файл: amr.py Проект: feihongloveworld/mykrobe

def run(parser, args):
    """Run species detection, genotyping and AMR prediction for one sample.

    ``base_json`` is keyed by the ``sample`` of the ``args`` passed in;
    ``args`` is then replaced by ``parser.parse_args()`` and all further
    settings come from that re-parsed namespace.

    Order of work:
      1. validate the output format and resolve the probe panels;
      2. run ``CoverageParser`` and ``AMRSpeciesPredictor``;
      3. genotype when a depth is available (or ``force`` is set), with a
         second pass when ``conf_percent_cutoff`` ends up below 100;
      4. run the species-specific predictor;
      5. emit CSV and/or JSON to ``args.output`` or to stdout. With
         ``json_and_csv`` and an output path, each format goes to
         ``<output>.<format>``.

    Raises:
        ValueError: if ``output_format`` is not one of ``csv``, ``json``,
            ``json_and_csv`` (now checked before any work is done), if a
            custom panel is selected without ``custom_probe_set_path``, or
            if the species is neither staph nor TB.
        FileNotFoundError: if the custom probe set, or a supplied custom
            variant-to-resistance JSON, does not exist.
    """
    # Function-scope import: only needed for the presence-coverage snapshot.
    import copy

    base_json = {args.sample: {}}
    args = parser.parse_args()

    # Reject a bad format before the expensive pipeline runs, not after it.
    if args.output_format not in ("csv", "json", "json_and_csv"):
        raise ValueError(
            (f"Output format must be one of: csv,json,json_and_csv. Got "
             f"'{args.output_format}'"))

    hierarchy_json_file = None
    variant_to_resistance_json_fp: Optional[PathLike] = None
    species = Species(args.species)
    if species is not Species.TB and args.panel != "custom":
        args.panel = "default"
    panel = Panel.from_species_and_name(species, args.panel)

    if species is Species.TB and panel.name is TbPanel.NEJM_WALKER:
        data_dir = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "../data/predict/tb/"))
        variant_to_resistance_json_fp = os.path.join(
            data_dir, "variant_to_resistance_drug-jan-03-2019.json")
    if panel.name in (TbPanel.CUSTOM, StaphPanel.CUSTOM):
        if not args.custom_probe_set_path:
            raise ValueError("Custom panel requires custom_probe_set_path")

        if not os.path.exists(args.custom_probe_set_path):
            raise FileNotFoundError(
                f"Custom probe path {args.custom_probe_set_path} does not exist!"
            )
        panel.add_path(args.custom_probe_set_path)

        # The custom JSON is only checked when one was given:
        # os.path.exists(None) raises TypeError rather than returning False.
        custom_json = args.custom_variant_to_resistance_json
        if custom_json is not None:
            if not os.path.exists(custom_json):
                raise FileNotFoundError(
                    ("Custom variant to resistance json "
                     f"{custom_json} does not exist!"))
            variant_to_resistance_json_fp = custom_json

    if species is Species.STAPH:
        Predictor = StaphPredictor
        args.kmer = 15  # Forced
    elif species is Species.TB:
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
        Predictor = TBPredictor
    else:
        raise ValueError(f"Unrecognised species {species}")

    logger.info("Running AMR prediction with panels %s" %
                ", ".join(panel.paths))
    version = {
        "mykrobe-predictor": predictor_version,
        "mykrobe-atlas": atlas_version,
    }
    # Panel paths are relative to the parent of this module's directory.
    package_root = os.path.join(os.path.dirname(__file__), "..")
    panel_paths = [
        os.path.realpath(os.path.join(package_root, f)) for f in panel.paths
    ]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(package_root, hierarchy_json_file))
    # Run Cortex
    cp = CoverageParser(
        sample=args.sample,
        panel_file_paths=panel_paths,
        seq=args.seq,
        kmer=args.kmer,
        force=args.force,
        threads=1,
        verbose=False,
        tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
    )
    cp.run()
    logger.debug("CoverageParser complete")

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get("complex",
                                       cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file,
    )
    phylogenetics = species_predictor.run()

    # ## AMR prediction

    depths = []
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Staphaureus"]["median_depth"]
        ]
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Mycobacterium_tuberculosis_complex"]["median_depth"]
        ]
    # Genotype
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    if args.force and not depths:
        depths = [1]
    gt = None

    def new_genotyper(error_rate, variant_conf):
        # args.model, args.ploidy and cp.covgs["presence"] are looked up at
        # call time because they may be replaced between the two passes.
        return Genotyper(
            sample=args.sample,
            expected_depths=depths,
            expected_error_rate=error_rate,
            variant_covgs=cp.variant_covgs,
            gene_presence_covgs=cp.covgs["presence"],
            base_json=base_json,
            contamination_depths=[],
            report_all_calls=True,
            ignore_filtered=True,
            filters=args.filters,
            variant_confidence_threshold=variant_conf,
            sequence_confidence_threshold=args.min_gene_conf,
            model=args.model,
            kmer_size=args.kmer,
            min_proportion_expected_depth=args.min_proportion_expected_depth,
            ploidy=args.ploidy,
        )

    if depths or args.force:
        # NOTE(review): the first genotyper run is believed to modify
        # cp.covgs["presence"] in place, which can break a second run -
        # confirm against Genotyper. A snapshot is restored before pass two.
        pristine_presence = copy.deepcopy(cp.covgs["presence"])
        gt = new_genotyper(args.expected_error_rate, args.min_variant_conf)
        gt.run()
        (
            kmer_count_error_rate,
            incorrect_kmer_to_pc_cov,
        ) = gt.estimate_kmer_count_error_rate_and_incorrect_kmer_to_percent_cov(
        )
        logger.info("Estimated error rate for kmer count model: " +
                    str(round(100 * kmer_count_error_rate, 2)) + "%")
        if args.guess_sequence_method and kmer_count_error_rate > 0.001:
            logger.warning(
                "Guess sequence method is on, and we've guessed ONT")
            args.ont = True

        if args.ont:
            args.expected_error_rate = 0.15
            args.ploidy = "haploid"
            args.ignore_minor_calls = True
            logger.warning("Setting ploidy to haploid")
            logger.warning("Setting ignore_minor_calls to True")
            logger.warning("Setting expected error rate to %s (--ont)" %
                           args.expected_error_rate)
            args.model = "kmer_count"

        # If the user didn't specify the conf_percent_cutoff, then set it
        # depending on whether or not the --ont flag was used
        if args.conf_percent_cutoff == -1:
            args.conf_percent_cutoff = 90 if args.ont else 100

        # conf_percent_cutoff == 100 means that we want to keep all variant
        # calls, in which case there is no need to run the simulations
        if args.conf_percent_cutoff < 100:
            logger.info("Expected depth: " + str(depths[0]))
            conf_thresholder = ConfThresholder(kmer_count_error_rate,
                                               depths[0], args.kmer,
                                               incorrect_kmer_to_pc_cov)
            time_start = time.time()
            conf_threshold = conf_thresholder.get_conf_threshold(
                percent_to_keep=args.conf_percent_cutoff)
            time_end = time.time()
            time_to_sim = time_end - time_start
            logger.info("Simulation time: " + str(time_to_sim))
            logger.info("Confidence cutoff (using percent cutoff " +
                        str(args.conf_percent_cutoff) + "%): " +
                        str(conf_threshold))
            cp.covgs["presence"] = copy.deepcopy(pristine_presence)
            gt = new_genotyper(kmer_count_error_rate, conf_threshold)
            gt.run()

        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        depths = [cp.estimate_depth()]
    args.quiet = q
    mykrobe_predictor_susceptibility_result = MykrobePredictorSusceptibilityResult(
    )
    if gt is not None and (max(depths) > args.min_depth or args.force):
        predictor = Predictor(
            variant_calls=gt.variant_calls,
            called_genes=gt.sequence_calls_dict,
            base_json=base_json[args.sample],
            depth_threshold=args.min_depth,
            ignore_filtered=True,
            ignore_minor_calls=args.ignore_minor_calls,
            variant_to_resistance_json_fp=variant_to_resistance_json_fp,
        )
        mykrobe_predictor_susceptibility_result = predictor.run()
    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panel_paths,
        files=args.seq,
        kmer=args.kmer,
        version=version,
        model=args.model,
    ).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()

    outputs = {}

    # The CSV is rendered first, from the full report, before the JSON
    # branch may strip the verbose call sections.
    if args.output_format in ("csv", "json_and_csv"):
        outputs["csv"] = json_to_csv(base_json)
    if args.output_format in ("json", "json_and_csv"):
        # Verbose json output requires --report_all_calls
        if not args.report_all_calls:
            del base_json[args.sample]["variant_calls"]
            del base_json[args.sample]["sequence_calls"]
        outputs["json"] = json.dumps(base_json, indent=4)

    for output_type, output in outputs.items():
        # Write to the file named by the user, otherwise send to stdout.
        if args.output:
            if args.output_format == "json_and_csv":
                outfile = args.output + "." + output_type
            else:
                outfile = args.output
            with open(outfile, "w") as f:
                f.write(output)
        else:
            print(output)

Пример #6

Показать файл

def _select_tb_panels(args):
    """Return the TB probe-set paths named by ``args.panel``.

    Returns ``None`` when ``args.panel`` is unset or is not one of
    ``bradley-2015``, ``walker-2015`` or ``custom``; the caller decides
    whether that is an error.

    Raises:
        ValueError: if the ``custom`` panel is requested without
            ``args.custom_probe_set_path``.
    """
    species_panel = "data/panels/tb-species-170421.fasta.gz"
    if args.panel == "bradley-2015":
        return [
            species_panel,
            "data/panels/tb-bradley-probe-set-feb-09-2017.fasta.gz",
        ]
    if args.panel == "walker-2015":
        return [
            species_panel,
            "data/panels/tb-walker-probe-set-feb-09-2017.fasta.gz",
        ]
    if args.panel == "custom":
        if not args.custom_probe_set_path:
            raise ValueError("Custom panel requires custom_probe_set_path")
        # The custom probe set is listed before the species panel.
        return [args.custom_probe_set_path, species_panel]
    return None


def run(parser, args):
    """Run the AMR prediction pipeline and emit the result as JSON.

    Steps: choose the probe panels, run ``CoverageParser``, run
    ``AMRSpeciesPredictor``, genotype with ``Genotyper``, run the
    species-specific predictor, then write the JSON either to
    ``args.output`` or to stdout.

    Args:
        parser: object whose ``parse_args()`` supplies the arguments used
            for the whole run.
        args: only ``args.sample`` is read from the value passed in (to key
            the result); the name is then rebound to ``parser.parse_args()``.

    Raises:
        ValueError: if the ``custom`` panel is requested without
            ``custom_probe_set_path``; if TB panels are needed (species
            unset or ``"tb"``) but ``args.panel`` selects none; or if
            ``args.species`` is not unset, ``"staph"`` or ``"tb"``.
    """
    # Keyed by the sample name of the ``args`` passed in, before re-parsing.
    base_json = {args.sample: {}}
    args = parser.parse_args()
    hierarchy_json_file = None

    # A dedicated local is used for the TB panels: assigning to ``TB_PANELS``
    # in only some branches made that name function-local and left it unbound
    # whenever ``args.panel`` was unset or unrecognised.
    tb_panels = _select_tb_panels(args)
    if (not args.species or args.species == "tb") and tb_panels is None:
        raise ValueError(
            "No TB probe panel selected: panel must be one of bradley-2015, "
            "walker-2015, custom (got %r)" % (args.panel,))

    Predictor = None
    if not args.species:
        panels = tb_panels + GN_PANELS + STAPH_PANELS
    elif args.species == "staph":
        panels = STAPH_PANELS
        Predictor = StaphPredictor
        args.kmer = 15  # Forced
    elif args.species == "tb":
        panels = tb_panels
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
        Predictor = TBPredictor
    else:
        raise ValueError(
            "Unsupported species %r: expected 'staph', 'tb' or unset"
            % (args.species,))
    logger.info("Running AMR prediction with panels %s", ", ".join(panels))
    version = {
        "mykrobe-predictor": predictor_version,
        "mykrobe-atlas": atlas_version,
    }
    # Get real paths for panels (resolved relative to this file's parent dir)
    base_dir = os.path.join(os.path.dirname(__file__), "..")
    panels = [os.path.realpath(os.path.join(base_dir, f)) for f in panels]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(base_dir, hierarchy_json_file))
    if args.ont:
        args.expected_error_rate = 0.15
        logger.debug("Setting expected error rate to %s (--ont)",
                     args.expected_error_rate)
        args.filters = ["LOW_GT_CONF"]
        args.model = "kmer_count"
    # Run Cortex
    cp = CoverageParser(sample=args.sample,
                        panel_file_paths=panels,
                        seq=args.seq,
                        kmer=args.kmer,
                        force=args.force,
                        threads=1,
                        verbose=False,
                        tmp_dir=args.tmp,
                        skeleton_dir=args.skeleton_dir,
                        mccortex31_path=args.mccortex31_path)
    cp.run()
    logger.debug('CoverageParser complete')

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get("complex",
                                       cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file)
    phylogenetics = species_predictor.run()

    # ## AMR prediction

    # Expected depth is the median depth of the detected phylo group.
    depths = []
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Staphaureus"]["median_depth"]
        ]
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Mycobacterium_tuberculosis_complex"]["median_depth"]
        ]
    # Genotype
    variant_calls_dict = {}
    sequence_calls_dict = {}
    if args.force and not depths:
        depths = [1]
    gt = None
    # ``args.quiet`` is forced on while genotyping and restored afterwards,
    # including when genotyping raises.
    q = args.quiet
    args.quiet = True
    try:
        if depths or args.force:
            gt = Genotyper(sample=args.sample,
                           expected_depths=depths,
                           expected_error_rate=args.expected_error_rate,
                           variant_covgs=cp.variant_covgs,
                           gene_presence_covgs=cp.covgs["presence"],
                           base_json=base_json,
                           contamination_depths=[],
                           report_all_calls=True,
                           ignore_filtered=True,
                           filters=args.filters,
                           variant_confidence_threshold=args.min_variant_conf,
                           sequence_confidence_threshold=args.min_gene_conf,
                           model=args.model)
            gt.run()
            variant_calls_dict = gt.variant_calls_dict
            sequence_calls_dict = gt.sequence_calls_dict
        else:
            # Not consulted below: ``gt`` is None, so prediction is skipped.
            depths = [cp.estimate_depth()]
    finally:
        args.quiet = q
    mykrobe_predictor_susceptibility_result = (
        MykrobePredictorSusceptibilityResult())
    # ``depths`` is non-empty whenever ``gt`` is set, so ``max`` is safe.
    if gt is not None and (max(depths) > args.min_depth or args.force):
        if Predictor is None:
            # No species was requested, so no predictor class was chosen;
            # calling ``None`` here used to raise TypeError.
            logger.warning(
                "No susceptibility predictor selected (species not "
                "specified); skipping AMR prediction")
        else:
            predictor = Predictor(
                variant_calls=gt.variant_calls,
                called_genes=gt.sequence_calls_dict,
                base_json=base_json[args.sample],
                depth_threshold=args.min_depth,
                ignore_filtered=True,
                ignore_minor_calls=args.ont,
                variant_to_resistance_json_fp=(
                    args.custom_variant_to_resistance_json))
            mykrobe_predictor_susceptibility_result = predictor.run()
    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version,
        model=args.model).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()

    # write to file is specified by user, otherwise send to stdout
    if args.output:
        with open(args.output, 'w') as outfile:
            json.dump(base_json, outfile, indent=4)
    else:
        print(json.dumps(base_json, indent=4))