Example #1
def write_outputs(args, base_json):
    outputs = {}

    if args.output_format in ["csv", "json_and_csv"]:
        outputs["csv"] = json_to_csv(base_json)
    if args.output_format in ["json", "json_and_csv"]:
        # Verbose json output requires --report_all_calls
        if not args.report_all_calls:
            del base_json[args.sample]["variant_calls"]
            del base_json[args.sample]["sequence_calls"]
            del base_json[args.sample]["lineage_calls"]
        outputs["json"] = json.dumps(base_json, indent=4)

    if len(outputs) == 0:
        raise ValueError(
            (f"Output format must be one of: csv,json,json_and_csv. Got "
             f"'{args.output_format}'"))

    for output_type, output in outputs.items():
        # write to file if specified by user, otherwise send to stdout
        if args.output:
            if args.output_format == "json_and_csv":
                outfile = args.output + "." + output_type
            else:
                outfile = args.output
            with open(outfile, "w") as f:
                f.write(output)
        else:
            print(output)
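
A minimal usage sketch (not part of the example above): the fields on the args namespace are inferred from the attributes write_outputs reads, and json_to_csv is assumed to be importable from the same module.

import argparse

# Hypothetical namespace mirroring the CLI options the function reads.
args = argparse.Namespace(
    sample="sample01",
    output_format="json_and_csv",  # accepted: csv, json, json_and_csv
    output="results",              # writes results.csv and results.json
    report_all_calls=False,
)
# Skeleton of the per-sample result dict the function expects.
base_json = {
    "sample01": {
        "susceptibility": {},
        "phylogenetics": {},
        "variant_calls": {},
        "sequence_calls": {},
        "lineage_calls": {},
    }
}
write_outputs(args, base_json)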
Example #2
def run(parser, args):
    base_json = {args.sample: {}}
    args = parser.parse_args()
    hierarchy_json_file = None
    if args.panel is not None:
        variant_to_resistance_json_fp = None
        if args.species == "tb" and args.panel == "bradley-2015":
            TB_PANELS = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-bradley-probe-set-jan-2019.fasta.gz",
            ]
        elif args.species == "tb" and args.panel == "walker-2015":
            TB_PANELS = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-walker-probe-set-jan-2019.fasta.gz",
            ]
        elif args.species == "tb" and args.panel == "201901":
            TB_PANELS = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-hunt-probe-set-jan-03-2019.fasta.gz",
            ]
            data_dir = os.path.abspath(
                os.path.join(os.path.dirname(__file__), "../data/predict/tb/"))
            variant_to_resistance_json_fp = os.path.join(
                data_dir, "variant_to_resistance_drug-jan-03-2019.json")
        elif args.species == "tb" and args.panel == "atlas":
            TB_PANELS = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-walker-probe-set-jan-2019.fasta.gz",
                "data/panels/tb-k21-probe-set-feb-09-2017.fasta.gz",
            ]
        elif args.panel == "custom":
            if not args.custom_probe_set_path:
                raise ValueError("Custom panel requires custom_probe_set_path")
            TB_PANELS = [
                args.custom_probe_set_path,
                "data/panels/tb-species-170421.fasta.gz",
            ]
            variant_to_resistance_json_fp = args.custom_variant_to_resistance_json
    Predictor = None
    if not args.species:
        panels = TB_PANELS + GN_PANELS + STAPH_PANELS
    elif args.species == "staph":
        panels = STAPH_PANELS
        Predictor = StaphPredictor
        args.kmer = 15  # Forced
        variant_to_resistance_json_fp = None
    elif args.species == "tb":
        panels = TB_PANELS
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
        Predictor = TBPredictor
    logger.info("Running AMR prediction with panels %s" % ", ".join(panels))
    version = {}
    version["mykrobe-predictor"] = predictor_version
    version["mykrobe-atlas"] = atlas_version
    # Get real paths for panels
    panels = [
        os.path.realpath(os.path.join(os.path.dirname(__file__), "..", f))
        for f in panels
    ]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(os.path.dirname(__file__), "..", hierarchy_json_file))
    # Run Cortex
    cp = CoverageParser(
        sample=args.sample,
        panel_file_paths=panels,
        seq=args.seq,
        kmer=args.kmer,
        force=args.force,
        threads=1,
        verbose=False,
        tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
        mccortex31_path=args.mccortex31_path,
    )
    cp.run()
    logger.debug("CoverageParser complete")

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get("complex",
                                       cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file,
    )
    phylogenetics = species_predictor.run()

    # ## AMR prediction

    depths = []
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Staphaureus"]["median_depth"]
        ]
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Mycobacterium_tuberculosis_complex"]["median_depth"]
        ]
    # pprint (species_predictor.out_json["phylogenetics"]["species"])
    # Genotype
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    if args.force and not depths:
        depths = [1]
    gt = None

    if depths or args.force:
        gt = Genotyper(
            sample=args.sample,
            expected_depths=depths,
            expected_error_rate=args.expected_error_rate,
            variant_covgs=cp.variant_covgs,
            gene_presence_covgs=cp.covgs["presence"],
            base_json=base_json,
            contamination_depths=[],
            report_all_calls=True,
            ignore_filtered=True,
            filters=args.filters,
            variant_confidence_threshold=args.min_variant_conf,
            sequence_confidence_threshold=args.min_gene_conf,
            model=args.model,
            kmer_size=args.kmer,
            min_proportion_expected_depth=args.min_proportion_expected_depth,
            ploidy=args.ploidy,
        )
        gt.run()
        (
            kmer_count_error_rate,
            incorrect_kmer_to_pc_cov,
        ) = gt.estimate_kmer_count_error_rate_and_incorrect_kmer_to_percent_cov()
        logger.info("Estimated error rate for kmer count model: " +
                    str(round(100 * kmer_count_error_rate, 2)) + "%")
        if args.guess_sequence_method and kmer_count_error_rate > 0.001:
            logger.warning(
                "Guess sequence method is on, and we've guessed ONT")
            args.ont = True

        if args.ont:
            args.expected_error_rate = 0.15
            args.ploidy = "haploid"
            args.ignore_minor_calls = True
            logger.warning("Setting ploidy to haploid")
            logger.warning("Setting ignore_minor_calls to True")
            logger.warning("Setting expected error rate to %s (--ont)" %
                           args.expected_error_rate)
            args.model = "kmer_count"

        # If the user didn't specify the conf_percent_cutoff, then set it
        # depending on whether or not the --ont flag was used
        if args.conf_percent_cutoff == -1:
            args.conf_percent_cutoff = 90 if args.ont else 100

        # conf_percent_cutoff == 100 means that we want to keep all variant calls,
        # in which case there is no need to run the simulations
        if args.conf_percent_cutoff < 100:
            logger.info("Expected depth: " + str(depths[0]))
            conf_thresholder = ConfThresholder(kmer_count_error_rate,
                                               depths[0], args.kmer,
                                               incorrect_kmer_to_pc_cov)
            time_start = time.time()
            conf_threshold = conf_thresholder.get_conf_threshold(
                percent_to_keep=args.conf_percent_cutoff)
            time_end = time.time()
            time_to_sim = time_end - time_start
            logger.info("Simulation time: " + str(time_to_sim))
            logger.info("Confidence cutoff (using percent cutoff " +
                        str(args.conf_percent_cutoff) + "%): " +
                        str(conf_threshold))
            gt = Genotyper(
                sample=args.sample,
                expected_depths=depths,
                expected_error_rate=kmer_count_error_rate,
                # expected_error_rate=args.expected_error_rate,
                variant_covgs=cp.variant_covgs,
                gene_presence_covgs=cp.covgs["presence"],
                base_json=base_json,
                contamination_depths=[],
                report_all_calls=True,
                ignore_filtered=True,
                filters=args.filters,
                variant_confidence_threshold=conf_threshold,
                sequence_confidence_threshold=args.min_gene_conf,
                model=args.model,
                kmer_size=args.kmer,
                min_proportion_expected_depth=args.min_proportion_expected_depth,
                ploidy=args.ploidy,
            )
            gt.run()

        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        depths = [cp.estimate_depth()]
    args.quiet = q
    mykrobe_predictor_susceptibility_result = (
        MykrobePredictorSusceptibilityResult()
    )
    if gt is not None and (max(depths) > args.min_depth or args.force):
        predictor = Predictor(
            variant_calls=gt.variant_calls,
            called_genes=gt.sequence_calls_dict,
            base_json=base_json[args.sample],
            depth_threshold=args.min_depth,
            ignore_filtered=True,
            ignore_minor_calls=args.ignore_minor_calls,
            variant_to_resistance_json_fp=variant_to_resistance_json_fp,
        )
        mykrobe_predictor_susceptibility_result = predictor.run()
    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version,
        model=args.model,
    ).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()

    # write to file if specified by user, otherwise send to stdout
    if args.output_format == "csv":
        output = json_to_csv(base_json)
    else:
        # Verbose json output requires --report_all_calls
        if not args.report_all_calls:
            del base_json[args.sample]["variant_calls"]
            del base_json[args.sample]["sequence_calls"]
        output = json.dumps(base_json, indent=4)

    if args.output:
        with open(args.output, "w") as outfile:
            outfile.write(output)
    else:
        print(output)
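
A hedged sketch of how an entry point with the run(parser, args) signature is typically wired up: the option names come from the attributes the function reads, while the subparser layout and set_defaults(func=...) dispatch are assumptions about the surrounding CLI, not code taken from this example.

import argparse

parser = argparse.ArgumentParser(prog="mykrobe")
subparsers = parser.add_subparsers()
predict = subparsers.add_parser("predict")
predict.add_argument("--sample", required=True)
predict.add_argument("--species", choices=["tb", "staph"])
predict.add_argument("--panel", default="201901")
predict.add_argument("--seq", nargs="+")
predict.add_argument("--output")
predict.add_argument("--output_format", choices=["csv", "json"], default="json")
# ... the real predict subcommand defines many more options (kmer, force,
# ont, model, conf_percent_cutoff, ...); they are omitted here.
predict.set_defaults(func=run)  # hypothetical dispatch convention

args = parser.parse_args()
args.func(parser, args)  # would call run(parser, args)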
Example #3
def run(parser, args):
    base_json = {args.sample: {}}
    args = parser.parse_args()
    hierarchy_json_file = None
    variant_to_resistance_json_fp: Optional[PathLike] = None
    species = Species(args.species)
    if species is not Species.TB and args.panel != "custom":
        args.panel = "default"
    panels = Panel.from_species_and_name(species, args.panel)

    if species is Species.TB and panels.name is TbPanel.NEJM_WALKER:
        data_dir = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "../data/predict/tb/"))
        variant_to_resistance_json_fp = os.path.join(
            data_dir, "variant_to_resistance_drug-jan-03-2019.json")
    if panels.name in (TbPanel.CUSTOM, StaphPanel.CUSTOM):
        if not args.custom_probe_set_path:
            raise ValueError("Custom panel requires custom_probe_set_path")

        if not os.path.exists(args.custom_probe_set_path):
            raise FileNotFoundError(
                f"Custom probe path {args.custom_probe_set_path} does not exist!"
            )
        panels.add_path(args.custom_probe_set_path)

        if not os.path.exists(args.custom_variant_to_resistance_json):
            raise FileNotFoundError(
                ("Custom variant to resistance json "
                 f"{args.custom_variant_to_resistance_json} does not exist!"))
        variant_to_resistance_json_fp = args.custom_variant_to_resistance_json

    if species is Species.STAPH:
        Predictor = StaphPredictor
        args.kmer = 15  # Forced
    elif species is Species.TB:
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
        Predictor = TBPredictor
    else:
        raise ValueError(f"Unrecognised species {species}")

    logger.info("Running AMR prediction with panels %s" %
                ", ".join(panels.paths))
    version = dict()
    version["mykrobe-predictor"] = predictor_version
    version["mykrobe-atlas"] = atlas_version
    # Get real paths for panels
    panels = [
        os.path.realpath(os.path.join(os.path.dirname(__file__), "..", f))
        for f in panels.paths
    ]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(os.path.dirname(__file__), "..", hierarchy_json_file))
    # Run Cortex
    cp = CoverageParser(
        sample=args.sample,
        panel_file_paths=panels,
        seq=args.seq,
        kmer=args.kmer,
        force=args.force,
        threads=1,
        verbose=False,
        tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
    )
    cp.run()
    logger.debug("CoverageParser complete")

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get("complex",
                                       cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file,
    )
    phylogenetics = species_predictor.run()

    # ## AMR prediction

    depths = []
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Staphaureus"]["median_depth"]
        ]
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"]
            ["Mycobacterium_tuberculosis_complex"]["median_depth"]
        ]
    # pprint (species_predictor.out_json["phylogenetics"]["species"])
    # Genotype
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    if args.force and not depths:
        depths = [1]
    gt = None

    if depths or args.force:
        gt = Genotyper(
            sample=args.sample,
            expected_depths=depths,
            expected_error_rate=args.expected_error_rate,
            variant_covgs=cp.variant_covgs,
            gene_presence_covgs=cp.covgs["presence"],
            base_json=base_json,
            contamination_depths=[],
            report_all_calls=True,
            ignore_filtered=True,
            filters=args.filters,
            variant_confidence_threshold=args.min_variant_conf,
            sequence_confidence_threshold=args.min_gene_conf,
            model=args.model,
            kmer_size=args.kmer,
            min_proportion_expected_depth=args.min_proportion_expected_depth,
            ploidy=args.ploidy,
        )
        gt.run()
        (
            kmer_count_error_rate,
            incorrect_kmer_to_pc_cov,
        ) = gt.estimate_kmer_count_error_rate_and_incorrect_kmer_to_percent_cov()
        logger.info("Estimated error rate for kmer count model: " +
                    str(round(100 * kmer_count_error_rate, 2)) + "%")
        if args.guess_sequence_method and kmer_count_error_rate > 0.001:
            logger.warning(
                "Guess sequence method is on, and we've guessed ONT")
            args.ont = True

        if args.ont:
            args.expected_error_rate = 0.15
            args.ploidy = "haploid"
            args.ignore_minor_calls = True
            logger.warning("Setting ploidy to haploid")
            logger.warning("Setting ignore_minor_calls to True")
            logger.warning("Setting expected error rate to %s (--ont)" %
                           args.expected_error_rate)
            args.model = "kmer_count"

        # If the user didn't specify the conf_percent_cutoff, then set it
        # depending on whether or not the --ont flag was used
        if args.conf_percent_cutoff == -1:
            args.conf_percent_cutoff = 90 if args.ont else 100

        # conf_percent_cutoff == 100 means that we want to keep all variant calls,
        # in which case there is no need to run the simulations
        if args.conf_percent_cutoff < 100:
            logger.info("Expected depth: " + str(depths[0]))
            conf_thresholder = ConfThresholder(kmer_count_error_rate,
                                               depths[0], args.kmer,
                                               incorrect_kmer_to_pc_cov)
            time_start = time.time()
            conf_threshold = conf_thresholder.get_conf_threshold(
                percent_to_keep=args.conf_percent_cutoff)
            time_end = time.time()
            time_to_sim = time_end - time_start
            logger.info("Simulation time: " + str(time_to_sim))
            logger.info("Confidence cutoff (using percent cutoff " +
                        str(args.conf_percent_cutoff) + "%): " +
                        str(conf_threshold))
            gt = Genotyper(
                sample=args.sample,
                expected_depths=depths,
                expected_error_rate=kmer_count_error_rate,
                variant_covgs=cp.variant_covgs,
                gene_presence_covgs=cp.covgs["presence"],
                base_json=base_json,
                contamination_depths=[],
                report_all_calls=True,
                ignore_filtered=True,
                filters=args.filters,
                variant_confidence_threshold=conf_threshold,
                sequence_confidence_threshold=args.min_gene_conf,
                model=args.model,
                kmer_size=args.kmer,
                min_proportion_expected_depth=args.min_proportion_expected_depth,
                ploidy=args.ploidy,
            )
            gt.run()

        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        depths = [cp.estimate_depth()]
    args.quiet = q
    mykrobe_predictor_susceptibility_result = (
        MykrobePredictorSusceptibilityResult()
    )
    if gt is not None and (max(depths) > args.min_depth or args.force):
        predictor = Predictor(
            variant_calls=gt.variant_calls,
            called_genes=gt.sequence_calls_dict,
            base_json=base_json[args.sample],
            depth_threshold=args.min_depth,
            ignore_filtered=True,
            ignore_minor_calls=args.ignore_minor_calls,
            variant_to_resistance_json_fp=variant_to_resistance_json_fp,
        )
        mykrobe_predictor_susceptibility_result = predictor.run()
    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version,
        model=args.model,
    ).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()

    outputs = {}

    if args.output_format in ["csv", "json_and_csv"]:
        outputs["csv"] = json_to_csv(base_json)
    if args.output_format in ["json", "json_and_csv"]:
        # Verbose json output requires --report_all_calls
        if not args.report_all_calls:
            del base_json[args.sample]["variant_calls"]
            del base_json[args.sample]["sequence_calls"]
        outputs["json"] = json.dumps(base_json, indent=4)

    if len(outputs) == 0:
        raise ValueError(
            (f"Output format must be one of: csv,json,json_and_csv. Got "
             f"'{args.output_format}'"))

    for output_type, output in outputs.items():
        # write to file if specified by user, otherwise send to stdout
        if args.output:
            if args.output_format == "json_and_csv":
                outfile = args.output + "." + output_type
            else:
                outfile = args.output
            with open(outfile, "w") as f:
                f.write(output)
        else:
            print(output)
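
One behavioural note worth illustrating from the output loop above: when --output is set and --output_format is json_and_csv, the format name is appended to the output path so the two payloads land in separate files. A tiny standalone illustration of that naming rule (the variable names here are illustrative only, not from the source):

output_path = "results"
output_format = "json_and_csv"
for output_type in ("csv", "json"):
    if output_format == "json_and_csv":
        outfile = output_path + "." + output_type
    else:
        outfile = output_path
    print(outfile)  # -> results.csv, then results.json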