def run_main(parser, args):
    args = parser.parse_args()
    verbose = True
    if args.ont:
        args.expected_error_rate = 0.15
        args.filters = ["LOW_GT_CONF"]
        args.model = "kmer_count"
        logger.debug(
            "Setting expected error rate to %s (--ont)" % args.expected_error_rate
        )
        logger.debug(
            "Removing LOW_PERCENT_COVERAGE filter (increases sensitivity - "
            "in particular for ONT data)"
        )
    if args.min_variant_conf is None:
        args.min_variant_conf = 100
    cp = CoverageParser(
        sample=args.sample, panel_file_paths=[args.probe_set], seq=args.seq,
        ctx=args.ctx, kmer=args.kmer, force=args.force, verbose=verbose,
        tmp_dir=args.tmp, skeleton_dir=args.skeleton_dir, threads=args.threads,
        memory=args.memory, mccortex31_path=args.mccortex31_path,
    )
    cp.run()
    if args.expected_depth is None:
        args.expected_depth = cp.estimate_depth()
    base_json = {args.sample: {}}
    base_json[args.sample]["probe_set"] = args.probe_set
    if args.seq:
        base_json[args.sample]["files"] = args.seq
    else:
        base_json[args.sample]["files"] = args.ctx
    base_json[args.sample]["kmer"] = args.kmer
    base_json[args.sample]["version"] = __version__
    gt = Genotyper(
        sample=args.sample, expected_error_rate=args.expected_error_rate,
        expected_depths=[args.expected_depth], variant_covgs=cp.variant_covgs,
        gene_presence_covgs=cp.covgs["presence"], base_json=base_json,
        contamination_depths=[], ignore_filtered=args.ignore_filtered,
        filters=args.filters, model=args.model,
        report_all_calls=args.report_all_calls,
        variant_confidence_threshold=args.min_variant_conf,
        sequence_confidence_threshold=args.min_gene_conf,
        min_gene_percent_covg_threshold=args.min_gene_percent_covg_threshold,
    )
    gt.run()
    if not args.keep_tmp:
        cp.remove_temporary_files()
    return gt.out_json
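# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the original module): the shape of the
# per-sample base_json skeleton that run_main() above assembles before handing
# it to Genotyper. The sample name, probe set, kmer and version values here
# are hypothetical placeholders.
def _example_base_json_skeleton():
    sample = "sample1"          # hypothetical sample name
    probe_set = "probes.fasta"  # hypothetical probe set path
    kmer = 21                   # hypothetical kmer size
    version = "x.y.z"           # stands in for the package __version__
    base_json = {sample: {}}
    base_json[sample]["probe_set"] = probe_set
    base_json[sample]["files"] = ["reads.fastq.gz"]  # seq files (or a .ctx graph)
    base_json[sample]["kmer"] = kmer
    base_json[sample]["version"] = version
    return base_json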
def run(parser, args):
    logger.info(
        f"Start running mykrobe predict. Command line: {' '.join(sys.argv)}"
    )
    base_json = {args.sample: {}}
    ref_data = ref_data_from_args(args)
    if (
        args.species == "custom"
        and ref_data["var_to_res_json"] is None
        and ref_data["lineage_json"] is None
    ):
        logger.info(
            "Forcing --report_all_calls because species is 'custom' and options "
            "--custom_variant_to_resistance_json,--custom_lineage_json were not used"
        )
        args.report_all_calls = True

    logger.info(
        f"Running mykrobe predict using species {args.species}, "
        f"and panel version {ref_data['version']}"
    )

    if args.tmp is None:
        args.tmp = tempfile.mkdtemp() + "/"

    if args.ont:
        args.expected_error_rate = ONT_E_RATE
        logger.info(
            f"Set expected error rate to {args.expected_error_rate} "
            "because --ont flag was used"
        )
        args.ploidy = ONT_PLOIDY
        logger.info(f"Set ploidy to {args.ploidy} because --ont flag used")

    # Run Cortex
    cp = CoverageParser(
        sample=args.sample, panel_file_paths=ref_data["fasta_files"], seq=args.seq,
        kmer=ref_data["kmer"], force=args.force, threads=args.threads, verbose=False,
        tmp_dir=args.tmp, skeleton_dir=args.skeleton_dir, memory=args.memory,
    )
    cp.run()
    logger.debug("CoverageParser complete")

    if ref_data["species_phylo_group"] is None:
        phylogenetics = {}
        depths = [cp.estimate_depth()]
    else:
        phylogenetics, depths = detect_species_and_get_depths(
            cp, ref_data["hierarchy_json"], ref_data["species_phylo_group"]
        )

    # Genotype
    variant_calls_dict = {}
    sequence_calls_dict = {}
    lineage_calls_dict = {}
    lineage_predict_dict = {}
    if args.force and len(depths) == 0:
        depths = [cp.estimate_depth()]
    gt = None
    if len(depths) > 0 or args.force:
        # Running the genotyper changes the contents of cp.covgs["presence"].
        # The changes can cause a later second run (if it happens) of the
        # genotyper to crash. Store the original cp.covgs["presence"] so we can
        # use it again later.
        original_covgs_presence = copy.deepcopy(cp.covgs["presence"])
        gt = Genotyper(
            sample=args.sample, expected_depths=depths,
            expected_error_rate=args.expected_error_rate,
            variant_covgs=cp.variant_covgs,
            gene_presence_covgs=cp.covgs["presence"], base_json=base_json,
            contamination_depths=[], report_all_calls=True, ignore_filtered=True,
            filters=args.filters,
            variant_confidence_threshold=args.min_variant_conf,
            sequence_confidence_threshold=args.min_gene_conf, model=args.model,
            kmer_size=ref_data["kmer"],
            min_proportion_expected_depth=args.min_proportion_expected_depth,
            ploidy=args.ploidy, lineage_variants=ref_data["lineage_dict"],
        )
        gt.run()
        (
            kmer_count_error_rate,
            incorrect_kmer_to_pc_cov,
        ) = gt.estimate_kmer_count_error_rate_and_incorrect_kmer_to_percent_cov()
        logger.debug(
            "Estimated error rate for kmer count model: "
            + str(round(100 * kmer_count_error_rate, 2)) + "%"
        )
        if args.guess_sequence_method and kmer_count_error_rate > 0.001:
            logger.warning("Guess sequence method is on, and we've guessed ONT")
            # This is duplicated from the "if args.ont" statement above
            args.expected_error_rate = ONT_E_RATE
            logger.info(f"Set expected error rate to {args.expected_error_rate}")
            args.ploidy = ONT_PLOIDY
            logger.info(f"Set ploidy to {args.ploidy}")

        # conf_percent_cutoff == 100 means that we want to keep all variant
        # calls, in which case there is no need to run the simulations
        if args.conf_percent_cutoff < 100:
            logger.debug("Expected depth: " + str(depths[0]))
            conf_thresholder = ConfThresholder(
                kmer_count_error_rate, depths[0], ref_data["kmer"],
                incorrect_kmer_to_pc_cov,
            )
            time_start = time.time()
            conf_threshold = conf_thresholder.get_conf_threshold(
                percent_to_keep=args.conf_percent_cutoff
            )
            time_end = time.time()
            time_to_sim = time_end - time_start
            logger.debug("Simulation time: " + str(time_to_sim))
            logger.debug(
                "Confidence cutoff (using percent cutoff "
                + str(args.conf_percent_cutoff) + "%): " + str(conf_threshold)
            )
            cp.covgs["presence"] = copy.deepcopy(original_covgs_presence)
            gt = Genotyper(
                sample=args.sample, expected_depths=depths,
                expected_error_rate=kmer_count_error_rate,
                variant_covgs=cp.variant_covgs,
                gene_presence_covgs=cp.covgs["presence"], base_json=base_json,
                contamination_depths=[], report_all_calls=True,
                ignore_filtered=True, filters=args.filters,
                variant_confidence_threshold=conf_threshold,
                sequence_confidence_threshold=args.min_gene_conf, model=args.model,
                kmer_size=ref_data["kmer"],
                min_proportion_expected_depth=args.min_proportion_expected_depth,
                ploidy=args.ploidy, lineage_variants=ref_data["lineage_dict"],
            )
            gt.run()

        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
        lineage_predict_dict, lineage_calls_dict = gt.predict_lineage()
    else:
        depths = [cp.estimate_depth()]

    mykrobe_predictor_susceptibility_result = MykrobePredictorSusceptibilityResult()
    if (
        gt is not None
        and (max(depths) > args.min_depth or args.force)
        and ref_data["var_to_res_json"] is not None
    ):
        predictor = BasePredictor(
            variant_calls=gt.variant_calls, called_genes=gt.sequence_calls_dict,
            base_json=base_json[args.sample], depth_threshold=args.min_depth,
            ignore_filtered=True, ignore_minor_calls=args.ignore_minor_calls,
            variant_to_resistance_json_fp=ref_data["var_to_res_json"],
        )
        mykrobe_predictor_susceptibility_result = predictor.run()

    logger.info("Progress: finished making AMR predictions")

    base_json[args.sample] = {
        "susceptibility": list(
            mykrobe_predictor_susceptibility_result.to_dict().values()
        )[0],
        "phylogenetics": {}
        if phylogenetics == {}
        else list(phylogenetics.to_dict().values())[0],
        "variant_calls": variant_calls_dict,
        "sequence_calls": sequence_calls_dict,
        "lineage_calls": lineage_calls_dict,
        "kmer": ref_data["kmer"],
        "probe_sets": ref_data["fasta_files"],
        "files": args.seq,
        "version": {
            "mykrobe-predictor": predictor_version,
            "mykrobe-atlas": atlas_version,
            "panel": ref_data["version"],
        },
        "genotype_model": args.model,
    }
    if len(lineage_predict_dict) > 0:
        base_json[args.sample]["phylogenetics"]["lineage"] = lineage_predict_dict

    if not args.keep_tmp:
        cp.remove_temporary_files()

    logger.info("Progress: writing output")
    fix_X_amino_acid_variants(base_json[args.sample])
    write_outputs(args, base_json)
    logger.info("Progress: finished")
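# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the original module): a minimal example
# of the save-and-restore pattern used in run() above, where cp.covgs["presence"]
# is deep-copied before the first Genotyper run because genotyping mutates the
# nested coverage data in place. The mutate_in_place helper and the data values
# are hypothetical.
def _example_deepcopy_restore():
    import copy

    covgs = {"presence": {"geneA": {"median_depth": 10}}}

    def mutate_in_place(presence):
        # Stand-in for the genotyper: edits the nested dict it was given.
        presence["geneA"]["median_depth"] = 0

    original = copy.deepcopy(covgs["presence"])  # snapshot before the first run
    mutate_in_place(covgs["presence"])           # first run changes the data
    covgs["presence"] = copy.deepcopy(original)  # restore before a second run
    assert covgs["presence"]["geneA"]["median_depth"] == 10
    return covgs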
def run(parser, args):
    base_json = {args.sample: {}}
    args = parser.parse_args()
    hierarchy_json_file = None
    if args.panel is not None:
        variant_to_resistance_json_fp = None
        if args.species == "tb" and args.panel == "bradley-2015":
            TB_PANELS = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-bradley-probe-set-jan-2019.fasta.gz",
            ]
        elif args.species == "tb" and args.panel == "walker-2015":
            TB_PANELS = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-walker-probe-set-jan-2019.fasta.gz",
            ]
        elif args.species == "tb" and args.panel == "201901":
            TB_PANELS = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-hunt-probe-set-jan-03-2019.fasta.gz",
            ]
            data_dir = os.path.abspath(
                os.path.join(os.path.dirname(__file__), "../data/predict/tb/")
            )
            variant_to_resistance_json_fp = os.path.join(
                data_dir, "variant_to_resistance_drug-jan-03-2019.json"
            )
        elif args.species == "tb" and args.panel == "atlas":
            TB_PANELS = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-walker-probe-set-jan-2019.fasta.gz",
                "data/panels/tb-k21-probe-set-feb-09-2017.fasta.gz",
            ]
        elif args.panel == "custom":
            if not args.custom_probe_set_path:
                raise ValueError("Custom panel requires custom_probe_set_path")
            TB_PANELS = [
                args.custom_probe_set_path,
                "data/panels/tb-species-170421.fasta.gz",
            ]
            variant_to_resistance_json_fp = args.custom_variant_to_resistance_json

    Predictor = None
    if not args.species:
        panels = TB_PANELS + GN_PANELS + STAPH_PANELS
    elif args.species == "staph":
        panels = STAPH_PANELS
        Predictor = StaphPredictor
        args.kmer = 15  # Forced
        variant_to_resistance_json_fp = None
    elif args.species == "tb":
        panels = TB_PANELS
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
        Predictor = TBPredictor

    logger.info("Running AMR prediction with panels %s" % ", ".join(panels))
    version = {}
    version["mykrobe-predictor"] = predictor_version
    version["mykrobe-atlas"] = atlas_version
    # Get real paths for panels
    panels = [
        os.path.realpath(os.path.join(os.path.dirname(__file__), "..", f))
        for f in panels
    ]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(os.path.dirname(__file__), "..", hierarchy_json_file)
        )

    # Run Cortex
    cp = CoverageParser(
        sample=args.sample, panel_file_paths=panels, seq=args.seq, kmer=args.kmer,
        force=args.force, threads=1, verbose=False, tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir, mccortex31_path=args.mccortex31_path,
    )
    cp.run()
    logger.debug("CoverageParser complete")

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get("complex", cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file,
    )
    phylogenetics = species_predictor.run()

    # ## AMR prediction
    depths = []
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"][
                "Staphaureus"]["median_depth"]
        ]
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"][
                "Mycobacterium_tuberculosis_complex"]["median_depth"]
        ]
    # pprint (species_predictor.out_json["phylogenetics"]["species"])

    # Genotype
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    if args.force and not depths:
        depths = [1]
    gt = None
    if depths or args.force:
        gt = Genotyper(
            sample=args.sample, expected_depths=depths,
            expected_error_rate=args.expected_error_rate,
            variant_covgs=cp.variant_covgs,
            gene_presence_covgs=cp.covgs["presence"], base_json=base_json,
            contamination_depths=[], report_all_calls=True, ignore_filtered=True,
            filters=args.filters,
            variant_confidence_threshold=args.min_variant_conf,
            sequence_confidence_threshold=args.min_gene_conf, model=args.model,
            kmer_size=args.kmer,
            min_proportion_expected_depth=args.min_proportion_expected_depth,
            ploidy=args.ploidy,
        )
        gt.run()
        (
            kmer_count_error_rate,
            incorrect_kmer_to_pc_cov,
        ) = gt.estimate_kmer_count_error_rate_and_incorrect_kmer_to_percent_cov()
        logger.info(
            "Estimated error rate for kmer count model: "
            + str(round(100 * kmer_count_error_rate, 2)) + "%"
        )
        if args.guess_sequence_method and kmer_count_error_rate > 0.001:
            logger.warning("Guess sequence method is on, and we've guessed ONT")
            args.ont = True
        if args.ont:
            args.expected_error_rate = 0.15
            args.ploidy = "haploid"
            args.ignore_minor_calls = True
            logger.warning("Setting ploidy to haploid")
            logger.warning("Setting ignore_minor_calls to True")
            logger.warning(
                "Setting expected error rate to %s (--ont)" % args.expected_error_rate
            )
            args.model = "kmer_count"

        # If the user didn't specify the conf_percent_cutoff, then set it
        # depending on whether or not the --ont flag was used
        if args.conf_percent_cutoff == -1:
            args.conf_percent_cutoff = 90 if args.ont else 100

        # conf_percent_cutoff == 100 means that we want to keep all variant
        # calls, in which case there is no need to run the simulations
        if args.conf_percent_cutoff < 100:
            logger.info("Expected depth: " + str(depths[0]))
            conf_thresholder = ConfThresholder(
                kmer_count_error_rate, depths[0], args.kmer, incorrect_kmer_to_pc_cov
            )
            time_start = time.time()
            conf_threshold = conf_thresholder.get_conf_threshold(
                percent_to_keep=args.conf_percent_cutoff
            )
            time_end = time.time()
            time_to_sim = time_end - time_start
            logger.info("Simulation time: " + str(time_to_sim))
            logger.info(
                "Confidence cutoff (using percent cutoff "
                + str(args.conf_percent_cutoff) + "%): " + str(conf_threshold)
            )
            gt = Genotyper(
                sample=args.sample, expected_depths=depths,
                expected_error_rate=kmer_count_error_rate,
                # expected_error_rate=args.expected_error_rate,
                variant_covgs=cp.variant_covgs,
                gene_presence_covgs=cp.covgs["presence"], base_json=base_json,
                contamination_depths=[], report_all_calls=True,
                ignore_filtered=True, filters=args.filters,
                variant_confidence_threshold=conf_threshold,
                sequence_confidence_threshold=args.min_gene_conf, model=args.model,
                kmer_size=args.kmer,
                min_proportion_expected_depth=args.min_proportion_expected_depth,
                ploidy=args.ploidy,
            )
            gt.run()

        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        depths = [cp.estimate_depth()]
    args.quiet = q

    mykrobe_predictor_susceptibility_result = MykrobePredictorSusceptibilityResult()
    if gt is not None and (max(depths) > args.min_depth or args.force):
        predictor = Predictor(
            variant_calls=gt.variant_calls, called_genes=gt.sequence_calls_dict,
            base_json=base_json[args.sample], depth_threshold=args.min_depth,
            ignore_filtered=True, ignore_minor_calls=args.ignore_minor_calls,
            variant_to_resistance_json_fp=variant_to_resistance_json_fp,
        )
        mykrobe_predictor_susceptibility_result = predictor.run()

    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version,
        model=args.model,
    ).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()

    # Write to file if specified by user, otherwise send to stdout
    if args.output_format == "csv":
        output = json_to_csv(base_json)
    else:
        # Verbose json output requires --report_all_calls
        if not args.report_all_calls:
            del base_json[args.sample]["variant_calls"]
            del base_json[args.sample]["sequence_calls"]
        output = json.dumps(base_json, indent=4)

    if args.output:
        with open(args.output, "w") as outfile:
            outfile.write(output)
    else:
        print(output)
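# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the original module): the
# conf_percent_cutoff defaulting rule applied in run() above, pulled out as a
# standalone function. A sentinel of -1 means "not set by the user"; ONT data
# defaults to 90, other data to 100, and 100 means all calls are kept so the
# confidence-threshold simulation is skipped. The function name is hypothetical.
def _example_default_conf_percent_cutoff(conf_percent_cutoff, ont):
    if conf_percent_cutoff == -1:
        conf_percent_cutoff = 90 if ont else 100
    run_simulation = conf_percent_cutoff < 100
    return conf_percent_cutoff, run_simulation


# For example, _example_default_conf_percent_cutoff(-1, ont=True) returns
# (90, True), whereas _example_default_conf_percent_cutoff(-1, ont=False)
# returns (100, False).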
def run_main(parser, args):
    args = parser.parse_args()
    verbose = True
    if args.ont:
        args.expected_error_rate = ONT_E_RATE
        logger.debug(
            "Setting expected error rate to %s (--ont)" % args.expected_error_rate
        )
    if args.min_variant_conf is None:
        args.min_variant_conf = 100
    if args.tmp is None:
        args.tmp = tempfile.mkdtemp() + "/"
    cp = CoverageParser(
        sample=args.sample, panel_file_paths=[args.probe_set], seq=args.seq,
        ctx=args.ctx, kmer=args.kmer, force=args.force, verbose=verbose,
        tmp_dir=args.tmp, skeleton_dir=args.skeleton_dir, threads=args.threads,
        memory=args.memory,
    )
    cp.run()
    if args.expected_depth is None:
        args.expected_depth = cp.estimate_depth()
    base_json = {args.sample: {}}
    base_json[args.sample]["probe_set"] = args.probe_set
    if args.seq:
        base_json[args.sample]["files"] = args.seq
    else:
        base_json[args.sample]["files"] = args.ctx
    base_json[args.sample]["kmer"] = args.kmer
    base_json[args.sample]["version"] = __version__
    if args.lineage is None:
        lineage_dict = None
    else:
        lineage_dict = load_json(args.lineage)
    gt = Genotyper(
        sample=args.sample, expected_error_rate=args.expected_error_rate,
        expected_depths=[args.expected_depth], variant_covgs=cp.variant_covgs,
        gene_presence_covgs=cp.covgs["presence"], base_json=base_json,
        contamination_depths=[], ignore_filtered=args.ignore_filtered,
        filters=args.filters, model=args.model,
        report_all_calls=args.report_all_calls,
        variant_confidence_threshold=args.min_variant_conf,
        sequence_confidence_threshold=args.min_gene_conf,
        min_gene_percent_covg_threshold=args.min_gene_percent_covg_threshold,
        kmer_size=args.kmer,
        min_proportion_expected_depth=args.min_proportion_expected_depth,
        ploidy=args.ploidy, lineage_variants=lineage_dict,
    )
    gt.run()
    if args.output:
        with open(args.output, "w") as outfile:
            json.dump(gt.out_json, outfile, indent=4)
    if not args.keep_tmp:
        cp.remove_temporary_files()
    return gt.out_json
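# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the original module): run_main() above
# both writes gt.out_json to args.output (when given) and returns it, so a
# caller can persist the result or consume it directly. The helper name and
# paths here are hypothetical.
def _example_write_or_return_out_json(out_json, output_path=None):
    import json

    if output_path:
        with open(output_path, "w") as outfile:
            json.dump(out_json, outfile, indent=4)
    return out_json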
def run(parser, args):
    base_json = {args.sample: {}}
    args = parser.parse_args()
    hierarchy_json_file = None
    variant_to_resistance_json_fp: Optional[PathLike] = None
    species = Species(args.species)
    if species is not Species.TB and args.panel != "custom":
        args.panel = "default"
    panels = Panel.from_species_and_name(species, args.panel)

    if species is Species.TB and panels.name is TbPanel.NEJM_WALKER:
        data_dir = os.path.abspath(
            os.path.join(os.path.dirname(__file__), "../data/predict/tb/")
        )
        variant_to_resistance_json_fp = os.path.join(
            data_dir, "variant_to_resistance_drug-jan-03-2019.json"
        )

    if panels.name in (TbPanel.CUSTOM, StaphPanel.CUSTOM):
        if not args.custom_probe_set_path:
            raise ValueError("Custom panel requires custom_probe_set_path")
        if not os.path.exists(args.custom_probe_set_path):
            raise FileNotFoundError(
                f"Custom probe path {args.custom_probe_set_path} does not exist!"
            )
        panels.add_path(args.custom_probe_set_path)
        if not os.path.exists(args.custom_variant_to_resistance_json):
            raise FileNotFoundError(
                "Custom variant to resistance json "
                f"{args.custom_variant_to_resistance_json} does not exist!"
            )
        variant_to_resistance_json_fp = args.custom_variant_to_resistance_json

    if species is Species.STAPH:
        Predictor = StaphPredictor
        args.kmer = 15  # Forced
    elif species is Species.TB:
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
        Predictor = TBPredictor
    else:
        raise ValueError(f"Unrecognised species {species}")

    logger.info("Running AMR prediction with panels %s" % ", ".join(panels.paths))
    version = dict()
    version["mykrobe-predictor"] = predictor_version
    version["mykrobe-atlas"] = atlas_version
    # Get real paths for panels
    panels = [
        os.path.realpath(os.path.join(os.path.dirname(__file__), "..", f))
        for f in panels.paths
    ]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(os.path.dirname(__file__), "..", hierarchy_json_file)
        )

    # Run Cortex
    cp = CoverageParser(
        sample=args.sample, panel_file_paths=panels, seq=args.seq, kmer=args.kmer,
        force=args.force, threads=1, verbose=False, tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir,
    )
    cp.run()
    logger.debug("CoverageParser complete")

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get("complex", cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file,
    )
    phylogenetics = species_predictor.run()

    # ## AMR prediction
    depths = []
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"][
                "Staphaureus"]["median_depth"]
        ]
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"][
                "Mycobacterium_tuberculosis_complex"]["median_depth"]
        ]
    # pprint (species_predictor.out_json["phylogenetics"]["species"])

    # Genotype
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    if args.force and not depths:
        depths = [1]
    gt = None
    if depths or args.force:
        gt = Genotyper(
            sample=args.sample, expected_depths=depths,
            expected_error_rate=args.expected_error_rate,
            variant_covgs=cp.variant_covgs,
            gene_presence_covgs=cp.covgs["presence"], base_json=base_json,
            contamination_depths=[], report_all_calls=True, ignore_filtered=True,
            filters=args.filters,
            variant_confidence_threshold=args.min_variant_conf,
            sequence_confidence_threshold=args.min_gene_conf, model=args.model,
            kmer_size=args.kmer,
            min_proportion_expected_depth=args.min_proportion_expected_depth,
            ploidy=args.ploidy,
        )
        gt.run()
        (
            kmer_count_error_rate,
            incorrect_kmer_to_pc_cov,
        ) = gt.estimate_kmer_count_error_rate_and_incorrect_kmer_to_percent_cov()
        logger.info(
            "Estimated error rate for kmer count model: "
            + str(round(100 * kmer_count_error_rate, 2)) + "%"
        )
        if args.guess_sequence_method and kmer_count_error_rate > 0.001:
            logger.warning("Guess sequence method is on, and we've guessed ONT")
            args.ont = True
        if args.ont:
            args.expected_error_rate = 0.15
            args.ploidy = "haploid"
            args.ignore_minor_calls = True
            logger.warning("Setting ploidy to haploid")
            logger.warning("Setting ignore_minor_calls to True")
            logger.warning(
                "Setting expected error rate to %s (--ont)" % args.expected_error_rate
            )
            args.model = "kmer_count"

        # If the user didn't specify the conf_percent_cutoff, then set it
        # depending on whether or not the --ont flag was used
        if args.conf_percent_cutoff == -1:
            args.conf_percent_cutoff = 90 if args.ont else 100

        # conf_percent_cutoff == 100 means that we want to keep all variant
        # calls, in which case there is no need to run the simulations
        if args.conf_percent_cutoff < 100:
            logger.info("Expected depth: " + str(depths[0]))
            conf_thresholder = ConfThresholder(
                kmer_count_error_rate, depths[0], args.kmer, incorrect_kmer_to_pc_cov
            )
            time_start = time.time()
            conf_threshold = conf_thresholder.get_conf_threshold(
                percent_to_keep=args.conf_percent_cutoff
            )
            time_end = time.time()
            time_to_sim = time_end - time_start
            logger.info("Simulation time: " + str(time_to_sim))
            logger.info(
                "Confidence cutoff (using percent cutoff "
                + str(args.conf_percent_cutoff) + "%): " + str(conf_threshold)
            )
            gt = Genotyper(
                sample=args.sample, expected_depths=depths,
                expected_error_rate=kmer_count_error_rate,
                variant_covgs=cp.variant_covgs,
                gene_presence_covgs=cp.covgs["presence"], base_json=base_json,
                contamination_depths=[], report_all_calls=True,
                ignore_filtered=True, filters=args.filters,
                variant_confidence_threshold=conf_threshold,
                sequence_confidence_threshold=args.min_gene_conf, model=args.model,
                kmer_size=args.kmer,
                min_proportion_expected_depth=args.min_proportion_expected_depth,
                ploidy=args.ploidy,
            )
            gt.run()

        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        depths = [cp.estimate_depth()]
    args.quiet = q

    mykrobe_predictor_susceptibility_result = MykrobePredictorSusceptibilityResult()
    if gt is not None and (max(depths) > args.min_depth or args.force):
        predictor = Predictor(
            variant_calls=gt.variant_calls, called_genes=gt.sequence_calls_dict,
            base_json=base_json[args.sample], depth_threshold=args.min_depth,
            ignore_filtered=True, ignore_minor_calls=args.ignore_minor_calls,
            variant_to_resistance_json_fp=variant_to_resistance_json_fp,
        )
        mykrobe_predictor_susceptibility_result = predictor.run()

    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version,
        model=args.model,
    ).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()

    outputs = {}
    if args.output_format in ["csv", "json_and_csv"]:
        outputs["csv"] = json_to_csv(base_json)
    if args.output_format in ["json", "json_and_csv"]:
        # Verbose json output requires --report_all_calls
        if not args.report_all_calls:
            del base_json[args.sample]["variant_calls"]
            del base_json[args.sample]["sequence_calls"]
        outputs["json"] = json.dumps(base_json, indent=4)

    if len(outputs) == 0:
        raise ValueError(
            f"Output format must be one of: csv,json,json_and_csv. Got "
            f"'{args.output_format}'"
        )

    for output_type, output in outputs.items():
        # Write to file if specified by user, otherwise send to stdout
        if args.output:
            if args.output_format == "json_and_csv":
                outfile = args.output + "." + output_type
            else:
                outfile = args.output
            with open(outfile, "w") as f:
                f.write(output)
        else:
            print(output)
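# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the original module): the output
# routing used in run() above, reduced to a pure function. With --output_format
# json_and_csv and an --output path, each format is written to
# "<output>.<type>"; with a single format the path is used as-is; with no
# --output everything goes to stdout. The function name is hypothetical.
def _example_route_outputs(outputs, output_format, output_path=None):
    routed = {}
    for output_type, text in outputs.items():
        if output_path:
            if output_format == "json_and_csv":
                routed[output_path + "." + output_type] = text
            else:
                routed[output_path] = text
        else:
            routed["<stdout>"] = text
    return routed


# For example, _example_route_outputs({"json": "{}", "csv": "a,b"},
# "json_and_csv", "out") maps "out.json" and "out.csv" to their contents.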
def run(parser, args):
    base_json = {args.sample: {}}
    args = parser.parse_args()
    hierarchy_json_file = None
    if args.panel is not None:
        if args.panel == "bradley-2015":
            TB_PANELS = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-bradley-probe-set-feb-09-2017.fasta.gz",
            ]
        elif args.panel == "walker-2015":
            TB_PANELS = [
                "data/panels/tb-species-170421.fasta.gz",
                "data/panels/tb-walker-probe-set-feb-09-2017.fasta.gz",
            ]
        elif args.panel == "custom":
            if not args.custom_probe_set_path:
                raise ValueError("Custom panel requires custom_probe_set_path")
            TB_PANELS = [
                args.custom_probe_set_path,
                "data/panels/tb-species-170421.fasta.gz",
            ]

    Predictor = None
    if not args.species:
        panels = TB_PANELS + GN_PANELS + STAPH_PANELS
    elif args.species == "staph":
        panels = STAPH_PANELS
        Predictor = StaphPredictor
        args.kmer = 15  # Forced
    elif args.species == "tb":
        panels = TB_PANELS
        hierarchy_json_file = "data/phylo/mtbc_hierarchy.json"
        Predictor = TBPredictor

    logger.info("Running AMR prediction with panels %s" % ", ".join(panels))
    version = {}
    version["mykrobe-predictor"] = predictor_version
    version["mykrobe-atlas"] = atlas_version
    # Get real paths for panels
    panels = [
        os.path.realpath(os.path.join(os.path.dirname(__file__), "..", f))
        for f in panels
    ]
    if hierarchy_json_file is not None:
        hierarchy_json_file = os.path.realpath(
            os.path.join(os.path.dirname(__file__), "..", hierarchy_json_file)
        )
    if args.ont:
        args.expected_error_rate = 0.15
        logger.debug(
            "Setting expected error rate to %s (--ont)" % args.expected_error_rate
        )
        args.filters = ["LOW_GT_CONF"]
        args.model = "kmer_count"

    # Run Cortex
    cp = CoverageParser(
        sample=args.sample, panel_file_paths=panels, seq=args.seq, kmer=args.kmer,
        force=args.force, threads=1, verbose=False, tmp_dir=args.tmp,
        skeleton_dir=args.skeleton_dir, mccortex31_path=args.mccortex31_path,
    )
    cp.run()
    logger.debug("CoverageParser complete")

    # Detect species
    species_predictor = AMRSpeciesPredictor(
        phylo_group_covgs=cp.covgs.get("complex", cp.covgs.get("phylo_group", {})),
        sub_complex_covgs=cp.covgs.get("sub-complex", {}),
        species_covgs=cp.covgs["species"],
        lineage_covgs=cp.covgs.get("sub-species", {}),
        hierarchy_json_file=hierarchy_json_file,
    )
    phylogenetics = species_predictor.run()

    # ## AMR prediction
    depths = []
    if species_predictor.is_saureus_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"][
                "Staphaureus"]["median_depth"]
        ]
    elif species_predictor.is_mtbc_present():
        depths = [
            species_predictor.out_json["phylogenetics"]["phylo_group"][
                "Mycobacterium_tuberculosis_complex"]["median_depth"]
        ]
    # pprint (species_predictor.out_json["phylogenetics"]["species"])

    # Genotype
    q = args.quiet
    args.quiet = True
    variant_calls_dict = {}
    sequence_calls_dict = {}
    if args.force and not depths:
        depths = [1]
    gt = None
    if depths or args.force:
        gt = Genotyper(
            sample=args.sample, expected_depths=depths,
            expected_error_rate=args.expected_error_rate,
            variant_covgs=cp.variant_covgs,
            gene_presence_covgs=cp.covgs["presence"], base_json=base_json,
            contamination_depths=[], report_all_calls=True, ignore_filtered=True,
            filters=args.filters,
            variant_confidence_threshold=args.min_variant_conf,
            sequence_confidence_threshold=args.min_gene_conf, model=args.model,
        )
        gt.run()
        variant_calls_dict = gt.variant_calls_dict
        sequence_calls_dict = gt.sequence_calls_dict
    else:
        depths = [cp.estimate_depth()]
    args.quiet = q

    mykrobe_predictor_susceptibility_result = MykrobePredictorSusceptibilityResult()
    if gt is not None and (max(depths) > args.min_depth or args.force):
        predictor = Predictor(
            variant_calls=gt.variant_calls, called_genes=gt.sequence_calls_dict,
            base_json=base_json[args.sample], depth_threshold=args.min_depth,
            ignore_filtered=True, ignore_minor_calls=args.ont,
            variant_to_resistance_json_fp=args.custom_variant_to_resistance_json,
        )
        mykrobe_predictor_susceptibility_result = predictor.run()

    base_json[args.sample] = MykrobePredictorResult(
        susceptibility=mykrobe_predictor_susceptibility_result,
        phylogenetics=phylogenetics,
        variant_calls=variant_calls_dict,
        sequence_calls=sequence_calls_dict,
        probe_sets=panels,
        files=args.seq,
        kmer=args.kmer,
        version=version,
        model=args.model,
    ).to_dict()
    if not args.keep_tmp:
        cp.remove_temporary_files()

    # Write to file if specified by user, otherwise send to stdout
    if args.output:
        with open(args.output, "w") as outfile:
            json.dump(base_json, outfile, indent=4)
    else:
        print(json.dumps(base_json, indent=4))
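# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the original module): the gate used in
# the run()/run_main() functions above to decide whether the susceptibility
# Predictor runs. Prediction happens only when a Genotyper ran and either the
# maximum detected depth exceeds --min_depth or --force was given. The function
# name is hypothetical.
def _example_should_run_predictor(gt, depths, min_depth, force):
    return gt is not None and (max(depths) > min_depth or force)


# For example, _example_should_run_predictor(object(), [5], 1, False) is True,
# while _example_should_run_predictor(None, [5], 1, False) is False.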