def analyse(args):
    """Print the variants seen in the misclassified samples of one drug.

    Reads ``results.json`` from the current working directory and, for every
    sample listed there as a false negative or false positive for
    ``args.drug``, reads ``<args.dir>/<sample>.results.json``. Variants are
    tallied per ``(gene, locus_tag, change)`` and printed as tab-separated
    lines, least frequent first.

    Attributes read from ``args``:
        bed:  BED file passed to ``pp.load_bed``; the loaded value for each
              locus is split on ";" and lower-cased to get its drug names.
        drug: key into ``results.json``; also matched against variant drugs.
        dir:  directory holding the per-sample ``*.results.json`` files.
        itol: when truthy, ``write_itol`` is called with the false-positive
              and false-negative sample lists.

    Raises:
        KeyError: if ``args.drug`` is missing from ``results.json`` or a
            variant's ``locus_tag`` is not a locus of the BED file.
    """
    drug_loci = pp.load_bed(args.bed, [6], 4)
    for locus in drug_loci:
        drug_loci[locus] = [
            name.lower() for name in drug_loci[locus][0].split(";")
        ]
    print(drug_loci)

    drug = args.drug
    # Context managers so the handles are closed deterministically.
    with open("results.json") as handle:
        results = json.load(handle)
    drug_results = results[drug]

    fn = defaultdict(int)
    fp = defaultdict(int)
    print("Number true positives: %s" % len(drug_results["tp"]))
    print("Number false negatives: %s" % len(drug_results["fn"]))
    print("Number true negatives: %s" % len(drug_results["tn"]))
    print("Number false positives: %s" % len(drug_results["fp"]))

    for s in drug_results["fn"]:
        print("%s\tFalse_negative" % s)
        with open("%s/%s.results.json" % (args.dir, s)) as handle:
            sample_res = json.load(handle)
        for var in sample_res["other_variants"]:
            # Substring test: keeps the variant when ``drug`` occurs inside
            # any drug name attached to the variant's locus.
            if not any(drug in name for name in drug_loci[var["locus_tag"]]):
                continue
            fn[(var["gene"], var["locus_tag"], var["change"])] += 1

    for s in drug_results["fp"]:
        print("%s\tFalse_positive" % s)
        with open("%s/%s.results.json" % (args.dir, s)) as handle:
            sample_res = json.load(handle)
        for var in sample_res["dr_variants"]:
            if drug != var["drug"].lower():
                continue
            fp[(var["gene"], var["locus_tag"], var["change"])] += 1

    for key in sorted(fn, key=lambda k: fn[k]):
        gene, rv, change = key
        if gene == ".":
            # "." is replaced by the locus tag so the row stays identifiable.
            gene = rv
        print("False_Negative\t%s\t%s\t%s" % (gene, change, fn[key]))

    for key in sorted(fp, key=lambda k: fp[k]):
        gene, rv, change = key
        print("False_Positive\t%s\t%s\t%s" % (gene, change, fp[key]))

    if args.itol:
        write_itol(drug_results["fp"], drug_results["fn"], args.drug)
# Maps (phenotypic DST call, genotypic call) to its confusion-matrix cell.
# Any pair not listed here (e.g. a DST call of "NA") is not counted.
_CONFUSION_CELLS = {
    ("1", "1"): "tp",
    ("0", "1"): "fp",
    ("1", "0"): "fn",
    ("0", "0"): "tn",
}


def _tally(results, counts, category, sample, dst_call, gst_call):
    """Record ``sample`` under ``category`` in the cell matching both calls."""
    cell = _CONFUSION_CELLS.get((dst_call, gst_call))
    if cell is None:
        return
    results[category][cell].append(sample)
    counts[category][cell] += 1


def _class_calls(sample_dst, drug_class):
    """Return the DST calls of the ``drug_class`` drugs present in ``sample_dst``."""
    return [sample_dst[d] for d in drug_class if d in sample_dst]


def calculate(args):
    """Compare genotypic predictions with DST calls and print a summary table.

    For every sample in ``args.samples`` the per-sample result file
    ``<args.dir>/<sample>.results.json`` is read and its ``dr_variants`` are
    compared with the phenotypic calls returned by ``load_dst(args.dst)``.
    Each sample is tallied as tp/fp/fn/tn per drug and for the composite
    categories ``flq``, ``mdr``, ``xdr`` and ``sus``.

    Side effects:
        * writes ``samples_not_found.txt`` (samples without a result file),
        * writes ``results.json`` (sample lists) and ``counts.json`` (counts),
        * overwrites entries of the loaded DST table with "NA" for drugs
          whose locus exceeds ``args.miss`` in ``missing_regions``,
        * prints a tab-separated sensitivity/specificity table.

    Attributes read from ``args``: ``samples``, ``dst``, ``bed``, ``dir``,
    ``miss`` and ``drugs`` (optional file restricting the printed rows).

    Raises:
        IndexError: if the sample file is empty (the drug list is taken from
            the DST entry of the first sample).
        KeyError: if a sample with a result file is missing from the DST
            table, or lacks "rifampicin"/"isoniazid" entries.
    """
    import os

    dst = load_dst(args.dst)
    drug_loci = pp.load_bed(args.bed, [6], 4)  # {'Rv0668': ('rifampicin')}
    with open(args.samples) as handle:
        samples = [line.rstrip() for line in handle]
    ext = ".results.json"
    drugs = [d.lower() for d in dst[samples[0]].keys()]
    categories = drugs + ["flq", "mdr", "xdr", "sus"]
    results = {
        c: {"tp": [], "tn": [], "fp": [], "fn": []} for c in categories
    }
    counts = {c: {"tp": 0, "tn": 0, "fp": 0, "fn": 0} for c in categories}
    pre = args.dir if args.dir else ""

    with open("samples_not_found.txt", "w") as fail_log:
        for s in tqdm(samples):
            # os.path.join keeps the path relative when no directory is
            # given; plain "%s/%s" formatting would point at "/<sample>...".
            res_file = os.path.join(pre, s + ext)
            if pp.nofile(res_file):
                pp.log("Warning: %s does not exist!" % res_file)
                fail_log.write("%s\n" % s)
                continue
            with open(res_file) as handle:
                res = json.load(handle)

            # Drugs whose locus has too much missing data are not assessed.
            # NOTE(review): names are split on "," and used without
            # lower-casing here, whereas analyse() splits the same BED column
            # on ";" and lower-cases it -- confirm which matches the BED file.
            # NOTE(review): a non-numeric value such as "NA" in
            # missing_regions would make this comparison fail on Python 3.
            na_drugs = set()
            for locus in drug_loci:
                if res["missing_regions"][locus] > args.miss:
                    na_drugs.update(drug_loci[locus][0].split(","))

            resistant_drugs = {v["drug"].lower() for v in res["dr_variants"]}
            sample_dst = dst[s]

            for d in drugs:
                if d in na_drugs:
                    sample_dst[d] = "NA"
                gst_call = "1" if d in resistant_drugs else "0"
                _tally(results, counts, d, s, sample_dst[d], gst_call)

            flq_calls = _class_calls(sample_dst, fluoroquinolones)
            amg_calls = _class_calls(sample_dst, aminoglycosides)
            # True also when no drug of the class is present at all.
            flq_untested = all(call == "NA" for call in flq_calls)
            amg_untested = all(call == "NA" for call in amg_calls)
            gst_flq_res = any(d in resistant_drugs for d in fluoroquinolones)
            gst_amg_res = any(d in resistant_drugs for d in aminoglycosides)

            #### Fluoroquinolones ####
            if flq_untested or ("1" in flq_calls and "0" in flq_calls):
                # Untested, or discordant calls within the class.
                dst_flq = "NA"
            elif "1" in flq_calls:
                dst_flq = "1"
            else:
                dst_flq = "0"
            gst_flq = "1" if gst_flq_res else "0"
            _tally(results, counts, "flq", s, dst_flq, gst_flq)

            #### MDR & XDR ####
            rif = sample_dst["rifampicin"]
            inh = sample_dst["isoniazid"]
            if rif == "NA" or inh == "NA":
                dst_mdr = "NA"
            elif rif == "1" and inh == "1":
                dst_mdr = "1"
            else:
                dst_mdr = "0"

            if dst_mdr == "NA" or flq_untested or amg_untested:
                dst_xdr = "NA"
            elif dst_mdr == "1" and "1" in flq_calls and "1" in amg_calls:
                dst_xdr = "1"
            else:
                dst_xdr = "0"

            #### Profiling results #####
            if "rifampicin" in resistant_drugs and "isoniazid" in resistant_drugs:
                gst_mdr = "1"
            else:
                gst_mdr = "0"
            if gst_mdr == "1" and gst_flq_res and gst_amg_res:
                gst_xdr = "1"
            else:
                gst_xdr = "0"

            _tally(results, counts, "mdr", s, dst_mdr, gst_mdr)
            _tally(results, counts, "xdr", s, dst_xdr, gst_xdr)

            ### susceptibility
            # Only assessed when every first-line drug has a DST call.
            if "NA" not in [sample_dst[d] for d in first_line]:
                dst_sus = "1" if "1" not in [sample_dst[d] for d in drugs] else "0"
                if all(d not in resistant_drugs for d in first_line):
                    gst_sus = "1"
                else:
                    gst_sus = "0"
                _tally(results, counts, "sus", s, dst_sus, gst_sus)

    with open("results.json", "w") as handle:
        json.dump(results, handle)
    with open("counts.json", "w") as handle:
        json.dump(counts, handle)

    if args.drugs:
        with open(args.drugs) as handle:
            drugs = [line.rstrip().lower() for line in handle]
    else:
        drugs = list(counts.keys())

    print("Drug\tNum\tSusceptible\tResistant\tSensitivity\tSpecificity")
    for d in drugs:
        if d not in counts:
            continue
        cell = counts[d]
        res = cell["tp"] + cell["fn"]
        suc = cell["tn"] + cell["fp"]
        if res == 0 or suc == 0:
            # Sensitivity or specificity would be undefined; skip the row.
            continue
        sensitivity = cell["tp"] / res
        specificity = cell["tn"] / suc
        total = cell["tp"] + cell["fp"] + cell["tn"] + cell["fn"]
        print("%s\t%s\t%s\t%s\t%s\t%s" %
              (d.capitalize(), total, suc, res, sensitivity, specificity))
def profile_vcf(filename, conf):
    """Profile a VCF file and return the result dictionary.

    The VCF is normalised with ``bcftools`` (a synthetic ``FORMAT/AD`` field
    of ``0,100`` is annotated onto every record when the header declares no
    AD field), restricted to ``conf["bed"]``, passed through ``bcftools csq``
    and then loaded via ``pp.bcf``. Barcode genotypes are taken from the
    full normalised VCF.

    Args:
        filename: path of the input VCF, interpolated into shell commands.
        conf: mapping providing at least ``bed``, ``ref``, ``gff``, ``ann``,
            ``barcode`` and ``json_db``. It is copied, not modified.

    Returns:
        The value returned by ``pp.db_compare`` with a ``missing_regions``
        entry added that maps every BED region to "NA".

    Side effects:
        Runs external commands through ``pp``, writes ``dump.json`` in the
        current directory and removes the temporary files it created.
    """
    params = conf.copy()
    params["tmpvcf"] = pp.get_random_file(extension=".vcf.gz")
    params["tmpcsq"] = pp.get_random_file(extension=".vcf.gz")
    params["filename"] = filename
    params["tmphdr"] = pp.get_random_file()
    params["tmptxt"] = pp.get_random_file()

    # Exhaust the grep output; a non-empty last line means the header
    # already declares FORMAT/AD.
    last_line = ""
    for last_line in pp.cmd_out(
            "bcftools view %(filename)s -h | grep \"^##FORMAT=<ID=AD\"" %
            params):
        pass
    ad_found = last_line != ""

    if not ad_found:
        with open(params["tmphdr"], "w") as handle:
            handle.write(
                "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">\n"
            )
        pp.run_cmd(
            "bcftools query -f '%%CHROM\\t%%POS\\t%%REF\\t%%ALT\\t.[\\t0,100]\\n' %(filename)s > %(tmptxt)s"
            % params)
        pp.run_cmd("bgzip %(tmptxt)s" % params)
        pp.run_cmd("tabix -s 1 -b 2 -p vcf %(tmptxt)s.gz" % params)
        pp.run_cmd(
            "bcftools view -a %(filename)s | bcftools annotate -a %(tmptxt)s.gz -c CHROM,POS,REF,ALT,-,FMT/AD -h %(tmphdr)s -Oz -o %(tmpvcf)s"
            % params)
    else:
        pp.run_cmd("bcftools view -a %(filename)s -Oz -o %(tmpvcf)s" % params)

    pp.run_cmd(
        "bcftools view -T %(bed)s %(tmpvcf)s | bcftools csq -f %(ref)s -g %(gff)s -Oz -o %(tmpcsq)s -p a"
        % params)
    csq = pp.bcf(params["tmpcsq"]).load_csq(ann_file=conf["ann"])

    results = {
        "variants": [],
        "missing_pos": [],
        "qc": {
            "pct_reads_mapped": "NA",
            "num_reads_mapped": "NA"
        }
    }
    # With several samples in the VCF only the last one's variants are kept.
    for sample in csq:
        results["variants"] = csq[sample]

    mutations = pp.bcf(params["tmpvcf"]).get_bed_gt(conf["barcode"],
                                                    conf["ref"])

    # (position, allele observed with value 50, replacement allele).
    # NOTE(review): the reason for these four positions and for the
    # 50 -> 25 values is not visible in this function -- confirm.
    genotype_overrides = (
        (325505, "C", "T"),
        (599868, "G", "A"),
        (931123, "C", "T"),
        (1759252, "T", "G"),
    )
    chrom_calls = mutations["Chromosome"]
    for pos, observed, replacement in genotype_overrides:
        if observed in chrom_calls[pos] and chrom_calls[pos][observed] == 50:
            chrom_calls[pos] = {replacement: 25}

    with open("dump.json", "w") as handle:
        json.dump(mutations, handle)

    results["barcode"] = pp.barcode(mutations, conf["barcode"])
    results = pp.db_compare(db_file=conf["json_db"], mutations=results)

    bed_regions = pp.load_bed(conf["bed"], [4], 4)
    results["missing_regions"] = {gene: "NA" for gene in bed_regions}

    # The normalised VCF is a temporary file too and is not referenced past
    # this point, so it is removed together with the other intermediates.
    if ad_found:
        pp.run_cmd("rm %(tmpcsq)s %(tmpvcf)s" % params)
    else:
        pp.run_cmd(
            "rm %(tmpcsq)s %(tmpvcf)s %(tmphdr)s %(tmptxt)s*" % params)
    return results