# Shared imports for the scripts below. pathogenprofiler is imported as pp and
# tbprofiler as tbp, following the TB-Profiler convention; get_conf_dict,
# load_dst, fluoroquinolones, aminoglycosides and first_line are assumed to be
# provided elsewhere in the package.
import csv
import json
import os
import sys
from collections import defaultdict

import ete3
from tqdm import tqdm
import pathogenprofiler as pp
import tbprofiler as tbp


def main(args):
    # Collect the union of samples across all sample-set files
    total_samples = set()
    sample_sets = {}
    for f in args.samples.split(","):
        sample_sets[f] = [s.rstrip() for s in open(f).readlines()]
        for s in sample_sets[f]:
            total_samples.add(s)
    mutations = defaultdict(set)
    columns = ["gene", "change"] + (args.columns.split(",") if args.columns else [])
    for s in tqdm(total_samples):
        if pp.nofile("%s/%s.results.json" % (args.dir, s)):
            continue
        tmp = json.load(open("%s/%s.results.json" % (args.dir, s)))
        for var in tmp["dr_variants"]:
            tmp_var = {x: var[x] for x in columns}
            mutations[json.dumps(tmp_var)].add(s)
        if args.non_dr:
            for var in tmp["other_variants"]:
                tmp_var = {x: var[x] for x in columns}
                mutations[json.dumps(tmp_var)].add(s)
    if args.summary:
        O = open(args.summary, "w")
        O.write("%s\n" % ("\t".join(
            ["Gene", "Mutation"] + columns +
            ["%s_num\t%s_pct" % (x, x) if args.pct else x for x in list(sample_sets)])))
        for var_string in mutations:
            var = json.loads(var_string)  # bug fix: var was never parsed back from its JSON key
            if "gene_name" not in var:
                var["gene_name"] = var["gene"]  # Fix for large deletions not having this key
            tmp_freqs = []
            for f in sample_sets:
                num = len([s for s in sample_sets[f] if s in mutations[var_string]])
                if args.pct:
                    pct = num / len(sample_sets[f]) * 100
                    tmp_freqs.append("%s\t%.2f" % (num, pct))
                else:
                    tmp_freqs.append("%s" % num)
            O.write("%s\t%s\t%s\t%s\n" % (
                var["gene_name"], var["change"],
                "\t".join([str(var[x]) for x in columns]),
                "\t".join(tmp_freqs)))
        O.close()
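
# A minimal sketch (not in the original script) of the keying trick used above:
# variant dicts are not hashable, so they are serialised with json.dumps to act
# as dictionary keys, then recovered with json.loads when writing the summary.
# The sample and variant names below are made up for illustration.
def _example_variant_keying():
    import json
    from collections import defaultdict
    mutations = defaultdict(set)
    for sample, var in [("S1", {"gene": "rpoB", "change": "p.Ser450Leu"}),
                        ("S2", {"gene": "rpoB", "change": "p.Ser450Leu"})]:
        # sort_keys makes the serialisation order-independent
        mutations[json.dumps(var, sort_keys=True)].add(sample)
    for var_string, samples in mutations.items():
        var = json.loads(var_string)  # recover the dict, as the summary loop must do
        print(var["gene"], var["change"], len(samples))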
def main(args):
    change_field = "change" if args.variant_format == "hgvs" else "_internal_change"
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)
    # Map drugs to genes (and back) from the BED file
    drug2genes = defaultdict(set)
    gene2drugs = defaultdict(set)
    for l in open(conf["bed"]):
        row = l.rstrip().split()
        for d in row[5].split(","):
            drug2genes[d].add(row[3])
            gene2drugs[row[3]].add(d)
    if args.samples:
        samples = [x.rstrip() for x in open(args.samples).readlines()]
    else:
        samples = [x.replace(".results.json", "") for x in os.listdir("results/") if x[-13:] == ".results.json"]
    variants = defaultdict(lambda: defaultdict(list))
    # CSV header
    if args.non_dr:
        print("sample,%s" % (",".join(["%s,%s" % ("dr_mutations_%s" % x, "other_mutations_%s" % x) for x in sorted(drug2genes)])))
    else:
        print("sample,%s" % (",".join(["dr_mutations_%s" % x for x in sorted(drug2genes)])))
    for s in tqdm(samples):
        if pp.nofile("%s/%s.results.json" % (args.dir, s)):
            if args.non_dr:
                print("%s,%s" % (s, ",".join(["NA,NA" for _ in drug2genes])))
            else:
                print("%s,%s" % (s, ",".join(["NA" for _ in drug2genes])))
            continue
        tmp = json.load(open("%s/%s.results.json" % (args.dir, s)))
        sample_dr_mutations = defaultdict(list)
        # Round-trip through JSON strings to remove duplicate mutations
        # introduced by a bug that is now fixed
        tmp["dr_variants"] = [json.loads(x) for x in set(json.dumps(x) for x in tmp["dr_variants"])]
        for var in tmp["dr_variants"]:
            sample_dr_mutations[var["drug"]].append(var["gene"] + "_" + var[change_field])
        if args.non_dr:
            sample_other_mutations = defaultdict(list)
            # Deduplicate, as above
            tmp["other_variants"] = [json.loads(x) for x in set(json.dumps(x) for x in tmp["other_variants"])]
            for var in tmp["other_variants"]:
                for d in gene2drugs[var["locus_tag"]]:
                    sample_other_mutations[d].append(var["gene"] + "_" + var[change_field])
            print("%s,%s" % (s, ",".join([
                "%s,%s" % (
                    "; ".join(sample_dr_mutations[d]) if d in sample_dr_mutations else "WT",
                    "; ".join(sample_other_mutations[d]) if d in sample_other_mutations else "WT")
                for d in sorted(drug2genes)])))
        else:
            print("%s,%s" % (s, ",".join([
                "; ".join(sample_dr_mutations[d]) if d in sample_dr_mutations else "WT"
                for d in sorted(drug2genes)])))
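
# A minimal sketch (not in the original script) of the round-trip
# deduplication used above: identical variant dicts collapse to a single
# JSON string inside a set. The variant values are illustrative only.
def _example_dedup_variants():
    import json
    variants = [{"gene": "katG", "change": "p.Ser315Thr"},
                {"gene": "katG", "change": "p.Ser315Thr"}]
    deduped = [json.loads(x) for x in set(json.dumps(v, sort_keys=True) for v in variants)]
    assert len(deduped) == 1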
def phylogeny(prefix, conf_file, sample_file=None, base_dir=".", threads=3):
    conf = json.load(open(conf_file))
    if sample_file:
        samples = [x.rstrip() for x in open(sample_file).readlines()]
    else:
        samples = [x.replace(".results.json", "") for x in os.listdir("results/") if x[-13:] == ".results.json"]
    samples_file = pp.get_random_file()
    OUT = open(samples_file, "w")
    OUT.write("%s\n" % "\n".join(samples))
    OUT.close()
    # Generate a per-sample genome gBCF where one does not already exist
    for s in samples:
        tprefix = s + ".genome"
        gbcf_file = "%s.gbcf" % tprefix
        if pp.nofile("%s/vcf/%s.genome.gbcf" % (base_dir, s)):
            bam_file = "%s/bam/%s.bam" % (base_dir, s)
            bam_obj = pp.bam(bam_file, s, conf["ref"])
            bam_obj.gbcf(prefix=tprefix)
            pp.run_cmd("mv %s* %s/vcf" % (gbcf_file, base_dir))
    cmd = "merge_vcfs.py %s %s %s --vcf_dir %s/vcf/ --vcf_ext genome.gbcf" % (samples_file, conf["ref"], prefix, base_dir)
    print(cmd)
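
# Hypothetical usage sketch (file names are assumptions, not from the original):
#   phylogeny("run1", "conf.json", sample_file="samples.txt", base_dir=".", threads=4)
# prints a merge command of the form
#   merge_vcfs.py <random_samples_file> <ref> run1 --vcf_dir ./vcf/ --vcf_ext genome.gbcf
# which is left for the user to run rather than executed directly.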
def calculate(args):
    sample_file = args.samples
    dst_file = args.dst
    dst = load_dst(dst_file)
    drug_loci = pp.load_bed(args.bed, [6], 4)  # e.g. {'Rv0668': ('rifampicin',)}
    FAIL = open("samples_not_found.txt", "w")
    samples = [x.rstrip() for x in open(sample_file).readlines()]
    ext = ".results.json"
    drugs = [d.lower() for d in dst[samples[0]].keys()]
    results = {d: {"tp": [], "tn": [], "fp": [], "fn": []} for d in drugs + ["flq", "mdr", "xdr", "sus"]}
    counts = {d: {"tp": 0, "tn": 0, "fp": 0, "fn": 0} for d in drugs + ["flq", "mdr", "xdr", "sus"]}
    pre = args.dir if args.dir else ""
    for s in tqdm(samples):
        res_file = "%s/%s%s" % (pre, s, ext)
        if pp.nofile(res_file):
            pp.log("Warning: %s does not exist!" % res_file)
            FAIL.write("%s\n" % s)
            continue
        res = json.load(open(res_file))
        # Mark drugs as untestable (NA) when their loci are poorly covered
        na_drugs = set()
        for locus in drug_loci:
            if res["missing_regions"][locus] > args.miss:
                for tmp in drug_loci[locus][0].split(","):
                    na_drugs.add(tmp)
        resistant_drugs = [d["drug"].lower() for d in res["dr_variants"]]
        for d in drugs:
            if d in na_drugs:
                dst[s][d] = "NA"
        # Per-drug confusion matrix: DST phenotype vs genotypic prediction
        for d in drugs:
            if dst[s][d] == "0" and d not in resistant_drugs:
                results[d]["tn"].append(s)
                counts[d]["tn"] += 1
            elif dst[s][d] == "0" and d in resistant_drugs:
                results[d]["fp"].append(s)
                counts[d]["fp"] += 1
            elif dst[s][d] == "1" and d not in resistant_drugs:
                results[d]["fn"].append(s)
                counts[d]["fn"] += 1
            elif dst[s][d] == "1" and d in resistant_drugs:
                results[d]["tp"].append(s)
                counts[d]["tp"] += 1
        #### Fluoroquinolones ####
        dst_flq = "0"
        dst_flq_NA = True
        for d in fluoroquinolones:
            if d not in dst[s]:
                continue
            if dst[s][d] != "NA":
                dst_flq_NA = False
            if dst[s][d] == "1":
                dst_flq = "1"
        dst_flq_list = [dst[s][d] for d in fluoroquinolones if d in dst[s]]
        if "1" in dst_flq_list and "0" in dst_flq_list:
            dst_flq = "NA"
        if dst_flq_NA:
            dst_flq = "NA"
        gst_flq = "0"
        for d in fluoroquinolones:
            if d in resistant_drugs:
                gst_flq = "1"
        if dst_flq == "1" and gst_flq == "1":
            results["flq"]["tp"].append(s)
            counts["flq"]["tp"] += 1
        if dst_flq == "0" and gst_flq == "1":
            results["flq"]["fp"].append(s)
            counts["flq"]["fp"] += 1
        if dst_flq == "1" and gst_flq == "0":
            results["flq"]["fn"].append(s)
            counts["flq"]["fn"] += 1
        if dst_flq == "0" and gst_flq == "0":
            results["flq"]["tn"].append(s)
            counts["flq"]["tn"] += 1
        #### MDR & XDR ####
        dst_mdr = "1" if dst[s]["rifampicin"] == "1" and dst[s]["isoniazid"] == "1" else "0"
        if dst[s]["rifampicin"] == "NA" or dst[s]["isoniazid"] == "NA":
            dst_mdr = "NA"
        flq = False
        flq_NA = True
        for d in fluoroquinolones:
            if d not in dst[s]:
                continue
            if dst[s][d] != "NA":
                flq_NA = False
            if dst[s][d] == "1":
                flq = True
        amg = False
        amg_NA = True
        for d in aminoglycosides:
            if d not in dst[s]:
                continue
            if dst[s][d] != "NA":
                amg_NA = False
            if dst[s][d] == "1":
                amg = True
        dst_xdr = "1" if dst_mdr == "1" and flq and amg else "0"
        if flq_NA or amg_NA:
            dst_xdr = "NA"
        if dst_mdr == "NA":
            dst_xdr = "NA"
        #### Profiling results ####
        gst_mdr = "1" if "rifampicin" in resistant_drugs and "isoniazid" in resistant_drugs else "0"
        flq = False
        for d in fluoroquinolones:
            if d in resistant_drugs:
                flq = True
        amg = False
        for d in aminoglycosides:
            if d in resistant_drugs:
                amg = True
        gst_xdr = "1" if gst_mdr == "1" and flq and amg else "0"
        if dst_mdr == "1" and gst_mdr == "1":
            results["mdr"]["tp"].append(s)
            counts["mdr"]["tp"] += 1
        if dst_mdr == "0" and gst_mdr == "1":
            results["mdr"]["fp"].append(s)
            counts["mdr"]["fp"] += 1
        if dst_mdr == "1" and gst_mdr == "0":
            results["mdr"]["fn"].append(s)
            counts["mdr"]["fn"] += 1
        if dst_mdr == "0" and gst_mdr == "0":
            results["mdr"]["tn"].append(s)
            counts["mdr"]["tn"] += 1
        if dst_xdr == "1" and gst_xdr == "1":
            results["xdr"]["tp"].append(s)
            counts["xdr"]["tp"] += 1
        if dst_xdr == "0" and gst_xdr == "1":
            results["xdr"]["fp"].append(s)
            counts["xdr"]["fp"] += 1
        if dst_xdr == "1" and gst_xdr == "0":
            results["xdr"]["fn"].append(s)
            counts["xdr"]["fn"] += 1
        if dst_xdr == "0" and gst_xdr == "0":
            results["xdr"]["tn"].append(s)
            counts["xdr"]["tn"] += 1
        #### Susceptibility ####
        if "NA" not in [dst[s][d] for d in first_line]:
            dst_sus = "1" if "1" not in [dst[s][d] for d in drugs] else "0"
            gst_sus = "1" if all(x not in resistant_drugs for x in first_line) else "0"
            if dst_sus == "1" and gst_sus == "1":
                results["sus"]["tp"].append(s)
                counts["sus"]["tp"] += 1
            if dst_sus == "0" and gst_sus == "1":
                results["sus"]["fp"].append(s)
                counts["sus"]["fp"] += 1
            if dst_sus == "1" and gst_sus == "0":
                results["sus"]["fn"].append(s)
                counts["sus"]["fn"] += 1
            if dst_sus == "0" and gst_sus == "0":
                results["sus"]["tn"].append(s)
                counts["sus"]["tn"] += 1
    json.dump(results, open("results.json", "w"))
    json.dump(counts, open("counts.json", "w"))
    counts = json.load(open("counts.json"))
    drugs = [x.rstrip().lower() for x in open(args.drugs).readlines()] if args.drugs else list(counts.keys())
    print("Drug\tNum\tSusceptible\tResistant\tSensitivity\tSpecificity")
    for d in drugs:
        if d not in counts:
            continue
        # Skip drugs where sensitivity or specificity would divide by zero
        if counts[d]["tp"] + counts[d]["fn"] == 0 or counts[d]["tn"] + counts[d]["fp"] == 0:
            continue
        sensitivity = counts[d]["tp"] / (counts[d]["tp"] + counts[d]["fn"])
        specificity = counts[d]["tn"] / (counts[d]["tn"] + counts[d]["fp"])
        total = counts[d]["tp"] + counts[d]["fp"] + counts[d]["tn"] + counts[d]["fn"]
        susceptible = counts[d]["tn"] + counts[d]["fp"]
        resistant = counts[d]["tp"] + counts[d]["fn"]
        print("%s\t%s\t%s\t%s\t%s\t%s" % (d.capitalize(), total, susceptible, resistant, sensitivity, specificity))
def main(args):
    vcf_class = pp.vcf(args.vcf)
    vcf_positions = vcf_class.get_positions()
    if not args.fasta:
        if not args.ref:
            sys.stderr.write("\nERROR: Please supply a reference with --ref\n\n")
            quit()
        pp.run_cmd("vcf2fasta.py --vcf %(vcf)s --snps --ref %(ref)s --snps-no-filt" % vars(args))
        args.fasta = "%s.snps.fa" % vcf_class.prefix
    # Run IQ-TREE ancestral state reconstruction if not already done
    if pp.nofile("%s.asr.state" % args.fasta):
        pp.run_cmd("iqtree -m %(model)s -te %(tree)s -s %(fasta)s -nt AUTO -asr -pre %(fasta)s.asr" % vars(args))
    tree = ete3.Tree("%s.asr.treefile" % args.fasta, format=1)
    node_names = set([tree.name] + [n.name.split("/")[0] for n in tree.get_descendants()])
    leaf_names = set(tree.get_leaf_names())
    internal_node_names = node_names - leaf_names
    # Load reconstructed states for internal nodes
    states_file = "%s.asr.state" % args.fasta
    states = defaultdict(dict)
    sys.stderr.write("Loading states\n")
    for l in tqdm(open(states_file)):
        if l[0] == "#":
            continue
        row = l.strip().split()
        if row[0] == "Node":
            continue
        site = int(row[1])
        if row[0] not in internal_node_names:
            continue
        states[site][row[0]] = row[2]
    # Fill in observed states for the leaves from the alignment
    seqs = pp.fasta(args.fasta).fa_dict
    for site in tqdm(list(states)):
        for sample in seqs:
            states[site][sample] = seqs[sample][site - 1]
    acgt = set(["A", "C", "G", "T", "a", "c", "g", "t"])
    convergent_sites = []
    for site in tqdm(list(states)):
        nucleotides = set([states[site][n] for n in node_names])
        if len(nucleotides) == 1:
            continue
        # Count independent origins: nodes whose state differs from their parent's
        origins = []
        tree.add_feature("state", states[site][tree.name])
        for n in tree.traverse():
            if n == tree:
                continue
            node_state = states[site][n.name]
            parent_state = n.get_ancestors()[0].state
            if node_state != parent_state and node_state in acgt and parent_state in acgt:
                origins.append(n.name)
            n.add_feature("state", node_state)
        if len(origins) > 1:
            convergent_sites.append((site, vcf_positions[site - 1], origins))
    with open(args.out, "w") as O:
        for site in convergent_sites:
            O.write("%s\t%s\n" % (site[1][1], len(site[2])))
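
# A minimal sketch (not in the original) of the origin-counting rule above:
# a site gains one "origin" whenever a node's state differs from its parent's,
# and more than one origin marks the site as convergent (homoplasy).
# The toy tree and state assignments are assumptions for illustration.
def _example_count_origins():
    import ete3
    tree = ete3.Tree("((A:1,B:1)N1:1,(C:1,D:1)N2:1);", format=1)
    tree.name = "R"  # ensure the root is named for the lookup below
    states = {"R": "A", "N1": "T", "A": "T", "B": "T", "N2": "A", "C": "T", "D": "A"}
    acgt = set("ACGT")
    origins = []
    for n in tree.traverse():
        if n.is_root():
            continue
        node_state, parent_state = states[n.name], states[n.up.name]
        if node_state != parent_state and node_state in acgt and parent_state in acgt:
            origins.append(n.name)
    return len(origins) > 1  # True: T arose independently on N1 and on C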
def main_profile(args):
    #### Set up conf dictionary ####
    if args.db == "tbdb" and not args.external_db and pp.nofile(sys.base_prefix + "/share/tbprofiler/tbdb.fasta"):
        pp.log("Can't find the tbdb file at %s. Please run 'tb-profiler update_tbdb' to load the default library or specify another using the '--external_db' flag" % sys.base_prefix, ext=True)
    if args.external_db:
        conf = get_conf_dict(args.external_db)
    else:
        conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    ### Create folders for results if they don't exist ###
    if pp.nofolder(args.dir):
        os.mkdir(args.dir)
    for x in ["bam", "vcf", "results"]:
        if pp.nofolder(args.dir + "/" + x):
            os.mkdir(args.dir + "/" + x)

    ### Set up platform-dependent parameters ###
    if args.platform == "nanopore":
        args.mapper = "minimap2"
        args.caller = "bcftools"
        args.no_trim = True
        run_delly = False
    else:
        run_delly = False if args.no_delly else True

    ### Set up prefix for files ###
    files_prefix = args.dir + "/" + args.prefix

    ### Create bam file if fastq has been supplied ###
    if args.bam is None:
        if args.read1 and args.read2 and args.no_trim:
            # Paired + no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read1 and args.read2 and not args.no_trim:
            # Paired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1, args.read2)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix, threads=args.threads)
        elif args.read1 and not args.read2 and args.no_trim:
            # Unpaired + no trimming (bug fix: args.read2 is None here and must not be passed)
            fastq_obj = pp.fastq(args.read1)
        elif args.read1 and not args.read2 and not args.no_trim:
            # Unpaired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix, threads=args.threads)
        else:
            exit("\nPlease provide a bam file or fastq file(s)... Exiting!\n")
        bam_obj = fastq_obj.map_to_ref(
            ref_file=conf["ref"], prefix=files_prefix, sample_name=args.prefix,
            aligner=args.mapper, platform=args.platform, threads=args.threads)
        bam_file = bam_obj.bam_file
    else:
        bam_file = args.bam

    run_coverage = False if args.no_coverage else True

    ### Run profiling module from pathogen-profiler ###
    results = pp.bam_profiler(
        conf=conf, bam_file=bam_file, prefix=files_prefix,
        platform=args.platform, caller=args.caller, threads=args.threads,
        no_flagstat=args.no_flagstat, run_delly=run_delly,
        calling_params=args.calling_params,
        coverage_fraction_threshold=args.coverage_fraction_threshold,
        missing_cov_threshold=args.missing_cov_threshold,
        delly_bcf_file=args.delly_bcf_file)
    json.dump(results, open(args.prefix + ".tmp_results.json", "w"))

    ### Reformat the results to TB-Profiler style ###
    results = tbp.reformat(results, conf, reporting_af=args.reporting_af)
    results["id"] = args.prefix
    results["tbprofiler_version"] = tbp._VERSION
    results["pipeline"] = {
        "mapper": args.mapper if not args.bam else "N/A",
        "variant_caller": args.caller}

    json_output = args.dir + "/results/" + args.prefix + ".results.json"
    tex_output = args.dir + "/results/" + args.prefix + ".results.tex"
    text_output = args.dir + "/results/" + args.prefix + ".results.txt"
    csv_output = args.dir + "/results/" + args.prefix + ".results.csv"

    json.dump(results, open(json_output, "w"))
    extra_columns = [x.lower() for x in args.add_columns.split(",")] if args.add_columns else []
    if args.pdf:
        tbp.write_tex(results, conf, tex_output, extra_columns)
        pp.run_cmd("pdflatex %s" % tex_output, verbose=1)
        pp.rm_files([
            tex_output,
            args.dir + "/" + args.prefix + ".results.aux",
            args.dir + "/" + args.prefix + ".results.log"])
    if args.txt:
        tbp.write_text(results, conf, text_output, extra_columns, reporting_af=args.reporting_af)
    if args.csv:
        tbp.write_csv(results, conf, csv_output, extra_columns)

    ### Move files to their respective directories ###
    if not args.bam:
        pp.run_cmd("mv %(dir)s/%(prefix)s.bam* %(dir)s/bam/" % vars(args))
        if not args.no_trim:
            pp.run_cmd("rm -f %s" % " ".join(fastq_obj.files))
    pp.run_cmd("mv -f %(dir)s/%(prefix)s*.vcf.gz* %(dir)s/vcf/" % vars(args))
    if run_delly and results["delly"] == "success" and not args.delly_bcf_file:
        pp.run_cmd("mv -f %(dir)s/%(prefix)s.delly.bcf* %(dir)s/vcf/" % vars(args))

    ### Add metadata to results ###
    if args.meta:
        for row in csv.DictReader(open(args.meta)):
            if row["id"] == results["id"]:
                for col in row:
                    results["meta_" + col] = row[col]
    pp.log("Profiling finished successfully!")
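
# A minimal sketch (not part of the pipeline above) showing how a downstream
# script might read the per-sample results JSON that main_profile writes into
# <dir>/results/. The sample name and directory are illustrative assumptions;
# the drug/gene/change keys match those consumed by the collation scripts above.
def _example_read_results(prefix="sample1", results_dir="results"):
    import json
    results = json.load(open("%s/%s.results.json" % (results_dir, prefix)))
    for var in results["dr_variants"]:
        print(var["drug"], var["gene"], var["change"])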