Exemplo n.º 1
0
def main(args):
    """Tabulate how often each variant occurs in one or more sample sets.

    Args:
        args: Parsed command-line namespace. The attributes read here are:
            ``samples`` (comma-separated paths of files listing one sample
            name per line), ``dir`` (folder holding ``<sample>.results.json``
            files), ``columns`` (optional comma-separated extra variant
            fields), ``non_dr`` (also include ``other_variants``),
            ``summary`` (optional output path) and ``pct`` (add a percentage
            column per sample set).

    Samples without a results file are skipped silently. When
    ``args.summary`` is set, a tab-separated table is written with one row
    per distinct variant and one count (and optionally percentage) per
    sample set.
    """
    sample_sets = {}
    total_samples = set()
    for f in args.samples.split(","):
        with open(f) as sample_fh:
            sample_sets[f] = [line.rstrip() for line in sample_fh]
        total_samples.update(sample_sets[f])

    # Maps the JSON-encoded variant (restricted to `columns`) to the set of
    # samples carrying it; the JSON string is used because dicts can't be keys.
    mutations = defaultdict(set)
    columns = ["gene", "change"
               ] + (args.columns.split(",") if args.columns else [])

    for s in tqdm(total_samples):
        result_file = "%s/%s.results.json" % (args.dir, s)
        if pp.nofile(result_file):
            continue
        with open(result_file) as result_fh:
            tmp = json.load(result_fh)
        variants = list(tmp["dr_variants"])
        if args.non_dr:
            variants.extend(tmp["other_variants"])
        for var in variants:
            tmp_var = {x: var[x] for x in columns}
            mutations[json.dumps(tmp_var)].add(s)

    if args.summary:
        with open(args.summary, "w") as O:
            O.write("%s\n" % ("\t".join(["Gene", "Mutation"] + columns + [
                "%s_num\t%s_pct" % (x, x) if args.pct else x
                for x in list(sample_sets)
            ])))
            for var_string in mutations:
                # Decode the key back into the variant it represents; the
                # previous code reused a stale loop variable here.
                var = json.loads(var_string)
                # Large deletions may lack "gene_name"; fall back to "gene".
                gene_name = var.get("gene_name", var["gene"])

                carriers = mutations[var_string]
                tmp_freqs = []
                for f in sample_sets:
                    set_size = len(sample_sets[f])
                    num = sum(1 for s in sample_sets[f] if s in carriers)
                    if args.pct:
                        # An empty sample file would otherwise divide by zero.
                        pct = num / set_size * 100 if set_size else 0.0
                        tmp_freqs.append("%s\t%.2f" % (num, pct))
                    else:
                        tmp_freqs.append("%s" % num)
                O.write("%s\t%s\t%s\t%s\n" %
                        (gene_name, var["change"], "\t".join(
                            [str(var[x]) for x in columns]),
                         "\t".join(tmp_freqs)))
Exemplo n.º 2
0
def _unique_variants(variants):
    """Return `variants` without exact duplicates, keeping first-seen order."""
    seen = set()
    unique = []
    for variant in variants:
        key = json.dumps(variant)
        if key not in seen:
            seen.add(key)
            unique.append(variant)
    return unique


def main(args):
    """Print a CSV with, per sample, the mutations found for every drug.

    Args:
        args: Parsed command-line namespace. The attributes read here are:
            ``variant_format`` ("hgvs" selects the ``change`` field, anything
            else ``_internal_change``), ``db`` (database name under
            ``<sys.base_prefix>/share/tbprofiler``), ``samples`` (optional
            file with one sample name per line), ``dir`` (folder holding
            ``<sample>.results.json``) and ``non_dr`` (add a column of other
            mutations per drug).

    One header row and one row per sample are printed to stdout. Samples
    without a results file get ``NA`` in every column; drugs with no mutation
    get ``WT``.
    """
    change_field = "change" if args.variant_format == "hgvs" else "_internal_change"
    conf = get_conf_dict(sys.base_prefix + "/share/tbprofiler/%s" % args.db)

    # Build the drug <-> gene lookups from the BED file: column 4 is used as
    # the gene key and column 6 as a comma-separated drug list.
    drug2genes = defaultdict(set)
    gene2drugs = defaultdict(set)
    with open(conf["bed"]) as bed_fh:
        for l in bed_fh:
            row = l.rstrip().split()
            for d in row[5].split(","):
                drug2genes[d].add(row[3])
                gene2drugs[row[3]].add(d)
    drugs = sorted(drug2genes)

    if args.samples:
        with open(args.samples) as sample_fh:
            samples = [x.rstrip() for x in sample_fh]
    else:
        # NOTE(review): samples are discovered in "results/" but read from
        # args.dir below — confirm the two are meant to be the same folder.
        samples = [x.replace(".results.json", "") for x in os.listdir("results/") if x[-13:] == ".results.json"]

    if args.non_dr:
        print("sample,%s" % (",".join(["%s,%s" % ("dr_mutations_%s" % x, "other_mutations_%s" % x) for x in drugs])))
    else:
        print("sample,%s" % (",".join(["dr_mutations_%s" % x for x in drugs])))

    for s in tqdm(samples):
        result_file = "%s/%s.results.json" % (args.dir, s)
        if pp.nofile(result_file):
            placeholder = "NA,NA" if args.non_dr else "NA"
            print("%s,%s" % (s, ",".join([placeholder for _ in drugs])))
            continue
        with open(result_file) as result_fh:
            tmp = json.load(result_fh)

        # Duplicates are dropped because an old (since fixed) bug could write
        # the same mutation twice; order is kept so output is reproducible.
        sample_dr_mutations = defaultdict(list)
        for var in _unique_variants(tmp["dr_variants"]):
            sample_dr_mutations[var["drug"]].append(var["gene"] + "_" + var[change_field])

        if args.non_dr:
            sample_other_mutations = defaultdict(list)
            for var in _unique_variants(tmp["other_variants"]):
                for d in gene2drugs.get(var["locus_tag"], ()):
                    sample_other_mutations[d].append(var["gene"] + "_" + var[change_field])
            cells = []
            for d in drugs:
                dr_cell = "; ".join(sample_dr_mutations[d]) if d in sample_dr_mutations else "WT"
                other_cell = "; ".join(sample_other_mutations[d]) if d in sample_other_mutations else "WT"
                cells.append("%s,%s" % (dr_cell, other_cell))
        else:
            cells = ["; ".join(sample_dr_mutations[d]) if d in sample_dr_mutations else "WT" for d in drugs]
        print("%s,%s" % (s, ",".join(cells)))
Exemplo n.º 3
0
def phylogeny(prefix, conf_file, sample_file=None, base_dir=".", threads=3):
    """Prepare per-sample genome gBCF files and print the VCF merge command.

    Args:
        prefix: Output prefix passed on to ``merge_vcfs.py``.
        conf_file: Path to a JSON configuration file; its ``"ref"`` entry is
            used as the reference.
        sample_file: Optional file with one sample name per line. When not
            given, sample names are taken from ``*.results.json`` files in
            ``results/``.
        base_dir: Folder containing the ``bam`` and ``vcf`` sub-folders.
        threads: Currently unused; kept for interface compatibility.

    For every sample lacking ``<base_dir>/vcf/<sample>.genome.gbcf`` the file
    is generated from ``<base_dir>/bam/<sample>.bam`` and moved into the
    ``vcf`` folder. The ``merge_vcfs.py`` command line is printed, not run.
    """
    with open(conf_file) as conf_fh:
        conf = json.load(conf_fh)

    if sample_file:
        with open(sample_file) as sample_fh:
            samples = [x.rstrip() for x in sample_fh]
    else:
        # NOTE(review): this lists "results/" relative to the working
        # directory rather than base_dir — confirm that is intended.
        samples = [x.replace(".results.json", "") for x in os.listdir("results/") if x[-13:] == ".results.json"]

    # The merge script takes the sample list as a file, so write one out.
    samples_file = pp.get_random_file()
    with open(samples_file, "w") as OUT:
        OUT.write("%s\n" % "\n".join(samples))

    for s in samples:
        tprefix = s + ".genome"
        gbcf_file = "%s.gbcf" % tprefix
        if pp.nofile("%s/vcf/%s.genome.gbcf" % (base_dir, s)):
            bam_file = "%s/bam/%s.bam" % (base_dir, s)
            bam_obj = pp.bam(bam_file, s, conf["ref"])
            bam_obj.gbcf(prefix=tprefix)
            # The trailing glob also moves any companion files (e.g. index).
            pp.run_cmd("mv %s* %s/vcf" % (gbcf_file, base_dir))
    cmd = "merge_vcfs.py %s %s %s --vcf_dir %s/vcf/ --vcf_ext genome.gbcf" % (samples_file, conf["ref"], prefix, base_dir)
    print(cmd)
def _record_outcome(results, counts, key, dst_value, predicted_resistant, sample):
    """File `sample` under tp/fp/fn/tn for `key`; skip it unless DST is "0" or "1"."""
    if dst_value == "1":
        outcome = "tp" if predicted_resistant else "fn"
    elif dst_value == "0":
        outcome = "fp" if predicted_resistant else "tn"
    else:
        return
    results[key][outcome].append(sample)
    counts[key][outcome] += 1


def calculate(args):
    """Compare genotypic resistance predictions with phenotypic DST results.

    Args:
        args: Parsed command-line namespace. The attributes read here are:
            ``samples`` (file with one sample name per line), ``dst`` (DST
            file, loaded with ``load_dst``), ``bed`` (BED file mapping loci
            to drugs), ``dir`` (optional folder holding
            ``<sample>.results.json``), ``miss`` (threshold on a locus's
            ``missing_regions`` value above which its drugs are treated as
            ``NA``) and ``drugs`` (optional file restricting/ordering the
            report).

    Side effects: writes ``samples_not_found.txt``, ``results.json`` (sample
    lists per drug and outcome) and ``counts.json`` (the matching counts) to
    the working directory, and prints a tab-separated sensitivity /
    specificity table. Besides the individual drugs, the composite
    categories ``flq``, ``mdr``, ``xdr`` and ``sus`` are evaluated.
    """
    sample_file = args.samples
    dst_file = args.dst

    dst = load_dst(dst_file)
    drug_loci = pp.load_bed(args.bed, [6], 4)  # {'Rv0668': ('rifampicin')}
    with open(sample_file) as sample_fh:
        samples = [x.rstrip() for x in sample_fh]
    ext = ".results.json"
    drugs = [d.lower() for d in dst[samples[0]].keys()]
    categories = drugs + ["flq", "mdr", "xdr", "sus"]
    results = {d: {"tp": [], "tn": [], "fp": [], "fn": []} for d in categories}
    counts = {d: {"tp": 0, "tn": 0, "fp": 0, "fn": 0} for d in categories}
    pre = args.dir if args.dir else ""

    with open("samples_not_found.txt", "w") as FAIL:
        for s in tqdm(samples):
            # Without a directory, use a relative path; "%s/%s" with an empty
            # prefix would point at the filesystem root.
            res_file = "%s/%s%s" % (pre, s, ext) if pre else s + ext
            if pp.nofile(res_file):
                pp.log("Warning: %s does not exist!" % res_file)
                FAIL.write("%s\n" % s)
                continue
            with open(res_file) as res_fh:
                res = json.load(res_fh)

            # Drugs whose loci have too much missing data can't be assessed.
            na_drugs = set()
            for locus in drug_loci:
                if res["missing_regions"][locus] > args.miss:
                    na_drugs.update(drug_loci[locus][0].split(","))
            resistant_drugs = {d["drug"].lower() for d in res["dr_variants"]}

            sample_dst = dst[s]
            for d in drugs:
                if d in na_drugs:
                    sample_dst[d] = "NA"

            #### Individual drugs ####
            for d in drugs:
                _record_outcome(results, counts, d, sample_dst[d],
                                d in resistant_drugs, s)

            #### Phenotype per drug class ####
            flq_values = [sample_dst[d] for d in fluoroquinolones if d in sample_dst]
            amg_values = [sample_dst[d] for d in aminoglycosides if d in sample_dst]
            # "all NA" is also true when no drug of the class was tested.
            flq_all_na = all(v == "NA" for v in flq_values)
            amg_all_na = all(v == "NA" for v in amg_values)
            flq_resistant = "1" in flq_values
            amg_resistant = "1" in amg_values

            #### Fluoroquinolones ####
            # Discordant results within the class are treated as unknown.
            if flq_all_na or (flq_resistant and "0" in flq_values):
                dst_flq = "NA"
            else:
                dst_flq = "1" if flq_resistant else "0"
            gst_flq = any(d in resistant_drugs for d in fluoroquinolones)
            _record_outcome(results, counts, "flq", dst_flq, gst_flq, s)

            #### MDR & XDR ####
            dst_mdr = "1" if sample_dst["rifampicin"] == "1" and sample_dst[
                "isoniazid"] == "1" else "0"
            if sample_dst["rifampicin"] == "NA" or sample_dst["isoniazid"] == "NA":
                dst_mdr = "NA"
            dst_xdr = "1" if dst_mdr == "1" and flq_resistant and amg_resistant else "0"
            if flq_all_na or amg_all_na or dst_mdr == "NA":
                dst_xdr = "NA"

            gst_amg = any(d in resistant_drugs for d in aminoglycosides)
            gst_mdr = "rifampicin" in resistant_drugs and "isoniazid" in resistant_drugs
            gst_xdr = gst_mdr and gst_flq and gst_amg
            _record_outcome(results, counts, "mdr", dst_mdr, gst_mdr, s)
            _record_outcome(results, counts, "xdr", dst_xdr, gst_xdr, s)

            ### susceptibility
            # Here "positive" means pan-susceptible; only evaluated when every
            # first-line drug has a DST result.
            if "NA" not in [sample_dst[d] for d in first_line]:
                dst_sus = "1" if "1" not in [sample_dst[d] for d in drugs] else "0"
                gst_sus = all(x not in resistant_drugs for x in first_line)
                _record_outcome(results, counts, "sus", dst_sus, gst_sus, s)

    with open("results.json", "w") as out_fh:
        json.dump(results, out_fh)
    with open("counts.json", "w") as out_fh:
        json.dump(counts, out_fh)

    if args.drugs:
        with open(args.drugs) as drugs_fh:
            drugs = [x.rstrip().lower() for x in drugs_fh]
    else:
        drugs = list(counts.keys())
    print("Drug\tNum\tSusceptible\tResistant\tSensitivity\tSpecificity")
    for d in drugs:
        if d not in counts:
            continue
        c = counts[d]
        num_resistant = c["tp"] + c["fn"]
        num_susceptible = c["tn"] + c["fp"]
        # Sensitivity/specificity are undefined without both phenotypes.
        if num_resistant == 0 or num_susceptible == 0:
            continue
        sensitivity = c["tp"] / num_resistant
        specificity = c["tn"] / num_susceptible
        total = num_resistant + num_susceptible
        print("%s\t%s\t%s\t%s\t%s\t%s" %
              (d.capitalize(), total, num_susceptible, num_resistant,
               sensitivity, specificity))
def main(args):
    """Report sites whose nucleotide changed independently on several branches.

    Args:
        args: Parsed command-line namespace. The attributes read here are:
            ``vcf`` (input VCF), ``fasta`` (optional SNP alignment; built
            from the VCF with ``vcf2fasta.py`` when absent, which requires
            ``ref``), ``model`` and ``tree`` (passed to ``iqtree``) and
            ``out`` (output path).

    Ancestral states are taken from ``<fasta>.asr.state`` (``iqtree -asr`` is
    run if that file is missing). For every variable site, each node whose
    state differs from its parent's — with both states being A/C/G/T — counts
    as one origin. Sites with more than one origin are written to
    ``args.out`` as ``<position>\\t<number of origins>``.

    Exits with status 1 if neither ``--fasta`` nor ``--ref`` is available.
    """
    vcf_class = pp.vcf(args.vcf)
    vcf_positions = vcf_class.get_positions()

    if not args.fasta:
        if not args.ref:
            sys.stderr.write(
                "\nERROR: Please supply a reference with --ref\n\n")
            # quit() is an interactive-shell helper and exits with status 0.
            sys.exit(1)
        pp.run_cmd(
            "vcf2fasta.py --vcf %(vcf)s --snps --ref %(ref)s --snps-no-filt" %
            vars(args))
        args.fasta = "%s.snps.fa" % vcf_class.prefix
    if pp.nofile("%s.asr.state" % args.fasta):
        pp.run_cmd(
            "iqtree -m %(model)s -te %(tree)s -s %(fasta)s -nt AUTO -asr -pre %(fasta)s.asr"
            % vars(args))

    tree = ete3.Tree("%s.asr.treefile" % args.fasta, format=1)
    # NOTE(review): names are cut at "/" here but looked up with the full
    # n.name during traversal below — confirm node names never contain "/".
    node_names = set([tree.name] +
                     [n.name.split("/")[0] for n in tree.get_descendants()])
    leaf_names = set(tree.get_leaf_names())
    internal_node_names = node_names - leaf_names

    # states[site][node or sample name] -> nucleotide at that (1-based) site.
    states_file = "%s.asr.state" % args.fasta
    states = defaultdict(dict)
    sys.stderr.write("Loading states\n")
    with open(states_file) as states_fh:
        for l in tqdm(states_fh):
            if l[0] == "#":
                continue
            row = l.strip().split()
            if not row or row[0] == "Node":
                continue
            site = int(row[1])
            if row[0] not in internal_node_names:
                continue
            states[site][row[0]] = row[2]

    # Leaf states come straight from the alignment.
    seqs = pp.fasta(args.fasta).fa_dict
    for site in tqdm(list(states)):
        for sample in seqs:
            states[site][sample] = seqs[sample][site - 1]

    acgt = {"A", "C", "G", "T", "a", "c", "g", "t"}
    convergent_sites = []
    for site in tqdm(list(states)):
        site_states = states[site]
        nucleotides = {site_states[n] for n in node_names}
        if len(nucleotides) == 1:
            continue

        # Names of the nodes at which a new state arises.
        origins = []

        tree.add_feature("state", site_states[tree.name])
        # Parents are annotated before their children are visited, so the
        # parent's "state" feature for this site is always up to date.
        for n in tree.traverse():
            if n == tree:
                continue
            node_state = site_states[n.name]
            parent_state = n.get_ancestors()[0].state
            if (node_state != parent_state and node_state in acgt
                    and parent_state in acgt):
                origins.append(n.name)
            n.add_feature("state", node_state)
        if len(origins) > 1:
            convergent_sites.append((site, vcf_positions[site - 1], origins))

    with open(args.out, "w") as O:
        for site in convergent_sites:
            O.write("%s\t%s\n" % (site[1][1], len(site[2])))
Exemplo n.º 6
0
def main_profile(args):
    """Run the profiling pipeline for one sample and write its reports.

    Steps, in order: resolve the database configuration, create the output
    folders under ``args.dir``, map reads to the reference unless a BAM file
    was supplied, run ``pp.bam_profiler``, reformat the results with
    ``tbp.reformat``, write the JSON report plus the optional PDF / text /
    CSV reports, and move intermediate BAM / VCF files into the ``bam`` and
    ``vcf`` sub-folders.

    Args:
        args: Parsed command-line namespace. Note that for the "nanopore"
            platform ``args.mapper``, ``args.caller`` and ``args.no_trim``
            are overwritten in place.
    """
    #### Setup conf dictionary ###
    if args.db == "tbdb" and not args.external_db and pp.nofile(
            sys.base_prefix + "/share/tbprofiler/tbdb.fasta"):
        pp.log(
            "Can't find the tbdb file at %s. Please run 'tb-profiler update_tbdb' to load the default library or specify another using the '--external_db' flag"
            % sys.base_prefix,
            ext=True)
    if args.external_db:
        conf = get_conf_dict(args.external_db)
    else:
        conf = get_conf_dict(sys.base_prefix +
                             "/share/tbprofiler/%s" % args.db)

    ### Create folders for results if they don't exist ###
    if pp.nofolder(args.dir):
        os.mkdir(args.dir)

    for x in ["bam", "vcf", "results"]:
        if pp.nofolder(args.dir + "/" + x):
            os.mkdir(args.dir + "/" + x)

    ### Set up platform dependant parameters ###
    if args.platform == "nanopore":
        # Nanopore runs force the mapper/caller, skip trimming and never
        # run delly.
        args.mapper = "minimap2"
        args.caller = "bcftools"
        args.no_trim = True
        run_delly = False
    else:
        if args.no_delly:
            run_delly = False
        else:
            run_delly = True

    ### Setup prefix for files ###
    files_prefix = args.dir + "/" + args.prefix

    ### Create bam file if fastq has been supplied ###
    if args.bam == None:
        if args.read1 and args.read2 and args.no_trim:
            # Paired + no trimming
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read1 and args.read2 and not args.no_trim:
            # Paired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1, args.read2)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        elif args.read1 and not args.read2 and args.no_trim:
            # Unpaired + no trimming (args.read2 is falsy on this branch)
            fastq_obj = pp.fastq(args.read1, args.read2)
        elif args.read1 and not args.read2 and not args.no_trim:
            # Unpaired + trimming
            untrimmed_fastq_obj = pp.fastq(args.read1)
            fastq_obj = untrimmed_fastq_obj.trim(files_prefix,
                                                 threads=args.threads)
        else:
            exit("\nPlease provide a bam file or a fastq file(s)...Exiting!\n")
        bam_obj = fastq_obj.map_to_ref(ref_file=conf["ref"],
                                       prefix=files_prefix,
                                       sample_name=args.prefix,
                                       aligner=args.mapper,
                                       platform=args.platform,
                                       threads=args.threads)
        bam_file = bam_obj.bam_file
    else:
        bam_file = args.bam

    # NOTE(review): looks like leftover debug output — confirm it is wanted.
    print(args.delly_bcf_file)
    run_coverage = False if args.no_coverage else True
    ### Run profiling module from pathogen-profiler ###
    results = pp.bam_profiler(
        conf=conf,
        bam_file=bam_file,
        prefix=files_prefix,
        platform=args.platform,
        caller=args.caller,
        threads=args.threads,
        no_flagstat=args.no_flagstat,
        run_delly=run_delly,
        calling_params=args.calling_params,
        coverage_fraction_threshold=args.coverage_fraction_threshold,
        missing_cov_threshold=args.missing_cov_threshold,
        delly_bcf_file=args.delly_bcf_file)
    # Raw profiler output is saved in the working directory (not args.dir).
    json.dump(results, open(args.prefix + ".tmp_results.json", "w"))
    ### Reformat the results to TB-Profiler style ###
    results = tbp.reformat(results, conf, reporting_af=args.reporting_af)
    results["id"] = args.prefix
    results["tbprofiler_version"] = tbp._VERSION
    results["pipeline"] = {
        "mapper": args.mapper if not args.bam else "N/A",
        "variant_caller": args.caller
    }

    json_output = args.dir + "/results/" + args.prefix + ".results.json"
    tex_output = args.dir + "/results/" + args.prefix + ".results.tex"
    text_output = args.dir + "/results/" + args.prefix + ".results.txt"
    csv_output = args.dir + "/results/" + args.prefix + ".results.csv"

    json.dump(results, open(json_output, "w"))
    extra_columns = [x.lower() for x in args.add_columns.split(",")
                     ] if args.add_columns else []
    if args.pdf:
        # Render the LaTeX report, then remove the .tex/.aux/.log files.
        tbp.write_tex(results, conf, tex_output, extra_columns)
        pp.run_cmd("pdflatex %s" % tex_output, verbose=1)
        pp.rm_files([
            tex_output, args.dir + "/" + args.prefix + ".results.aux",
            args.dir + "/" + args.prefix + ".results.log"
        ])
    if args.txt:
        tbp.write_text(results,
                       conf,
                       text_output,
                       extra_columns,
                       reporting_af=args.reporting_af)
    if args.csv:
        tbp.write_csv(results, conf, csv_output, extra_columns)

    ### Move files to respective directories ###
    if not args.bam:
        pp.run_cmd("mv %(dir)s/%(prefix)s.bam* %(dir)s/bam/" % vars(args))
        if not args.no_trim:
            # Trimmed reads are intermediate files; delete them.
            pp.run_cmd("rm -f %s" % " ".join(fastq_obj.files))
    pp.run_cmd("mv -f %(dir)s/%(prefix)s*.vcf.gz* %(dir)s/vcf/" % vars(args))
    if run_delly and results["delly"] == "success" and not args.delly_bcf_file:
        pp.run_cmd("mv -f %(dir)s/%(prefix)s.delly.bcf* %(dir)s/vcf/" %
                   vars(args))

    ### Add meta data to results
    # NOTE(review): every report above has already been written, so these
    # "meta_*" entries are not saved by this function — confirm the intent.
    if args.meta:
        for row in csv.DictReader(open(args.meta)):
            if row["id"] == results["id"]:
                for col in row:
                    results["meta_" + col] = row[col]
    pp.log("Profiling finished sucessfully!")