def match(args):
    """Print the closest database match for every motif in ``args.pwmfile``.

    Query motifs are read from ``args.pwmfile`` and database motifs from
    ``args.dbpwmfile``.  For each query, the best hit returned by
    ``MotifComparer.get_closest_match`` is printed as a tab-separated row
    (query id, matched id, score, p-value).

    If ``args.img`` is set, every query/hit pair is aligned before plotting:
    the hit is reverse-complemented when the reported orientation is -1, and
    either the query or the hit is left-padded with uniform columns depending
    on the sign of the reported position.  The pairs are then passed to
    ``match_plot`` together with ``args.img``.
    """
    queries = dict((m.id, m) for m in pwmfile_to_motifs(args.pwmfile))
    database = dict((m.id, m) for m in pwmfile_to_motifs(args.dbpwmfile))

    comparer = MotifComparer()
    best_hits = comparer.get_closest_match(
        queries.values(), database.values(), "partial", "wic", "mean")

    print("Motif\tMatch\tScore\tP-value")
    for query_id, hit in best_hits.items():
        pval, pos, orient = comparer.compare_motifs(
            queries[query_id], database[hit[0]],
            "partial", "wic", "mean", pval=True)
        print("%s\t%s\t%0.2f\t%0.3e" % (query_id, hit[0], hit[1][0], pval))

    if not args.img:
        return

    pairs = []
    for query_id, hit in best_hits.items():
        motif = queries[query_id]
        dbmotif = database[hit[0]]
        pval, pos, orient = comparer.compare_motifs(
            motif, dbmotif, "partial", "wic", "mean", pval=True)

        if orient == -1:
            # Plot the hit in the orientation that matched; keep its id.
            kept_id = dbmotif.id
            dbmotif = dbmotif.rc()
            dbmotif.id = kept_id

        if pos < 0:
            # Negative position: shift the query right with uniform columns.
            kept_id = motif.id
            motif = Motif([[0.25, 0.25, 0.25, 0.25]] * -pos + motif.pwm)
            motif.id = kept_id
        elif pos > 0:
            # Positive position: shift the database motif instead.
            kept_id = dbmotif.id
            dbmotif = Motif([[0.25, 0.25, 0.25, 0.25]] * pos + dbmotif.pwm)
            dbmotif.id = kept_id

        pairs.append((motif, dbmotif, pval))

    match_plot(pairs, args.img)
def match(args):
    """Report, for each input motif, its closest match in a motif database.

    ``args.pwmfile`` supplies the query motifs and ``args.dbpwmfile`` the
    database.  One tab-separated line (query id, match id, score, p-value)
    is printed per query.  When ``args.img`` is truthy the matched pairs are
    aligned (reverse complement for orientation -1, uniform left padding
    according to the reported position) and sent to ``match_plot``.
    """
    def _left_pad(motif, ncols):
        # Prepend ``ncols`` uniform columns while keeping the original id.
        kept_id = motif.id
        padded = Motif([[0.25, 0.25, 0.25, 0.25]] * ncols + motif.pwm)
        padded.id = kept_id
        return padded

    sample = dict((m.id, m) for m in pwmfile_to_motifs(args.pwmfile))
    db = dict((m.id, m) for m in pwmfile_to_motifs(args.dbpwmfile))

    mc = MotifComparer()
    result = mc.get_closest_match(
        sample.values(), db.values(), "partial", "wic", "mean")

    # Single-argument print(...) is identical to the Python 2 print statement.
    print("Motif\tMatch\tScore\tP-value")
    for query_id, best in result.items():
        pval, pos, orient = mc.compare_motifs(
            sample[query_id], db[best[0]], "partial", "wic", "mean", pval=True)
        print("%s\t%s\t%0.2f\t%0.3e" % (query_id, best[0], best[1][0], pval))

    if args.img:
        plotdata = []
        for query_id, best in result.items():
            motif = sample[query_id]
            dbmotif = db[best[0]]
            pval, pos, orient = mc.compare_motifs(
                motif, dbmotif, "partial", "wic", "mean", pval=True)

            if orient == -1:
                kept_id = dbmotif.id
                dbmotif = dbmotif.rc()
                dbmotif.id = kept_id

            if pos < 0:
                motif = _left_pad(motif, -pos)
            elif pos > 0:
                dbmotif = _left_pad(dbmotif, pos)

            plotdata.append((motif, dbmotif, pval))
        match_plot(plotdata, args.img)
def determine_significant_motifs(self, background=["random"], organism="hg18", width=200, pvalue_cutoff=0.001, enrichment_cutoff=1.5):
    """Filter the predicted motifs on enrichment and write the survivors.

    Enrichment of ``self.predicted_pwm`` on the validation set is calculated
    against every background type in *background*.  The motif ids are then
    passed through ``self.filter_motifs`` once per background, and the
    motifs whose id survives are written to ``self.significant_pwm``
    (``to_pwm`` output) and ``self.significant_pfm`` (``to_pfm`` output).

    Parameters
    ----------
    background : list of str
        Keys into ``self.bg_file``.  The default list is only read here,
        never modified.
    organism, width :
        Not used by this method; kept so existing callers keep working.
    pvalue_cutoff, enrichment_cutoff : float
        Thresholds forwarded to ``self.filter_motifs``.

    Returns
    -------
    int
        Number of motif ids left after filtering.
    """
    fg = [self.validation_fa, self.validation_gff]
    bg = [
        [self.bg_file["fa"][bg_type],
         self.bg_file["gff"][bg_type],
         self.bg_file["enrichment"][bg_type]]
        for bg_type in background
    ]
    self.calculate_enrichment(self.predicted_pwm, fg, bg)

    self.logger.info("Determining significant motifs")
    self.logger.info("Thresholds: enrichment >= %s; p-value <= %s" % (enrichment_cutoff, pvalue_cutoff))

    all_motifs = pwmfile_to_motifs(self.predicted_pwm)
    filt_ids = [x.id for x in all_motifs]
    for bg_id in background:
        filt_ids = self.filter_motifs(
            filt_ids, self.bg_file["enrichment"][bg_id],
            enrichment_cutoff, pvalue_cutoff)

    # Context managers guarantee both outputs are closed even if a write
    # or the motif parser raises half-way through.
    with open(self.significant_pwm, "w") as pwm_out:
        with open(self.significant_pfm, "w") as pfm_out:
            # NOTE(review): ids come from predicted_pwm, but the motifs that
            # are written are re-read from predicted_pfm, as in the original.
            for motif in pwmfile_to_motifs(self.predicted_pfm):
                if motif.id in filt_ids:
                    pwm_out.write("%s\n" % motif.to_pwm())
                    pfm_out.write("%s\n" % motif.to_pfm())

    self.logger.info("%s motifs are significant, written to %s" % (len(filt_ids), self.significant_pwm))
    return len(filt_ids)
def _calc_report_values(self, pwm, background):
    """Collect the per-background statistics used by the reports.

    Fills four attributes, each a dict keyed by background name and then by
    motif id: ``self.p`` and ``self.e`` (parsed from the cluster enrichment
    file of each background; third column goes to ``p``, sixth to ``e``)
    and ``self.auc`` and ``self.mncp`` (returned by ``self._roc_metrics``).
    Finally ``self.closest_match`` is set from
    ``self.determine_closest_match`` on the motifs in *pwm*.

    Parameters
    ----------
    pwm : str
        Motif file passed to ``_roc_metrics`` and ``pwmfile_to_motifs``.
    background : iterable of str
        Background names; each must be one of "random", "genomic" or
        "genomic_matched".
    """
    self.p = dict((b, {}) for b in background)
    self.e = dict((b, {}) for b in background)

    e_files = {
        "random": self.bg_random_cluster_enrichment,
        "genomic": self.bg_genomic_cluster_enrichment,
        "genomic_matched": self.bg_genomic_matched_cluster_enrichment,
    }

    for bg in self.p.keys():
        # ``with`` closes the handle; the original left it to the GC.
        with open(e_files[bg]) as handle:
            for line in handle:
                # Skip comment lines and the column header.
                if line.startswith("#") or line.startswith("Motif\tSig"):
                    continue
                vals = line.strip().split("\t")
                self.p[bg][vals[0]] = float(vals[2])
                self.e[bg][vals[0]] = float(vals[5])

    self.auc = dict((b, {}) for b in background)
    self.mncp = dict((b, {}) for b in background)

    rocs = {
        "random": [self.bg_random_fa, self.cluster_bg_random_roc_metrics],
        "genomic": [self.bg_genomic_fa, self.cluster_bg_genomic_roc_metrics],
        "genomic_matched": [self.bg_genomic_matched_fa, self.cluster_bg_genomic_matched_roc_metrics],
    }

    for bg in self.auc.keys():
        bg_fasta_file, roc_file = rocs[bg]
        self.auc[bg], self.mncp[bg] = self._roc_metrics(
            pwm, self.validation_fa, bg_fasta_file, roc_file)

    motifs = pwmfile_to_motifs(pwm)
    self.closest_match = self.determine_closest_match(motifs)
def scan_fasta_file_with_motifs(fastafile, motiffile, threshold, gfffile):
    """Scan *fastafile* with each motif in *motiffile*, appending to *gfffile*.

    Every motif is scanned with ``pwm_scan_to_gff`` using ``nreport=1``,
    *threshold* converted to ``float`` as the cutoff, and ``append=True``.
    """
    # Imports are local to the function, as in the original.
    from gimmemotifs.fasta import Fasta
    from gimmemotifs.motif import pwmfile_to_motifs

    all_motifs = pwmfile_to_motifs(motiffile)
    sequences = Fasta(fastafile)
    for current in all_motifs:
        current.pwm_scan_to_gff(
            sequences,
            gfffile,
            nreport=1,
            cutoff=float(threshold),
            append=True,
        )
def threshold(args):
    """Print, per motif, the score and relative cutoff at ``args.fdr``.

    Exits with status 1 when ``args.fdr`` lies outside [0, 1].  Otherwise the
    rows yielded by ``Scanner.best_score(args.inputfile)`` are collected and
    transposed, so that row *i* of the transposed table is paired with
    ``motifs[i]``.  For each motif the score at percentile
    ``100 - 100 * fdr`` is printed together with that score rescaled between
    the motif's minimum and maximum PWM score.  Motifs without any score
    produce a warning on stderr instead.
    """
    if args.fdr < 0 or args.fdr > 1:
        print("Please specify a FDR between 0 and 1")
        sys.exit(1)

    motifs = pwmfile_to_motifs(args.pwmfile)

    s = Scanner()
    s.set_motifs(args.pwmfile)

    # Materialise the generator directly instead of a manual append loop.
    score_table = list(s.best_score(args.inputfile))

    print("Motif\tScore\tCutoff")
    for i, scores in enumerate(np.array(score_table).transpose()):
        motif = motifs[i]
        min_score = motif.pwm_min_score()
        if len(scores) > 0:
            opt_score = scoreatpercentile(scores, 100 - (100 * args.fdr))
            # Express the score as a fraction of the motif's score range.
            cutoff = (opt_score - min_score) / (motif.pwm_max_score() - min_score)
            print("{0}\t{1}\t{2}".format(motif.id, opt_score, cutoff))
        else:
            sys.stderr.write("Warning: no matches for {0}\n".format(motif.id))
def threshold(args):
    """Report the score cutoff of every motif for a given false discovery rate.

    ``args.fdr`` must be between 0 and 1 (inclusive); otherwise a message is
    printed and the process exits with status 1.  The best scores produced
    by ``Scanner.best_score`` for ``args.inputfile`` are gathered into a
    table, transposed, and row *i* is evaluated for ``motifs[i]``: the score
    at percentile ``100 - 100 * fdr`` and the same score as a fraction of the
    motif's PWM score range are printed tab-separated.  A warning goes to
    stderr for a motif whose score row is empty.
    """
    if args.fdr < 0 or args.fdr > 1:
        print("Please specify a FDR between 0 and 1")
        sys.exit(1)

    motifs = pwmfile_to_motifs(args.pwmfile)

    scanner = Scanner()
    scanner.set_motifs(args.pwmfile)
    per_sequence = list(scanner.best_score(args.inputfile))
    per_motif = np.array(per_sequence).transpose()

    percentile = 100 - (100 * args.fdr)

    print("Motif\tScore\tCutoff")
    for idx, motif_scores in enumerate(per_motif):
        motif = motifs[idx]
        lowest = motif.pwm_min_score()
        if len(motif_scores) > 0:
            opt_score = scoreatpercentile(motif_scores, percentile)
            cutoff = (opt_score - lowest) / (motif.pwm_max_score() - lowest)
            print("{0}\t{1}\t{2}".format(motif.id, opt_score, cutoff))
        else:
            sys.stderr.write("Warning: no matches for {0}\n".format(motif.id))
def _calc_report_values(self, pwm, background):
    """Gather enrichment and ROC statistics for every background.

    For each name in *background* the matching file in
    ``self.bg_file["cluster_enrichment"]`` is parsed (third column stored in
    ``self.p``, sixth in ``self.e``, both keyed by the first column) and
    ``self._roc_metrics`` is run against ``self.bg_file["fa"]`` /
    ``self.bg_file["roc"]`` to fill ``self.auc`` and ``self.mncp``.
    ``self.closest_match`` is then computed for the motifs in *pwm*.
    """
    self.logger.info("Calculating final statistics for report")

    self.p = dict((b, {}) for b in background)
    self.e = dict((b, {}) for b in background)

    e_files = dict((bg, self.bg_file["cluster_enrichment"][bg]) for bg in background)

    for bg in self.p.keys():
        # ``with`` closes the handle; the original left it to the GC.
        with open(e_files[bg]) as handle:
            for line in handle:
                # Skip comment lines and the column header.
                if line.startswith("#") or line.startswith("Motif\tSig"):
                    continue
                vals = line.strip().split("\t")
                self.p[bg][vals[0]] = float(vals[2])
                self.e[bg][vals[0]] = float(vals[5])

    self.auc = dict((b, {}) for b in background)
    self.mncp = dict((b, {}) for b in background)

    rocs = dict(
        (bg, [self.bg_file["fa"][bg], self.bg_file["roc"][bg]])
        for bg in background
    )

    for bg in self.auc.keys():
        bg_fasta_file, roc_file = rocs[bg]
        self.auc[bg], self.mncp[bg] = self._roc_metrics(
            pwm, self.validation_fa, bg_fasta_file, roc_file)

    motifs = pwmfile_to_motifs(pwm)
    self.closest_match = self.determine_closest_match(motifs)
def location(args):
    """
    Creates histogram of motif location.

    Parameters
    ----------
    args : argparse object
        Command line arguments.
    """
    fastafile = args.fastafile
    pwmfile = args.pwmfile

    lwidth = args.width
    if not lwidth:
        # No width given: use the length of the second field of the first
        # item returned by Fasta.items() (presumably the first sequence).
        lwidth = len(Fasta(fastafile).items()[0][1])

    motifs = pwmfile_to_motifs(pwmfile)
    if args.ids:
        wanted = args.ids.split(",")
    else:
        wanted = [motif.id for motif in motifs]

    # ``pool`` is not defined in this function; it is expected to exist at
    # module level.
    pending = [
        pool.apply_async(
            motif_localization,
            (fastafile, motif, lwidth,
             os.path.join("%s_histogram" % motif.id), args.cutoff))
        for motif in motifs
        if motif.id in wanted
    ]

    # Block until every worker has finished (and re-raise worker errors).
    for job in pending:
        job.get()
def create_roc_plots(self, pwm_file, fg_fasta, bg_fasta, name):
    """Save one ROC curve image per motif in *pwm_file*.

    A ``get_roc_values`` job is submitted to ``self.job_server()`` for every
    motif; the resulting ``(error, x, y)`` tuples are plotted and saved as
    ``<imgdir>/<motif id>_<name>_roc.png``.  If a job reports an error it is
    logged and the process exits with status 1.

    Changes from the previous implementation: each figure is closed after
    saving (figures used to accumulate, one per motif), the unused
    ``fig.patch`` lookup with its bare ``except`` is gone, the colour is
    computed once, and the ``xlim``/``ylim`` calls that were immediately
    overridden by ``plt.axis([0, 1, 0, 1])`` are dropped.
    """
    motifs = dict((m.id, m) for m in pwmfile_to_motifs(pwm_file))

    jobs = {}
    for motif_id, motif in motifs.items():
        jobs[motif_id] = self.job_server().submit(
            get_roc_values, (motif, fg_fasta, bg_fasta,))

    roc_img_file = os.path.join(self.imgdir, "%s_%s_roc.png")

    # The original built an 11-entry colour table per figure and always used
    # entry 1, i.e. cm.Paired(256 / 11) with Python 2 integer division.
    line_color = cm.Paired(256 // 11)

    for motif_id in motifs.keys():
        error, x, y = jobs[motif_id]()
        if error:
            self.logger.error("Error in thread: %s" % error)
            sys.exit(1)

        fig = plt.figure()
        try:
            plt.plot(x, y, color=line_color)
            plt.axis([0, 1, 0, 1])
            plt.xlabel("1 - Specificity")
            plt.ylabel("Sensitivity")
            plt.savefig(roc_img_file % (motif_id, name), format="png")
        finally:
            # Release the figure so memory does not grow with motif count.
            plt.close(fig)
def location(args):
    """Start a ``motif_localization`` job for each selected motif and wait.

    Motifs come from ``args.pwmfile``; when ``args.ids`` is given (comma
    separated) only motifs with a listed id are processed.  The width is
    ``args.width`` or, if that is falsy, the length of the second field of
    the first item of ``Fasta(args.fastafile).items()``.  Output names are
    ``<motif id>_histogram``.
    """
    fastafile = args.fastafile
    pwmfile = args.pwmfile
    lwidth = args.width

    if not lwidth:
        fasta = Fasta(fastafile)
        first_item = fasta.items()[0]
        lwidth = len(first_item[1])
        fasta = None

    motifs = pwmfile_to_motifs(pwmfile)
    selected_ids = args.ids.split(",") if args.ids else [motif.id for motif in motifs]

    jobs = []
    for motif in motifs:
        if motif.id not in selected_ids:
            continue
        outfile = os.path.join("%s_histogram" % motif.id)
        job_args = (fastafile, motif, lwidth, outfile, args.cutoff)
        # ``pool`` is expected to be defined at module level.
        jobs.append(pool.apply_async(motif_localization, job_args))

    for job in jobs:
        job.get()
def create_roc_plots(self, pwm_file, fg_fasta, bg_fasta, name):
    """Run ``motif_roc.py`` once per motif in *pwm_file* to draw ROC plots.

    Each invocation receives the motif file (``-p``), the foreground and
    background FASTA files (``-s`` / ``-b``), the output name
    ``<imgdir>/<motif id>_<name>_roc`` (``-o``), the motif id (``-i``) and
    the ``-l`` flag.  Calls run one after another; the exit status is not
    inspected, as before.

    The command is now passed as an argument list rather than a formatted
    shell string, so ids or paths with spaces or shell metacharacters reach
    the script unchanged.
    """
    roc_img_file = os.path.join(self.imgdir, "%s_%s_roc")

    for motif in pwmfile_to_motifs(pwm_file):
        cmd = [
            "motif_roc.py",
            "-p", str(pwm_file),
            "-s", str(fg_fasta),
            "-b", str(bg_fasta),
            "-o", roc_img_file % (motif.id, name),
            "-i", str(motif.id),
            "-l",
        ]
        # No shell involved: each list element is exactly one argument.
        p = Popen(cmd)
        p.communicate()
def _run_program(self, bin, fastafile, savedir, params={}):
    """Return the bundled JASPAR motifs; no external program is executed.

    ``bin``, ``fastafile``, ``savedir`` and ``params`` are accepted but not
    used.  Motifs are loaded from ``JASPAR2010_vertebrate.pwm`` in the
    configured motif directory and every id is prefixed with ``JASPAR_``.

    Returns
    -------
    tuple
        ``(motifs, "", "")``; the two strings are always empty here.
    """
    from gimmemotifs.motif import pwmfile_to_motifs
    import os

    motif_dir = self.config.get_motif_dir()
    jaspar_file = os.path.join(motif_dir, "JASPAR2010_vertebrate.pwm")

    jaspar_motifs = pwmfile_to_motifs(jaspar_file)
    for entry in jaspar_motifs:
        entry.id = "JASPAR_%s" % entry.id

    return jaspar_motifs, "", ""
def scan_fasta_file_with_motifs(fastafile, motiffile, threshold, gfffile, scan_rc=True, nreport=1):
    """Scan *fastafile* with every motif in *motiffile*, appending to *gfffile*.

    Parameters
    ----------
    fastafile, motiffile, gfffile : str
        Input FASTA, input motif file and output file (opened in append
        mode by ``pwm_scan_to_gff``).
    threshold :
        Converted with ``float`` and used as the scan cutoff.
    scan_rc, nreport :
        Forwarded unchanged to ``pwm_scan_to_gff``.

    Returns
    -------
    Exception or None
        The exception raised while scanning, or ``None`` on success.  The
        error used to be captured but then dropped, so a failed scan was
        indistinguishable from a successful one.
    """
    error = None
    try:
        motifs = pwmfile_to_motifs(motiffile)
        fa = Fasta(fastafile)
        for motif in motifs:
            motif.pwm_scan_to_gff(
                fa, gfffile,
                nreport=nreport,
                cutoff=float(threshold),
                scan_rc=scan_rc,
                append=True)
    except Exception as e:
        # Deliberately broad: the error is handed back to the caller rather
        # than raised.
        error = e
    return error
def maxenr(args):
    """Print the maximum enrichment of each motif, sample versus background.

    ``args.sample`` and ``args.background`` must exist, otherwise a message
    is printed and ``exit(1)`` is called.  Scores for the selected motifs
    (``args.ids``, comma separated, or all motifs in ``args.pwmfile``) are
    computed with ``get_scores`` through the module-level ``pool``.  An
    unknown id prints a message and calls ``sys.exit()``.

    For each motif one tab-separated row is printed: id, number of sample
    scores at or above the best threshold, the maximum enrichment, the score
    at which it occurs, and that score as a fraction of the motif's PWM
    score range.  Only thresholds matched by more than one background score
    are considered.
    """
    for required in (args.sample, args.background):
        if not os.path.exists(required):
            print("File %s does not exist!" % required)
            exit(1)

    fg_file = args.sample
    bg_file = args.background

    motifs = dict((x.id, x) for x in pwmfile_to_motifs(args.pwmfile))
    ids = args.ids.split(",") if args.ids else motifs.keys()

    fg_jobs = {}
    bg_jobs = {}
    for motif_id in ids:
        if motif_id not in motifs:
            print("Wrong id: %s" % motif_id)
            sys.exit()
        bg_jobs[motif_id] = pool.apply_async(get_scores, (motifs[motif_id], bg_file))
        fg_jobs[motif_id] = pool.apply_async(get_scores, (motifs[motif_id], fg_file))

    print("Motif\t# matches\tMax. enrichment\tScore\tCutoff")
    for motif_id in ids:
        pos = array(fg_jobs[motif_id].get())
        neg = array(bg_jobs[motif_id].get())

        # Corrects the ratio for unequal numbers of sample/background scores.
        factor = len(neg) / float(len(pos))

        scores = array([s for s in hstack((pos, neg)) if sum(neg >= s) > 1])
        enr = array([
            (sum(pos >= x) / float(sum(neg >= x))) * factor
            for x in scores
        ])

        best_score = scores[enr.argmax()]
        lowest = motifs[motif_id].pwm_min_score()
        highest = motifs[motif_id].pwm_max_score()
        cutoff = (best_score - lowest) / (highest - lowest)

        print("%s\t%s\t%0.2f\t%0.2f\t%0.3f" % (
            motif_id,
            sum(pos >= best_score),
            max(enr),
            best_score,
            cutoff,
        ))
def _cluster_motifs(self, pfm_file, cluster_pwm, dir, threshold):
    """Cluster the motifs in *pfm_file*; write images, HTML report and PWMs.

    A single input motif forms its own cluster; otherwise ``cluster_motifs``
    builds a tree and ``tree.getResult()`` supplies ``(cluster, members)``
    pairs.  Every cluster motif is trimmed and drawn into ``self.imgdir``.
    For clusters with more than one member, each member is compared to the
    cluster motif, reverse-complemented unless the reported strand is 1 or
    "+", and drawn with a left offset relative to the smallest reported
    position.  The kid template ``cluster_template_v2.kid`` is rendered to
    ``self.cluster_report`` and the clustered motifs go to *cluster_pwm*.

    Parameters
    ----------
    pfm_file : str
        Motif file to cluster.
    cluster_pwm : str
        Output file for the clustered motifs.
    dir :
        Not used by this method.
    threshold :
        Converted with ``float`` and passed to ``cluster_motifs``.

    Returns
    -------
    list
        The ``(cluster, members)`` pairs.
    """
    self.logger.info("Clustering significant motifs.")

    trim_ic = 0.2
    motifs = pwmfile_to_motifs(pfm_file)
    if len(motifs) == 1:
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(pfm_file, "total", "wic", "mean", True, threshold=float(threshold), include_bg=True)
        clusters = tree.getResult()

    ids = []
    mc = MotifComparer()

    for cluster, members in clusters:
        cluster.trim(trim_ic)
        cluster.to_img(os.path.join(self.imgdir, "%s.png" % cluster.id), format="PNG")
        ids.append([cluster.id, {"src": "images/%s.png" % cluster.id}, []])

        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(cluster, motif, "total", "wic", "mean", pval=True)

            # Smallest reported position; the original sorted with a
            # Python-2-only cmp function just to take the first element.
            add_pos = min(result[1] for result in scores.values())

            for motif in members:
                score, pos, strand = scores[motif]
                add = pos - add_pos

                if strand not in [1, "+"]:
                    # Draw the reverse complement under the original id.
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc

                motif.to_img(
                    os.path.join(self.imgdir, "%s.png" % motif.id.replace(" ", "_")),
                    format="PNG", add_left=add)

        ids[-1][2] = [
            {"src": "images/%s.png" % motif.id.replace(" ", "_"),
             "alt": motif.id.replace(" ", "_")}
            for motif in members
        ]

    kid.enable_import()
    template_file = os.path.join(self.config.get_template_dir(), "cluster_template_v2.kid")
    template = kid.Template(
        file=template_file, expname=self.name, motifs=ids,
        inputfile=self.inputfile,
        date=datetime.today().strftime("%d/%m/%Y"),
        version=GM_VERSION)

    # ``with`` closes the outputs even when serialisation or a write fails.
    with open(self.cluster_report, "w") as f:
        f.write(template.serialize())

    with open(cluster_pwm, "w") as f:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            # Single-motif case: no tree exists, write the motif itself.
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())

    self.logger.info("Clustering done. See the result in %s" % self.cluster_report)
    return clusters
def logo(args):
    """Draw a PNG logo for the motifs in ``args.pwmfile``.

    When ``args.ids`` is given (comma separated) only motifs whose id is
    listed are drawn.  Each motif is rendered via ``to_img`` with its own id
    as the output name and ``fmt="PNG"``.
    """
    selected = pwmfile_to_motifs(args.pwmfile)

    if args.ids:
        wanted = args.ids.split(",")
        selected = [motif for motif in selected if motif.id in wanted]

    for motif in selected:
        motif.to_img(motif.id, fmt="PNG")
def _run_program(self, bin, fastafile, savedir="", params={}):
    """Run trawler on *fastafile* and return the motifs it produced.

    Parameters
    ----------
    bin : str
        Command used to start trawler.
    fastafile : str
        Input FASTA; it is copied to a temporary file in ``self.tmpdir`` and
        the copy is what trawler receives.
    savedir : str
        Accepted for interface compatibility; not used.
    params : dict
        Overrides for the defaults ``{"single": False, "background": None}``.
        A background file is mandatory: without one a message is printed and
        ``sys.exit()`` is called.  *params* itself is not modified.

    Returns
    -------
    tuple
        ``(motifs, stdout, stderr)``.  ``motifs`` is empty when the expected
        result file is missing; ids are prefixed with ``self.name``.

    Changes from the previous implementation: the working directory is
    restored in a ``finally`` block, the temporary file handle is closed
    right after creation, and the result path is built only once.
    """
    import os
    import shutil
    import tempfile
    from subprocess import Popen, PIPE

    default_params = {"single": False, "background": None}
    default_params.update(params)

    trawler = bin

    fastafile = os.path.abspath(fastafile)
    if not default_params["background"]:
        print("Background file needed!")
        sys.exit()
    bgfile = os.path.abspath(default_params["background"])

    # Only the name is needed; close the handle so it is not leaked.
    tmp = tempfile.NamedTemporaryFile(dir=self.tmpdir, delete=False)
    tmp.close()
    shutil.copy(fastafile, tmp.name)
    fastafile = tmp.name

    stdout = ""
    stderr = ""

    strand = "single" if default_params["single"] else "double"
    cmd = "%s -sample %s -background %s -directory %s -strand %s" % (trawler, fastafile, bgfile, self.tmpdir, strand)

    current_path = os.getcwd()
    os.chdir(self.dir())
    try:
        p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
        out, err = p.communicate()
    finally:
        # Always return to the caller's directory, even if the launch fails.
        os.chdir(current_path)
    stdout += out
    stderr += err

    motifs = []
    # NOTE(review): takes the last "tmp*" entry in listing order, which is
    # not guaranteed to be trawler's output directory -- confirm.
    out_name = [entry for entry in os.listdir(self.tmpdir) if entry.startswith("tmp")][-1]
    out_file = os.path.join(self.tmpdir, out_name, "result", "%s.pwm" % out_name)
    if os.path.exists(out_file):
        motifs = pwmfile_to_motifs(out_file)

    # remove temporary files
    if os.path.exists(tmp.name):
        os.unlink(tmp.name)

    for motif in motifs:
        motif.id = "%s_%s" % (self.name, motif.id)

    return motifs, stdout, stderr
def pwmscan(args):
    """Scan ``args.inputfile`` with the motifs in ``args.pwmfile`` and print hits.

    Three output modes:

    * ``args.table``: a tab-separated table with one row per sequence id and
      one column per motif holding the number of matches.
    * ``args.bed``: one six-column line per match.  If the sequence id looks
      like ``chrom:start-end`` the match position is offset by ``start`` and
      reported on ``chrom``; otherwise the id and raw position are used.
    * otherwise: one GFF-style line per match with a 1-based start and the
      matched subsequence in the attributes.
    """
    inputfile = args.inputfile
    bed = args.bed

    motifs = pwmfile_to_motifs(args.pwmfile)
    scan_results = scan_it(inputfile, motifs, args.cutoff, args.nreport, args.scan_rc)

    region = re.compile(r'([^\s:]+):(\d+)-(\d+)')
    fa = Fasta(inputfile)

    if args.table:
        counts_by_seq = dict((seq_id, {}) for seq_id in fa.ids)
        for motif, hits in scan_results:
            for seq_id, matches in hits.items():
                counts_by_seq[seq_id][motif] = len(matches)

        print("\t{}".format("\t".join([m.id for m in motifs])))
        for seq_id in fa.ids:
            counts = [counts_by_seq[seq_id].get(m, 0) for m in motifs]
            print("{}\t{}".format(seq_id, "\t".join([str(x) for x in counts])))
        return

    strandmap = {-1: "-", 1: "+"}
    for motif, hits in scan_results:
        for seq_id, matches in hits.items():
            for (pos, score, strand) in matches:
                if bed:
                    # Default: report relative to the sequence id itself.
                    chrom, offset = seq_id, 0
                    m = region.search(seq_id)
                    if m:
                        chrom = m.group(1)
                        offset = int(m.group(2))
                    print("%s\t%s\t%s\t%s\t%s\t%s" % (
                        chrom,
                        offset + pos,
                        offset + pos + len(motif),
                        motif.id,
                        score,
                        strandmap[strand]))
                else:
                    print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tmotif_name \"%s\" ; motif_instance \"%s\"" % (
                        seq_id,
                        "pwmscan",
                        "misc_feature",
                        pos + 1,  # GFF is 1-based
                        pos + len(motif),
                        score,
                        strandmap[strand],
                        ".",
                        motif.id,
                        fa[seq_id][pos: pos + len(motif)]
                    ))
def scan_fasta_file_with_motifs(fastafile, motiffile, threshold, gfffile, scan_rc=True):
    """Scan *fastafile* with every motif in *motiffile*, appending to *gfffile*.

    Each motif is scanned with ``pwm_scan_to_gff`` using ``nreport=1``,
    ``float(threshold)`` as cutoff, the given *scan_rc* and ``append=True``.

    Returns
    -------
    Exception or None
        The exception raised during import or scanning, or ``None`` on
        success.  The error used to be captured but never returned, so
        failures went unnoticed by callers.
    """
    error = None
    try:
        # Imported here so that an import failure is reported like any
        # other error instead of propagating.
        from gimmemotifs.fasta import Fasta
        from gimmemotifs.motif import pwmfile_to_motifs

        motifs = pwmfile_to_motifs(motiffile)
        fa = Fasta(fastafile)
        for motif in motifs:
            motif.pwm_scan_to_gff(
                fa, gfffile,
                nreport=1,
                cutoff=float(threshold),
                scan_rc=scan_rc,
                append=True)
    except Exception as e:
        error = e
    return error
def _create_report(self, pwm, background):
    """Render the HTML motif report to ``self.motif_report``.

    The image of every closest-match database motif is drawn first.  The
    motifs in *pwm* are then ordered by decreasing MNCP (using the
    "genomic_matched" background when present, otherwise "random") and
    turned into plain attribute containers for the kid template
    ``report_template_v2.kid``.

    Parameters
    ----------
    pwm : str
        Motif file to report on.
    background : container of str
        Background names; only "random" and "genomic_matched" are reported.

    Relies on ``self.e``, ``self.p``, ``self.auc``, ``self.mncp`` and
    ``self.closest_match`` having been filled beforehand.
    """
    class ReportMotif:
        pass

    motifs = pwmfile_to_motifs(pwm)
    for match in self.closest_match.values():
        match[0].to_img(os.path.join(self.imgdir, "%s.png" % match[0].id), format="PNG")

    random = "random" in background
    genomic = "genomic_matched" in background

    sort_key = "genomic_matched" if genomic else "random"

    roc_img_file = "%s_%s_roc"

    def _roc_png(motif_id, bg_name):
        # Relative path of the ROC image as referenced from the report.
        return "images/" + os.path.basename(roc_img_file % (motif_id, bg_name)) + ".png"

    report_motifs = []
    # key/reverse replaces the Python-2-only cmp argument; the sort stays
    # stable, so motifs with equal MNCP keep their input order as before.
    for motif in sorted(motifs, key=lambda m: self.mncp[sort_key][m.id], reverse=True):
        rm = ReportMotif()
        rm.id = motif.id
        rm.id_href = {"href": "#%s" % motif.id}
        rm.id_name = {"name": motif.id}
        rm.img = {"src": os.path.join("images", "%s.png" % motif.id)}

        rm.consensus = motif.to_consensus()

        if random:
            rm.random_e = "%0.2f" % self.e["random"][motif.id]
            rm.random_p = "%0.2f" % self.p["random"][motif.id]
            rm.random_auc = "%0.3f" % self.auc["random"][motif.id]
            rm.random_mncp = "%0.3f" % self.mncp["random"][motif.id]
            rm.random_roc_img = {"src": _roc_png(motif.id, "random")}
            rm.random_roc_img_link = {"href": _roc_png(motif.id, "random")}
        if genomic:
            rm.genomic_e = "%0.2f" % self.e["genomic_matched"][motif.id]
            rm.genomic_p = "%0.2f" % self.p["genomic_matched"][motif.id]
            rm.genomic_auc = "%0.3f" % self.auc["genomic_matched"][motif.id]
            rm.genomic_mncp = "%0.3f" % self.mncp["genomic_matched"][motif.id]
            rm.genomic_roc_img = {"src": _roc_png(motif.id, "genomic_matched")}
            rm.genomic_roc_img_link = {"href": _roc_png(motif.id, "genomic_matched")}

        rm.histogram_img = {"data": "images/%s_histogram.svg" % motif.id}
        rm.histogram_link = {"href": "images/%s_histogram.svg" % motif.id}
        rm.match_img = {"src": "images/%s.png" % self.closest_match[motif.id][0].id}
        rm.match_id = self.closest_match[motif.id][0].id
        rm.match_pval = "%0.2e" % self.closest_match[motif.id][1]

        report_motifs.append(rm)

    total_report = self.motif_report
    kid.enable_import()
    template_file = os.path.join(self.config.get_template_dir(), "report_template_v2.kid")
    template = kid.Template(
        file=template_file, expname=self.name, motifs=report_motifs,
        random=random, genomic=genomic, inputfile=self.inputfile,
        date=datetime.today().strftime("%d/%m/%Y"), version=self.VERSION)

    # ``with`` closes the report even if serialisation fails.
    with open(total_report, "w") as f:
        f.write(template.serialize())
def _run_program(self, bin, fastafile, savedir="", params={}):
    """Run HOMER (``denovo`` mode) and return the motifs it wrote.

    Parameters
    ----------
    bin : str
        Command used to start HOMER.
    fastafile : str
        Input FASTA; made absolute before use.
    savedir : str
        Accepted but not used.
    params : dict
        Overrides for the defaults ``single=False``, ``background=None``,
        ``analysis="medium"``, ``number=5`` and ``width=10``.  A background
        file is required; without one a message is printed and
        ``sys.exit()`` is called.

    Returns
    -------
    tuple
        ``(motifs, stdout, stderr)``.  ``stdout`` starts with the executed
        command line.  Motif ids are rewritten to
        ``<self.name>_<width>_<1-based index>``.
    """
    import os, tempfile, shutil
    from subprocess import Popen, PIPE

    settings = {
        "single": False,
        "background": None,
        "analysis": "medium",
        "number": 5,
        "width": 10,
    }
    settings.update(params)

    homer = bin
    fastafile = os.path.abspath(fastafile)

    # Background file is essential!
    if not settings["background"]:
        print("Background file needed!")
        sys.exit()
    bgfile = os.path.abspath(settings["background"])

    # Only the generated name is kept; the file object itself is discarded.
    outfile = tempfile.NamedTemporaryFile(
        dir=self.tmpdir,
        prefix="homer_w{}.".format(settings["width"])
    ).name

    strand = " -strand + " if settings["single"] else ""

    cmd = "%s denovo -i %s -b %s -len %s -S %s %s -o %s -p 8" % (
        homer,
        fastafile,
        bgfile,
        settings["width"],
        settings["number"],
        strand,
        outfile)

    stderr = ""
    stdout = "Running command:\n{}\n".format(cmd)

    proc = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE, cwd=self.tmpdir)
    out, err = proc.communicate()
    stdout += out
    stderr += err

    motifs = []
    if os.path.exists(outfile):
        motifs = pwmfile_to_motifs(outfile)
        for index, found in enumerate(motifs):
            found.id = "{}_{}_{}".format(self.name, settings["width"], index + 1)

    return motifs, stdout, stderr
def _create_report(self, pwm, background):
    """Render the graphical (HTML) motif report to ``self.motif_report``.

    Images of the closest-match database motifs are drawn first.  The motifs
    in *pwm* are ordered by decreasing MNCP for the sort background
    ("genomic_matched" when present, otherwise ``background[0]``) and
    converted into attribute containers whose ``bg`` dict holds, per
    background, the formatted enrichment, p-value, ROC AUC, MNCP and ROC
    image links.  The kid template ``report_template_v2.kid`` is rendered
    with these objects.

    Relies on ``self.e``, ``self.p``, ``self.auc``, ``self.mncp`` and
    ``self.closest_match`` having been filled beforehand.
    """
    self.logger.info("Creating graphical report")

    class ReportMotif:
        pass

    motifs = pwmfile_to_motifs(pwm)
    for match in self.closest_match.values():
        match[0].to_img(os.path.join(self.imgdir, "%s.png" % match[0].id), format="PNG")

    sort_key = background[0]
    if "genomic_matched" in background:
        sort_key = "genomic_matched"

    roc_img_file = "%s_%s_roc"

    report_motifs = []
    # key/reverse replaces the Python-2-only cmp argument; the sort stays
    # stable, so motifs with equal MNCP keep their input order as before.
    for motif in sorted(motifs, key=lambda m: self.mncp[sort_key][m.id], reverse=True):
        rm = ReportMotif()
        rm.id = motif.id
        rm.id_href = {"href": "#%s" % motif.id}
        rm.id_name = {"name": motif.id}
        rm.img = {"src": os.path.join("images", "%s.png" % motif.id)}

        rm.consensus = motif.to_consensus()

        rm.bg = {}
        for bg in background:
            roc_png = "images/" + os.path.basename(roc_img_file % (motif.id, bg)) + ".png"
            rm.bg[bg] = {
                "e": "%0.2f" % self.e[bg][motif.id],
                "p": "%0.2f" % self.p[bg][motif.id],
                "auc": "%0.3f" % self.auc[bg][motif.id],
                "mncp": "%0.3f" % self.mncp[bg][motif.id],
                "roc_img": {"src": roc_png},
                "roc_img_link": {"href": roc_png},
            }

        rm.histogram_img = {"data": "images/%s_histogram.svg" % motif.id}
        rm.histogram_link = {"href": "images/%s_histogram.svg" % motif.id}
        rm.match_img = {"src": "images/%s.png" % self.closest_match[motif.id][0].id}
        rm.match_id = self.closest_match[motif.id][0].id
        rm.match_pval = "%0.2e" % self.closest_match[motif.id][1]

        report_motifs.append(rm)

    total_report = self.motif_report
    kid.enable_import()
    template_file = os.path.join(self.config.get_template_dir(), "report_template_v2.kid")
    template = kid.Template(
        file=template_file, expname=self.name, motifs=report_motifs,
        inputfile=self.inputfile,
        date=datetime.today().strftime("%d/%m/%Y"), version=GM_VERSION)

    # ``with`` closes the report even if serialisation fails.
    with open(total_report, "w") as f:
        f.write(template.serialize())
def create_location_plots(self, motif_file, fasta_file, params):
    """Create a positional-preference histogram (SVG) for every motif.

    ``self.validation_bed`` is first converted to ``self.location_fa`` with
    ``genome_index.track2fasta``, extending each region on both sides by
    half the difference between ``params["lwidth"]`` and
    ``params["width"]``.  ``motif_localization`` is then run on *fasta_file*
    for each motif in *motif_file*, writing
    ``<imgdir>/<motif id>_histogram.svg``.

    Parameters
    ----------
    motif_file : str
        Motif file to plot.
    fasta_file : str
        FASTA file handed to ``motif_localization``.
    params : dict
        Must provide "genome", "lwidth", "width" and "use_strand".
    """
    self.logger.info("Creating localization plots")

    index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
    lwidth = int(params["lwidth"])
    width = int(params["width"])
    # Explicit floor division: same result as Python 2's int "/" and still
    # an integer when run under Python 3.
    extend = (lwidth - width) // 2

    genome_index.track2fasta(
        index_dir, self.validation_bed, self.location_fa,
        extend_up=extend, extend_down=extend,
        use_strand=params["use_strand"])

    # NOTE(review): track2fasta writes self.location_fa, but the scan below
    # reads fasta_file -- confirm callers pass the same path.
    for motif in pwmfile_to_motifs(motif_file):
        outfile = os.path.join(self.imgdir, "%s_histogram.svg" % motif.id)
        motif_localization(fasta_file, motif, lwidth, outfile)
def roc(args):
    """ Calculate ROC_AUC and other metrics and optionally plot ROC curve.

    Motifs are read from ``args.pwmfile``; ``args.ids`` (comma separated)
    restricts the analysis, otherwise all motifs are used.  Both
    ``args.sample`` and ``args.background`` are scanned with cutoff 0.0 and
    one reported match per sequence, and the score of the first match of
    each sequence is collected per motif.  For every motif a tab-separated
    row with ROC AUC, MNCP, enrichment at 5% FDR and maximum enrichment is
    printed.  When ``args.outfile`` is set the ROC curves are plotted to it
    (``.png`` is appended if missing).

    The MNCP column is now formatted with ``%0.3f``; the former ``%03f`` was
    a typo that printed six decimals.
    """
    outputfile = args.outfile
    # Default extension for image
    if outputfile and not outputfile.endswith(".png"):
        outputfile += ".png"

    motifs = dict((x.id, x) for x in pwmfile_to_motifs(args.pwmfile))
    ids = args.ids.split(",") if args.ids else motifs.keys()

    def _best_scores(fasta_file):
        # Map motif id (text before the first tab) to its first-match scores.
        result = scan(fasta_file, [motifs[x] for x in ids], 0.0, 1)
        totals = {}
        for key, m in result.items():
            totals[key.id.split("\t")[0]] = [matches[0][1] for matches in m.values()]
        return totals

    fg_total = _best_scores(args.sample)
    bg_total = _best_scores(args.background)

    plot_x = []
    plot_y = []
    # Print the metrics
    print("Motif\tROC AUC\tMNCP\tEnr. at 5% FDR\tMax enr.")
    for motif_id in ids:
        fg_vals = fg_total[motif_id]
        bg_vals = bg_total[motif_id]
        (x, y) = ROC_values(fg_vals, bg_vals)
        plot_x.append(x)
        plot_y.append(y)
        auc = ROC_AUC(fg_vals, bg_vals)
        mncp = MNCP(fg_vals, bg_vals)
        enr_fdr = enr_at_fdr(fg_vals, bg_vals)
        # The score at maximum enrichment is not reported.
        max_enr, _ = max_enrichment(fg_vals, bg_vals)
        print("%s\t%0.3f\t%0.3f\t%0.2f\t%0.2f" % (motif_id, auc, mncp, enr_fdr, max_enr))

    # Plot the ROC curve
    if outputfile:
        roc_plot(outputfile, plot_x, plot_y, ids=ids)
def determine_closest_match(self, motifs):
    """Find, for each motif, the closest motif in the bundled database.

    The database is the first entry of the configured motif directory whose
    name starts with "jaspar"; files ending in "pwm"/"pfm" are read with
    ``pwmfile_to_motifs`` and files ending in "transfac" with
    ``transfac_to_motifs``.

    Returns
    -------
    dict
        ``{motif id: [closest database motif, p-value]}``.
    """
    motif_dir = self.config.get_motif_dir()
    jaspar_name = [x for x in os.listdir(motif_dir) if x.startswith("jaspar")][0]
    jaspar = os.path.join(motif_dir, jaspar_name)

    if jaspar.endswith(("pwm", "pfm")):
        db_motifs = pwmfile_to_motifs(jaspar)
    elif jaspar.endswith("transfac"):
        db_motifs = transfac_to_motifs(jaspar)
    else:
        db_motifs = []

    comparer = MotifComparer()
    by_id = dict((m.id, m) for m in db_motifs)
    best = comparer.get_closest_match(motifs, db_motifs, "partial", "wic", "mean")

    closest_match = {}
    for motif in motifs:
        db_motif = by_id[best[motif.id][0]]
        # Calculate p-value
        pval, pos, orient = comparer.compare_motifs(
            motif, db_motif, "partial", "wic", "mean", pval=True)
        closest_match[motif.id] = [db_motif, pval]
    return closest_match
def cluster(args):
    """Cluster the motifs in ``args.inputfile`` and write images and a report.

    The output directory ``args.outdir`` is created when missing.  A single
    input motif forms its own cluster; otherwise ``cluster_motifs`` is run
    with ``args.threshold`` and ``args.ncpus``.  Images are produced by
    ``_create_images`` and the report by ``_write_report``.

    ``tree`` is now initialised to ``None``.  Previously it was unbound when
    the input held exactly one motif, so the ``_write_report`` call raised
    before any report could be written.
    NOTE(review): confirm ``_write_report`` does not use ``tree`` for the
    single-motif case.
    """
    outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    ncpus = args.ncpus

    tree = None
    motifs = pwmfile_to_motifs(args.inputfile)
    if len(motifs) == 1:
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(
            args.inputfile, "total", "wic", "mean", True,
            threshold=args.threshold, include_bg=True, ncpus=ncpus)
        clusters = tree.getResult()

    ids = _create_images(outdir, clusters)
    _write_report(outdir, ids, tree, clusters)
def determine_closest_match(self, motifs):
    """Return the closest database motif and its p-value for each motif.

    The database file is the first entry in the configured motif directory
    whose name starts with "jaspar".  It is parsed as a PWM/PFM file or as
    TRANSFAC depending on its suffix.  The closest match is searched with
    ``parallel=False``.

    Returns
    -------
    dict
        ``{motif id: [closest database motif, p-value]}``.
    """
    self.logger.info("Determining closest matching motifs in database (JASPAR)")

    motif_dir = self.config.get_motif_dir()
    candidates = [x for x in os.listdir(motif_dir) if x.startswith("jaspar")]
    jaspar = os.path.join(motif_dir, candidates[0])

    db_motifs = []
    if jaspar.endswith("pwm") or jaspar.endswith("pfm"):
        db_motifs = pwmfile_to_motifs(jaspar)
    elif jaspar.endswith("transfac"):
        db_motifs = transfac_to_motifs(jaspar)

    mc = MotifComparer()
    db_motif_lookup = dict((m.id, m) for m in db_motifs)
    match = mc.get_closest_match(
        motifs, db_motifs, "partial", "wic", "mean", parallel=False)

    closest_match = {}
    for motif in motifs:
        hit = db_motif_lookup[match[motif.id][0]]
        # Calculate p-value
        pval, pos, orient = mc.compare_motifs(
            motif, hit, "partial", "wic", "mean", pval=True)
        closest_match[motif.id] = [hit, pval]

    return closest_match
def _create_text_report(self, pwm, background):
    """Write a tab-separated summary of all motifs to ``self.text_report``.

    The header lists id, consensus, best JASPAR match and its p-value,
    followed by enrichment, p-value, ROC AUC and MNCP for every background.
    Rows are ordered by decreasing MNCP for the sort background
    ("genomic_matched" when present, otherwise ``background[0]``).

    Relies on ``self.e``, ``self.p``, ``self.auc``, ``self.mncp`` and
    ``self.closest_match`` having been filled beforehand.
    """
    self.logger.info("Creating text report")

    motifs = pwmfile_to_motifs(pwm)

    sort_key = background[0]
    if "genomic_matched" in background:
        sort_key = "genomic_matched"

    header = "ID\tconsensus\tBest match JASPAR\tp-value JASPAR\t" + "\t".join("Enrichment (%s)\tp-value (%s)\tROC AUC (%s)\tMNCP (%s)" % (b, b, b, b) for b in background)

    # ``with`` closes the report even if a lookup fails mid-way.
    with open(self.text_report, "w") as f:
        f.write("%s\n" % header)
        # key/reverse replaces the Python-2-only cmp argument; ties keep
        # their input order, exactly as with the old comparison function.
        for motif in sorted(motifs, key=lambda m: self.mncp[sort_key][m.id], reverse=True):
            vals = [
                motif.id,
                motif.to_consensus(),
                self.closest_match[motif.id][0].id,
                self.closest_match[motif.id][1],
            ]
            for bg in background:
                vals += [
                    self.e[bg][motif.id],
                    self.p[bg][motif.id],
                    self.auc[bg][motif.id],
                    self.mncp[bg][motif.id],
                ]
            f.write("%s\n" % "\t".join([str(x) for x in vals]))
def threshold(args):
    """Print the score cutoff of every motif at the requested FDR.

    ``args.fdr`` must lie in [0, 1]; otherwise a message is printed and the
    process exits with status 1.  ``args.inputfile`` is scanned with cutoff
    0.0 and one reported match per sequence.  For each motif the score of
    the first match of every sequence that has one is collected; the score
    at percentile ``100 - 100 * fdr`` and that score as a fraction of the
    motif's PWM score range are printed.  Motifs without matches produce a
    warning on stderr.

    The unused ``pwm`` local and the dead ``scores = []`` assignment of the
    previous version are gone.
    """
    if args.fdr < 0 or args.fdr > 1:
        print("Please specify a FDR between 0 and 1")
        sys.exit(1)

    motifs = pwmfile_to_motifs(args.pwmfile)
    result = scan(args.inputfile, motifs, 0.0, 1)

    print("Motif\tScore\tCutoff")
    for motif in result.keys():
        min_score = motif.pwm_min_score()
        # x[0][1] is the score field of the first reported match.
        scores = [x[0][1] for x in result[motif].values() if len(x) > 0]
        if len(scores) > 0:
            opt_score = scoreatpercentile(scores, 100 - (100 * args.fdr))
            cutoff = (opt_score - min_score) / (motif.pwm_max_score() - min_score)
            print("{0}\t{1}\t{2}".format(motif.id, opt_score, cutoff))
        else:
            sys.stderr.write("Warning: no matches for {0}\n".format(motif.id))
def command_scan(inputfile, pwmfile, nreport=1, fpr=0.01, cutoff=None, bed=False, scan_rc=True, table=False, score_table=False, moods=False, pvalue=None, bgfile=None, genome=None, ncpus=None):
    """Yield the output rows of a motif scan.

    A ``Scanner`` is set up with the motifs in *pwmfile*.  Unless
    *score_table* is requested, its threshold is configured from *fpr*,
    *cutoff*, *genome*, the median sequence length of the input and
    *bgfile*.  The rows come from ``scan_table`` when *table* is set, from
    ``scan_score_table`` when only *score_table* is set, and from
    ``scan_normal`` otherwise.
    """
    motifs = pwmfile_to_motifs(pwmfile)
    fa = as_fasta(inputfile, genome)

    # initialize scanner
    scanner = Scanner(ncpus=ncpus)
    scanner.set_motifs(pwmfile)

    if not score_table:
        scanner.set_threshold(
            fpr=fpr,
            threshold=cutoff,
            genome=genome,
            length=fa.median_length(),
            filename=bgfile)

    # ``table`` takes precedence over ``score_table``.
    if table:
        rows = scan_table(
            scanner, inputfile, fa, motifs, cutoff, bgfile,
            nreport, scan_rc, pvalue, moods)
    elif score_table:
        rows = scan_score_table(scanner, fa, motifs, scan_rc)
    else:
        rows = scan_normal(
            scanner, inputfile, fa, motifs, cutoff, bgfile,
            nreport, scan_rc, pvalue, moods, bed)

    for row in rows:
        yield row
def create_location_plots(self, motif_file, params):
    """Create a positional-preference histogram (SVG) for every motif.

    For BED input, ``self.validation_bed`` is converted to
    ``self.location_fa`` with regions extended on both sides by half the
    difference between ``params["lwidth"]`` and ``params["width"]``; for any
    other input type ``self.location_fa`` is set to ``self.validation_fa``.
    The plot width is the length of the first sequence in that file, and a
    warning is logged when not all sequences share that length.
    """
    self.logger.info("Creating localization plots")

    if self.input_type != "BED":
        self.location_fa = self.validation_fa
    else:
        index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
        flank = (int(params["lwidth"]) - int(params["width"])) // 2
        genome_index.track2fasta(
            index_dir,
            self.validation_bed,
            self.location_fa,
            extend_up=flank,
            extend_down=flank,
            use_strand=params["use_strand"])

    seqs = Fasta(self.location_fa).seqs
    lwidth = len(seqs[0])
    if not all(len(seq) == lwidth for seq in seqs):
        self.logger.warn("PLEASE NOTE: FASTA file contains sequences of different lengths. Positional preference plots will be incorrect!")

    for motif in pwmfile_to_motifs(motif_file):
        svg_path = os.path.join(self.imgdir, "%s_histogram.svg" % motif.id)
        motif_localization(self.location_fa, motif, lwidth, svg_path)
def _roc_metrics(self, pwm, sample_fa, bg_fa, roc_file):
    """Compute ROC metrics for every motif in *pwm* and write them to *roc_file*.

    One ``get_scores`` job per motif is submitted to ``self.job_server()``.
    Each job result is an ``(error, auc, mncp, max_f, y)`` tuple; the values
    are written as a tab-separated row.  If a job reports an error it is
    logged and the process exits with status 1.

    Returns
    -------
    tuple of dict
        ``(all_auc, all_mncp)``, both keyed by motif id.
    """
    motifs = dict((m.id, m) for m in pwmfile_to_motifs(pwm))

    jobs = {}
    for motif_id, motif in motifs.items():
        jobs[motif_id] = self.job_server().submit(
            get_scores, (motif, sample_fa, bg_fa,))

    all_auc = {}
    all_mncp = {}

    # ``with`` ensures the file is flushed and closed even when sys.exit()
    # fires inside the loop.
    with open(roc_file, "w") as f:
        f.write("Motif\tROC AUC\tMNCP\tMax f-measure\tSens @ max f-measure\n")
        for motif_id in motifs.keys():
            error, auc, mncp, max_f, y = jobs[motif_id]()
            if error:
                self.logger.error("Error in thread: %s" % error)
                sys.exit(1)
            f.write("%s\t%s\t%s\t%s\t%s\n" % (motif_id, auc, mncp, max_f, y))
            all_auc[motif_id] = auc
            all_mncp[motif_id] = mncp

    return all_auc, all_mncp
def location(args):
    """
    Creates histogram of motif location.

    A worker pool sized by the ``ncpus`` default parameter of
    ``MotifConfig`` runs ``motif_localization`` for every selected motif;
    output names are ``<motif id>_histogram``.  The pool is now closed and
    joined before returning, so worker processes are no longer left behind.

    Parameters
    ----------
    args : argparse object
        Command line arguments.
    """
    fastafile = args.fastafile
    pwmfile = args.pwmfile

    lwidth = args.width
    if not lwidth:
        # No width given: use the length of the second field of the first
        # item returned by Fasta.items() (presumably the first sequence).
        f = Fasta(fastafile)
        lwidth = len(f.items()[0][1])
        f = None

    motifs = pwmfile_to_motifs(pwmfile)
    ids = [motif.id for motif in motifs]
    if args.ids:
        ids = args.ids.split(",")

    n_cpus = int(MotifConfig().get_default_params()["ncpus"])
    pool = Pool(processes=n_cpus, maxtasksperchild=1000)
    try:
        jobs = []
        for motif in motifs:
            if motif.id in ids:
                outfile = os.path.join("%s_histogram" % motif.id)
                jobs.append(
                    pool.apply_async(
                        motif_localization,
                        (fastafile, motif, lwidth, outfile, args.cutoff)
                    ))

        # Wait for completion; get() re-raises errors from the workers.
        for job in jobs:
            job.get()
    finally:
        # Stop accepting work and wait for the workers to exit.
        pool.close()
        pool.join()
def cluster(args):
    """Cluster the motifs of a file and write an HTML report plus motif files.

    Parameters
    ----------
    args : argparse object
        Command line arguments. The attributes ``inputfile``, ``outdir``
        and ``threshold`` are read.

    Notes
    -----
    Written to ``args.outdir`` (created when missing):

    * one PNG per cluster motif and, for clusters with more than one
      member, one PNG per member, shifted (and reverse-complemented when
      needed) to line up with the cluster motif;
    * ``cluster_report.html``, rendered from ``cluster_template.jinja.html``;
    * ``cluster_key.txt``, mapping cluster id to member ids;
    * ``clustered_motifs.pwm``, the cluster motifs.

    A file with a single motif is reported as one cluster without running
    the clustering. ``args.single`` is not used by this function.
    """
    import io  # local import: only the UTF-8 report writer needs it

    outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    trim_ic = 0.2
    motifs = pwmfile_to_motifs(args.inputfile)
    if len(motifs) == 1:
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(
            args.inputfile, "total", "wic", "mean", True,
            threshold=args.threshold, include_bg=True,
        )
        clusters = tree.getResult()

    report_entries = []
    mc = MotifComparer()

    sys.stderr.write("Creating images\n")
    for cluster_motif, members in clusters:
        cluster_motif.trim(trim_ic)
        cluster_motif.to_img(
            os.path.join(outdir, "%s.png" % cluster_motif.id), format="PNG"
        )

        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(
                    cluster_motif, motif, "total", "wic", "mean", pval=True
                )
            # Smallest offset among the members; every member is padded
            # relative to it so that no padding is negative.
            add_pos = min(result[1] for result in scores.values())

            for motif in members:
                _, pos, strand = scores[motif]
                add = pos - add_pos
                if strand not in (1, "+"):
                    # Best match is on the other strand: draw the reverse
                    # complement, but keep the id so the file name matches.
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc
                motif.to_img(
                    os.path.join(outdir, "%s.png" % motif.id.replace(" ", "_")),
                    format="PNG",
                    add_left=add,
                )

        member_entries = [
            {
                "src": "%s.png" % motif.id.replace(" ", "_"),
                "alt": motif.id.replace(" ", "_"),
            }
            for motif in members
        ]
        report_entries.append(
            [cluster_motif.id, {"src": "%s.png" % cluster_motif.id}, member_entries]
        )

    config = MotifConfig()
    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader([config.get_template_dir()])
    )
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(motifs=report_entries)

    # Let the file object do the UTF-8 encoding: writing pre-encoded bytes
    # to a text-mode file only works on Python 2.
    with io.open(os.path.join(outdir, "cluster_report.html"), "w", encoding="utf-8") as f:
        f.write(result)

    with open(os.path.join(outdir, "cluster_key.txt"), "w") as f:
        for entry in report_entries:
            f.write("%s\t%s\n" % (entry[0], ",".join([x["alt"] for x in entry[2]])))

    with open(os.path.join(outdir, "clustered_motifs.pwm"), "w") as f:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            # Single-motif input: no tree was built, write the motif itself.
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())
def command_scan(inputfile, pwmfile, nreport=1, cutoff=0.9, bed=False, scan_rc=True, table=False, score_table=False, moods=False, pvalue=None, bgfile=None, genome=None):
    """Scan sequences with motifs and yield the result as text lines.

    Parameters
    ----------
    inputfile : str
        Input passed to ``as_fasta`` (and to ``scan_it_moods`` when
        ``moods`` is set).
    pwmfile : str
        Motif file.
    nreport, cutoff, scan_rc
        Passed through to the scanner.
    bed : bool
        Passed to ``format_line`` for the per-match output.
    table : bool
        Yield a count table (one header line, then one row per sequence).
    score_table : bool
        Yield a best-score table; only used when ``table`` is false.
    moods : bool
        Use ``scan_it_moods`` instead of ``Scanner``. Has no effect on the
        score table.
    pvalue, bgfile
        Passed through to ``scan_it_moods``.
    genome : str, optional
        Genome name; when given, its index directory is passed to
        ``as_fasta``.

    Yields
    ------
    str
        One output line at a time, without trailing newline formatting
        beyond what ``format_line`` returns.
    """
    motifs = pwmfile_to_motifs(pwmfile)

    index_dir = None
    if genome is not None:
        index_dir = os.path.join(MotifConfig().get_index_dir(), genome)

    s = Scanner()
    s.set_motifs(pwmfile)
    fa = as_fasta(inputfile, index_dir)

    header = "\t{}".format("\t".join([m.id for m in motifs]))

    # Each output mode creates only the iterator it actually consumes, so
    # no scan is set up just to be thrown away.
    if table:
        yield header
        if moods:
            result_it = scan_it_moods(inputfile, motifs, cutoff, bgfile, nreport, scan_rc, pvalue, table)
            for seq_id, counts in result_it:
                yield "{}\t{}".format(seq_id, "\t".join([str(x) for x in counts]))
        else:
            result_it = s.count(fa, nreport, scan_rc, cutoff)
            # Rows come back in the same order as the sequences in fa.ids.
            for i, counts in enumerate(result_it):
                yield "{}\t{}".format(fa.ids[i], "\t".join([str(x) for x in counts]))
    elif score_table:
        result_it = s.best_score(fa, scan_rc)
        yield header
        for i, scores in enumerate(result_it):
            yield "{}\t{}".format(fa.ids[i], "\t".join([str(x) for x in scores]))
    elif moods:
        result_it = scan_it_moods(inputfile, motifs, cutoff, bgfile, nreport, scan_rc, pvalue, table)
        for motif, d in result_it:
            for seq_id, matches in d.items():
                # Note the field order: MOODS matches are (pos, score, strand).
                for pos, score, strand in matches:
                    yield format_line(fa, seq_id, motif, score, pos, strand, bed=bed)
    else:
        result_it = s.scan(fa, nreport, scan_rc, cutoff)
        for i, result in enumerate(result_it):
            seq_id = fa.ids[i]
            # One list of matches per motif, in motif-file order.
            for motif, matches in zip(motifs, result):
                for (score, pos, strand) in matches:
                    yield format_line(fa, seq_id, motif, score, pos, strand, bed=bed)
tiny = options.tiny pwmfile = options.pwmfile fg_file = options.sample bg_file = options.background outputfile = options.output if options.tiny: if not outputfile.endswith(".png"): outputfile += ".png" else: if not outputfile.endswith(".png"): outputfile += ".png" motifs = dict([(x.id, x) for x in pwmfile_to_motifs(pwmfile)]) ids = [] if options.ids: ids = options.ids.split(",") else: ids = motifs.keys() fg_jobs = {} bg_jobs = {} for id in ids: if motifs.has_key(id): bg_jobs[id] = job_server.submit(get_scores, (motifs[id],bg_file,)) fg_jobs[id] = job_server.submit(get_scores, (motifs[id],fg_file,)) else:
def diff(args):
    """Compare motif frequencies between input files and plot the result.

    Parameters
    ----------
    args : argparse object
        Command line arguments. The attributes ``inputfiles``
        (comma-separated), ``bgfile``, ``outputfile``, ``pwmfile``,
        ``cutoff``, ``genome``, ``minenr`` and ``minfreq`` are read.

    Notes
    -----
    A single input file whose name ends in ``bed`` is split into one FASTA
    file per value of its fifth column (this requires ``args.genome``);
    otherwise the inputs are used as they are. Motif counts with
    ``nreport=1`` are turned into frequencies, using a pseudocount of
    0.01, for the background and for every input, and handed to
    ``diff_plot``.

    Exits with status 1 when a BED file is given without a genome.
    """
    infiles = args.inputfiles.split(",")
    bgfile = args.bgfile
    outfile = args.outputfile
    pwmfile = args.pwmfile
    cutoff = args.cutoff
    genome = args.genome
    minenr = float(args.minenr)
    minfreq = float(args.minfreq)

    from_bed = len(infiles) == 1 and infiles[0].endswith("bed")
    # Checked before the temporary directory exists, so bailing out here
    # leaves nothing behind.
    if from_bed and not genome:
        sys.stderr.write("Can't convert BED file without genome!\n")
        sys.exit(1)

    tmpdir = mkdtemp()
    try:
        if from_bed:
            # Retrieve FASTA clusters from BED file: group the regions
            # (first three columns) by the cluster name in column five.
            clusters = {}
            with open(infiles[0]) as bed:
                for line in bed:
                    vals = line.strip().split("\t")
                    clusters.setdefault(vals[4], []).append(vals[:3])

            infiles = []
            for cluster, regions in clusters.items():
                sys.stderr.write("Creating FASTA file for {0}\n".format(cluster))
                inbed = os.path.join(tmpdir, "{0}.bed".format(cluster))
                outfa = os.path.join(tmpdir, "{0}.fa".format(cluster))
                with open(inbed, "w") as f:
                    for vals in regions:
                        f.write("{0}\t{1}\t{2}\n".format(*vals))
                Genome(genome).track2fasta(inbed, outfa)
                infiles.append(outfa)

        motif_list = pwmfile_to_motifs(pwmfile)
        pwms = {m.id: m for m in motif_list}
        # Labels follow the order of the motif file rather than dict key
        # order (arbitrary on Python 2). Presumably the scanner reports
        # its counts in that same file order -- confirm against Scanner.
        motifs = [m.id for m in motif_list]
        names = [os.path.basename(os.path.splitext(fname)[0]) for fname in infiles]

        s = Scanner()
        s.set_motifs(pwmfile)
        s.set_threshold(threshold=cutoff)

        # Get background frequencies
        nbg = float(len(Fasta(bgfile).seqs))
        bgcounts = s.total_count(bgfile, nreport=1)
        bgfreq = [(c + 0.01) / nbg for c in bgcounts]

        # Get frequencies in input files
        file_counts = []
        file_freqs = []
        for fname in infiles:
            mcounts = s.total_count(fname, nreport=1)
            n = float(len(Fasta(fname).seqs))
            file_counts.append(mcounts)
            file_freqs.append([(c + 0.01) / n for c in mcounts])

        # Transpose so that rows are motifs and columns are input files.
        freq = np.array(file_freqs).transpose()
        counts = np.array(file_counts).transpose()

        diff_plot(motifs, pwms, names, freq, counts, bgfreq, bgcounts, outfile, minenr=minenr, minfreq=minfreq)
    finally:
        # Remove the intermediate BED/FASTA files even when something failed.
        shutil.rmtree(tmpdir)
dest="fpr", help="Desired fpr", type="float", metavar="FLOAT") (options, args) = parser.parse_args() if not options.pwmfile or not options.inputfile or not options.fpr: parser.print_help() exit() if options.fpr < 0 or options.fpr > 1: print "Please specify a FPR between 0 and 1" sys.exit() f = Fasta(options.inputfile) motifs = pwmfile_to_motifs(options.pwmfile) print "Motif\tScore\tCutoff" for motif in motifs: pwm = motif.pwm scores = [] min_score = motif.pwm_min_score() for name, seq in f.items(): result = pwmscan(seq.upper(), pwm, min_score, 1, True) score = result[0][0] scores.append(score) opt_score = scoreatpercentile(scores, 100 - (100 * options.fpr)) cutoff = (opt_score - min_score) / (motif.pwm_max_score() - min_score) print "%s\t%s\t%s" % (motif.id, opt_score, cutoff)
def maxenr(args):
    """Print, per motif, the score threshold with the highest enrichment.

    Parameters
    ----------
    args : argparse object
        Command line arguments. The attributes ``sample``, ``background``,
        ``pwmfile`` and ``ids`` (comma-separated, optional) are read.

    Notes
    -----
    For every selected motif the scores from ``get_scores`` on the sample
    and on the background are compared. Each score that is reached by more
    than one background sequence is tried as threshold; the enrichment is
    the ratio of sample to background hits at that threshold, corrected
    for the difference in set size. One tab-separated line per motif is
    printed to stdout.

    Exits with status 1 when an input file is missing or an unknown motif
    id is requested. Jobs are submitted to the module-level ``pool``.
    """
    for fname in (args.sample, args.background):
        if not os.path.exists(fname):
            print("File %s does not exist!" % fname)
            sys.exit(1)

    pwmfile = args.pwmfile
    fg_file = args.sample
    bg_file = args.background

    motifs = {x.id: x for x in pwmfile_to_motifs(pwmfile)}
    if args.ids:
        ids = args.ids.split(",")
    else:
        ids = list(motifs)

    # Validate all ids up front so no job is started for a doomed run.
    for motif_id in ids:
        if motif_id not in motifs:
            print("Wrong id: %s" % motif_id)
            sys.exit(1)

    fg_jobs = {}
    bg_jobs = {}
    for motif_id in ids:
        bg_jobs[motif_id] = pool.apply_async(get_scores, (motifs[motif_id], bg_file,))
        fg_jobs[motif_id] = pool.apply_async(get_scores, (motifs[motif_id], fg_file,))

    print("Motif\t# matches\tMax. enrichment\tScore\tCutoff")

    for motif_id in ids:
        pos = array(fg_jobs[motif_id].get())
        neg = array(bg_jobs[motif_id].get())
        # Corrects the hit ratio for unequal numbers of sequences.
        factor = len(neg) / float(len(pos))

        # Only thresholds that more than one background sequence reaches
        # are considered; this also keeps the denominator below non-zero.
        scores = array([s for s in hstack((pos, neg)) if sum(neg >= s) > 1])
        enr = array([(sum(pos >= x) / float(sum(neg >= x))) * factor for x in scores])

        best = enr.argmax()
        max_score = scores[best]
        min_possible = motifs[motif_id].pwm_min_score()
        max_possible = motifs[motif_id].pwm_max_score()
        # Express the threshold as a fraction of the motif's score range.
        cutoff = (max_score - min_possible) / (max_possible - min_possible)

        print("%s\t%s\t%0.2f\t%0.2f\t%0.3f" % (
            motif_id, sum(pos >= max_score), enr[best], max_score, cutoff))
def get_scores(motif, file): from gimmemotifs.fasta import Fasta result = motif.pwm_scan_score(Fasta(file), cutoff=0.0, nreport=1) vals = [sorted(x)[-1] for x in result.values()] return vals job_server = pp.Server(secret="pumpkinrisotto") pwmfile = options.pwmfile fg_file = options.sample bg_file = options.background motifs = dict([(x.id, x) for x in pwmfile_to_motifs(pwmfile)]) ids = [] if options.ids: ids = options.ids.split(",") else: ids = motifs.keys() fg_jobs = {} bg_jobs = {} for id in ids: if motifs.has_key(id): bg_jobs[id] = job_server.submit(get_scores, ( motifs[id], bg_file,