def download(self, outdir=DEFAULT_OUT):
    """Download the CIS-BP archive and convert it to a single motif file.

    The zip archive at ``self.URL`` is extracted into a temporary directory,
    every file matching ``pwms/*`` is parsed as TRANSFAC and all non-empty
    motifs are written, below a short header, to ``outdir/self.NAME``. The
    written file is then read back and used to create the factor annotation.

    Parameters
    ----------
    outdir : str, optional
        Directory that receives the motif file and its annotation.
    """
    tmpdir = mkdtemp()
    try:
        file_tmp = urlretrieve(self.URL, filename=None)[0]
        with zipfile.ZipFile(file_tmp, "r") as zip_ref:
            zip_ref.extractall(tmpdir)

        motifs = []
        for fname in glob.glob(os.path.join(tmpdir, "pwms/*")):
            # Each motif is named after the file it was read from.
            m_id = os.path.splitext(os.path.basename(fname))[0]
            for m in read_motifs(fname, fmt="transfac"):
                if len(m) > 0:
                    m.id = m_id
                    motifs.append(m)

        outfile = os.path.join(outdir, self.NAME)
        with open(outfile, "w") as f:
            print("# CIS-BP motif database (v{})".format(self.VERSION), file=f)
            print("# Retrieved from: {}".format(self.URL), file=f)
            print("# Date: {}".format(self.date), file=f)
            for motif in motifs:
                print(motif.to_pwm(), file=f)
    finally:
        # Remove the extracted archive even when download or parsing fails.
        shutil.rmtree(tmpdir)

    motifs = read_motifs(outfile)
    anno = self.annotate_factors(motifs)
    self.create_annotation(os.path.join(outdir, self.NAME), anno)
def command_scan(inputfile, pwmfile, nreport=1, fpr=0.01, cutoff=None,
                 bed=False, scan_rc=True, table=False, score_table=False,
                 moods=False, pvalue=None, bgfile=None, genome=None,
                 ncpus=None, normalize=False):
    """Scan sequences with motifs and yield the result rows one at a time.

    A ``Scanner`` is configured with the motif file and, when given, the
    genome and background. Unless a score table is requested, a threshold is
    set from ``fpr`` / ``cutoff``. Rows are produced by ``scan_table``
    (``table``), ``scan_score_table`` (``score_table``) or ``scan_normal``.
    """
    motif_list = read_motifs(pwmfile)
    fasta = as_fasta(inputfile, genome)

    # Set up the scanner before choosing the output flavour.
    scanner = Scanner(ncpus=ncpus)
    scanner.set_motifs(pwmfile)

    if genome:
        scanner.set_genome(genome=genome)

    if genome or bgfile:
        scanner.set_background(
            genome=genome, fname=bgfile, length=fasta.median_length()
        )

    if not score_table:
        scanner.set_threshold(fpr=fpr, threshold=cutoff)

    if table:
        rows = scan_table(
            scanner, inputfile, fasta, motif_list, cutoff, bgfile,
            nreport, scan_rc, pvalue, moods,
        )
    elif score_table:
        rows = scan_score_table(
            scanner, fasta, motif_list, scan_rc, normalize=normalize
        )
    else:
        rows = scan_normal(
            scanner, inputfile, fasta, motif_list, cutoff, bgfile,
            nreport, scan_rc, pvalue, moods, bed, normalize=normalize,
        )

    for row in rows:
        yield row
def scan_to_table(input_table, genome, data_dir, scoring, pwmfile=None):
    """Scan the regions of a table and return a regions x motifs DataFrame.

    Parameters
    ----------
    input_table : str
        Tab-separated file; its first column holds the regions to scan.
    genome : str
        Passed to ``check_threshold`` and ``Scanner.set_genome``.
    data_dir : str
        Passed to ``check_threshold``.
    scoring : str
        ``"count"`` uses ``Scanner.count`` with the threshold; any other
        value uses ``Scanner.best_score``.
    pwmfile : str, optional
        Motif file. When omitted, the ``motif_db`` entry of the default
        configuration (inside the configured motif directory) is used.

    Returns
    -------
    pandas.DataFrame
        One row per region (index of the input table), one column per motif.

    Raises
    ------
    ValueError
        If no motif file is given and no default database is configured.
    """
    threshold = check_threshold(data_dir, genome, scoring)

    config = MotifConfig()
    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)
    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    df = pd.read_table(input_table, index_col=0)
    regions = list(df.index)

    s = Scanner()
    s.set_motifs(pwmfile)
    s.set_genome(genome)

    if scoring == "count":
        scores = list(s.count(regions, cutoff=threshold))
    else:
        scores = list(s.best_score(regions))

    # Close the motif file explicitly instead of leaking the handle.
    with open(pwmfile) as handle:
        motif_names = [m.id for m in read_motifs(handle)]
    return pd.DataFrame(scores, index=df.index, columns=motif_names)
def set_motifs(self, motifs):
    """Register the motif file to scan with and index its motif ids.

    Parameters
    ----------
    motifs : str
        Path of the motif file. It is stored as ``self.motifs`` and read
        once to fill ``self.motif_ids``.

    When ``self.use_cache`` is set, a ``CityHash64`` checksum of the sorted
    motif ids is stored in ``self.checksum`` under the motif file name.
    """
    self.motifs = motifs
    # Close the motif file explicitly instead of leaking the handle.
    with open(motifs) as handle:
        self.motif_ids = [m.id for m in read_motifs(handle)]
    self.checksum = {}
    if self.use_cache:
        chksum = CityHash64("\n".join(sorted(self.motif_ids)))
        self.checksum[self.motifs] = chksum
def download(self, outdir=DEFAULT_OUT):
    """Download every JASPAR group and store it as a motif file.

    For each entry of ``self.GROUPS`` the raw file is fetched, its ``>``
    header lines are reduced to the first two tab-separated fields joined
    by ``_``, the result is parsed as JASPAR and rewritten (with a header)
    to the same output file, and a factor annotation is created.
    """
    ### JASPAR ###
    for group in self.GROUPS:
        suffix = "_" + group if group != "" else group
        outfile = os.path.join(outdir, self.NAME.format(suffix))
        url = self.URL.format(suffix)

        # First pass: store the raw download with cleaned-up headers.
        with open(outfile, "w") as out, urlopen(url) as response:
            for raw in response:
                text = raw.decode().strip()
                if text.startswith(">"):
                    fields = text.split("\t")
                    text = "_".join(fields[:2])
                print(text, file=out)

        motifs = read_motifs(outfile, fmt="jaspar")

        # Second pass: overwrite the file in the project's own motif format.
        with open(outfile, "w") as out:
            print("# JASPAR2018{} motif database".format(suffix), file=out)
            print("# Retrieved from: {}".format(url), file=out)
            print("# Date: {}".format(self.date), file=out)
            for motif in motifs:
                print(motif.to_pwm(), file=out)

        annotation = self.annotate_factors(motifs)
        self.create_annotation(
            os.path.join(outdir, self.NAME.format(suffix)), annotation
        )
def create_roc_plots(pwmfile, fgfa, background, outdir):
    """Make ROC plots for all motifs.

    Parameters
    ----------
    pwmfile : str
        Motif file, read with ``fmt="pwm"``.
    fgfa : str
        Foreground file handed to ``get_roc_values``.
    background : dict
        Maps a background name to its file name.
    outdir : str
        Output directory; plots are written to ``outdir/images`` as
        ``<motif id>_roc.<background>.png``.

    The process exits with status 1 when a worker reports an error.
    """
    motifs = read_motifs(pwmfile, fmt="pwm", as_dict=True)
    ncpus = int(MotifConfig().get_default_params()['ncpus'])
    pool = Pool(processes=ncpus)
    try:
        jobs = {}
        for bg, fname in background.items():
            for motif in motifs.values():
                k = "{}_{}".format(str(motif), bg)
                jobs[k] = pool.apply_async(
                    get_roc_values, (motif, fgfa, fname,)
                )

        imgdir = os.path.join(outdir, "images")
        if not os.path.exists(imgdir):
            os.mkdir(imgdir)

        roc_img_file = os.path.join(outdir, "images", "{}_roc.{}.png")
        for motif in motifs.values():
            for bg in background:
                k = "{}_{}".format(str(motif), bg)
                error, x, y = jobs[k].get()
                if error:
                    logger.error("Error in thread: %s", error)
                    logger.error("Motif: %s", motif)
                    sys.exit(1)
                roc_plot(roc_img_file.format(motif.id, bg), x, y)
    finally:
        # Release the workers on success and on the error exit alike; this
        # mirrors what `with Pool()` does (assumes multiprocessing.Pool).
        pool.terminate()
        pool.join()
def _calc_report_values(self, pwm, background):
    """Collect the per-background statistics needed for the report.

    Fills ``self.p`` and ``self.e`` from the cluster enrichment files
    (columns 3 and 6 of every data line, keyed by column 1), fills
    ``self.auc`` and ``self.mncp`` through ``self._roc_metrics`` and stores
    the result of ``self.determine_closest_match`` in ``self.closest_match``.

    Parameters
    ----------
    pwm : str
        Motif file, read with ``fmt="pwm"``.
    background : iterable of str
        Background names; each must be a key of ``self.bg_file[...]``.
    """
    self.logger.debug("Calculating final statistics for report")
    self.p = {b: {} for b in background}
    self.e = {b: {} for b in background}

    e_files = {bg: self.bg_file["cluster_enrichment"][bg] for bg in background}

    for bg in self.p:
        with open(e_files[bg]) as handle:
            for line in handle:
                # Skip comments and the column header line.
                if line.startswith(("#", "Motif\tSig")):
                    continue
                vals = line.strip().split("\t")
                self.p[bg][vals[0]] = float(vals[2])
                self.e[bg][vals[0]] = float(vals[5])

    self.auc = {b: {} for b in background}
    self.mncp = {b: {} for b in background}

    rocs = {
        bg: [self.bg_file["fa"][bg], self.bg_file["roc"][bg]]
        for bg in background
    }

    for bg in self.auc:
        bg_fasta_file, roc_file = rocs[bg]
        self.auc[bg], self.mncp[bg] = self._roc_metrics(
            pwm, self.validation_fa, bg_fasta_file, roc_file
        )

    # Close the motif file explicitly instead of leaking the handle.
    with open(pwm) as handle:
        motifs = read_motifs(handle, fmt="pwm")
    self.closest_match = self.determine_closest_match(motifs)
def load_motifs(motif_file, cutoff=0.95):
    """Read motifs and pair each one with an absolute score cutoff.

    ``parse_cutoff`` supplies one value per motif id; that value is applied
    as a fraction of the range between the motif's minimum and maximum PWM
    score.

    Parameters
    ----------
    motif_file : str
        Path of the motif file.
    cutoff : optional
        Passed unchanged to ``parse_cutoff`` (default 0.95).

    Returns
    -------
    zip
        Pairs of ``(motif, absolute cutoff)``.
    """
    # Close the motif file explicitly instead of leaking the handle.
    with open(motif_file) as handle:
        motifs = read_motifs(handle)
    d = parse_cutoff(motifs, cutoff)
    cutoffs = []
    for m in motifs:
        min_score = m.pwm_min_score()
        cutoffs.append(min_score + (m.pwm_max_score() - min_score) * d[m.id])
    return zip(motifs, cutoffs)
def determine_closest_match(self, motifs):
    """Find the closest database motif, with p-value, for every motif.

    The database is the configured ``motif_db`` inside the configured motif
    directory. Files ending in ``pwm``/``pfm`` are read with ``fmt="pwm"``,
    files ending in ``transfac`` with ``fmt="transfac"``; any other
    extension leaves the database empty.

    Parameters
    ----------
    motifs : list
        Motifs to look up.

    Returns
    -------
    dict
        Maps each motif id to ``[closest database motif, p-value]``.
    """
    self.logger.debug("Determining closest matching motifs in database")
    motif_db = self.config.get_default_params()["motif_db"]
    db = os.path.join(self.config.get_motif_dir(), motif_db)

    db_motifs = []
    if db.endswith(("pwm", "pfm")):
        # Close the database file explicitly instead of leaking the handle.
        with open(db) as handle:
            db_motifs = read_motifs(handle, fmt="pwm")
    elif db.endswith("transfac"):
        # NOTE(review): this branch passes a file name, the one above passes
        # an open file; left as in the original -- confirm read_motifs
        # accepts both.
        db_motifs = read_motifs(db, fmt="transfac")

    mc = MotifComparer()
    db_motif_lookup = {m.id: m for m in db_motifs}
    match = mc.get_closest_match(
        motifs, db_motifs, "partial", "wic", "mean", parallel=False
    )

    closest_match = {}
    for motif in motifs:
        best = db_motif_lookup[match[motif.id][0]]
        # Calculate p-value
        pval, pos, orient = mc.compare_motifs(
            motif, best, "partial", "wic", "mean", pval=True
        )
        closest_match[motif.id] = [best, pval]
    return closest_match
def _run_program(self, bin, fastafile, savedir="", params=None):
    """Run Trawler on a FASTA file and collect the predicted motifs.

    Parameters
    ----------
    bin : str
        Trawler executable.
    fastafile : str
        Input sequences; a temporary copy inside ``self.tmpdir`` is used.
    savedir : str, optional
        Accepted for interface compatibility; not used by this method.
    params : dict, optional
        Overrides for ``single`` (scan one strand) and ``background``
        (background FASTA file, required).

    Returns
    -------
    tuple
        ``(motifs, stdout, stderr)``; ``motifs`` is empty when Trawler
        produced no ``.pwm`` result file. Motif ids are prefixed with
        ``self.name``.

    The process exits with a non-zero status when no background is given.
    """
    default_params = {"single": False, "background": None}
    if params is not None:
        default_params.update(params)

    trawler = bin
    fastafile = os.path.abspath(fastafile)
    if not default_params["background"]:
        # Parenthesised form prints the same text under Python 2 and 3.
        print("Background file needed!")
        # A missing required input is an error: do not exit with status 0.
        sys.exit(1)
    bgfile = os.path.abspath(default_params["background"])

    # Work on a private copy of the input; close the handle right away so
    # it is not leaked (the file itself is kept: delete=False).
    tmp = NamedTemporaryFile(dir=self.tmpdir, delete=False)
    tmp.close()

    stdout = ""
    stderr = ""
    motifs = []
    try:
        shutil.copy(fastafile, tmp.name)
        fastafile = tmp.name

        strand = "single" if default_params["single"] else "double"
        cmd = "%s -sample %s -background %s -directory %s -strand %s" % (
            trawler, fastafile, bgfile, self.tmpdir, strand)

        # Run from the tool's own directory via `cwd` rather than a
        # process-wide os.chdir that is not restored when Popen raises.
        p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE, cwd=self.dir())
        out, err = p.communicate()
        stdout += out
        stderr += err

        # NOTE(review): os.listdir order is arbitrary and the FASTA copy
        # above also starts with "tmp" (NamedTemporaryFile's default
        # prefix); the selection is kept exactly as in the original.
        out_name = [
            entry for entry in os.listdir(self.tmpdir)
            if entry.startswith("tmp")
        ][-1]
        out_file = os.path.join(
            self.tmpdir, out_name, "result", "%s.pwm" % out_name)
        if os.path.exists(out_file):
            with open(out_file) as handle:
                motifs = read_motifs(handle, fmt="pwm")
    finally:
        # remove temporary files, also when the run failed
        if os.path.exists(tmp.name):
            os.unlink(tmp.name)

    for motif in motifs:
        motif.id = "%s_%s" % (self.name, motif.id)

    return motifs, stdout, stderr
def download(self, outdir=DEFAULT_OUT):
    """Download the SwissRegulon motifs and store them as a motif file.

    The raw file from ``self.URL`` is saved, parsed as TRANSFAC and
    rewritten (header plus all non-empty motifs) to ``outdir/self.NAME``.
    That file is read back to create the factor annotation.
    """
    target = os.path.join(outdir, self.NAME)

    # Store the stripped download first so it can be parsed from disk.
    with open(target, "w") as out, urlopen(self.URL) as response:
        for raw in response:
            print(raw.decode().strip(), file=out)

    parsed = read_motifs(target, fmt="transfac")
    with open(target, "w") as out:
        print("# SwissRegulon motif database (hg19:FANTOM5)", file=out)
        print("# Retrieved from: {}".format(self.URL), file=out)
        print("# Date: {}".format(self.date), file=out)
        for motif in parsed:
            if len(motif) > 0:
                print(motif.to_pwm(), file=out)

    motifs = read_motifs(target)
    annotation = self.annotate_factors(motifs)
    self.create_annotation(os.path.join(outdir, self.NAME), annotation)
def download(self, outdir=DEFAULT_OUT):
    """Copy the bundled Factorbook motifs to ``outdir`` and annotate them."""
    # Factorbook only ships its non-redundant set as a supplemental pdf, so
    # for now the copy included with GimmeMotifs is used instead.
    source = "data/motif_databases/factorbook.pfm"
    target = os.path.join(outdir, self.NAME)

    motifs = read_motifs(source)
    with open(target, "w") as out:
        for motif in motifs:
            print(motif.to_pwm(), file=out)

    annotation = self.annotate_factors(motifs)
    self.create_annotation(os.path.join(outdir, self.NAME), annotation)
def test2_stats_single_motif(self):
    """ Calculate motif statistics """
    target = "p53_Average_8_CATGyCnGGrCATGy"

    with open(self.motifs) as handle:
        all_motifs = read_motifs(handle)

    # Pick the motif whose string form equals the expected name.
    matching = [m for m in all_motifs if str(m) == target]
    motif = matching[0]

    result = calc_stats(motif, self.fg_fa, self.bg_fa, stats=["roc_auc"])
    self.assertGreater(result[target]["roc_auc"], 0.9)
def test1_prediction_result(self):
    """ Calculates statistics of motifs """
    # Only the name of the temporary file is needed.
    outname = tempfile.NamedTemporaryFile().name
    backgrounds = {"random": self.bg_fa}
    prediction = PredictionResult(
        outname, fg_file=self.fg_fa, background=backgrounds
    )

    with open(self.motifs) as handle:
        motifs = read_motifs(handle)

    job_result = (0, (motifs, "", ""))
    prediction.add_motifs(job_result)
    prediction.wait_for_stats()
    self.assertEqual(2, len(prediction.stats))
def roc(args):
    """ Calculate ROC_AUC and other metrics and optionally plot ROC curve.

    Reads ``args.pwmfile``, scores ``args.sample`` (foreground) and
    ``args.background`` with every motif and prints one tab-separated line
    of metrics per motif id. ``args.ids`` (comma-separated) restricts the
    reported motifs. When ``args.outfile`` is set, the ROC curves are
    plotted to that file; ``.png`` is appended if missing.
    """
    pwmfile = args.pwmfile
    fg_file = args.sample
    bg_file = args.background
    outputfile = args.outfile
    # Default extension for image
    if outputfile and not outputfile.endswith(".png"):
        outputfile += ".png"

    # Close the motif file explicitly instead of leaking the handle.
    with open(pwmfile) as handle:
        motifs = read_motifs(handle, fmt="pwm")

    s = Scanner()
    s.set_motifs(pwmfile)

    if args.ids:
        ids = args.ids.split(",")
    else:
        ids = [m.id for m in motifs]

    fg_total = dict((m.id, []) for m in motifs)
    for scores in s.best_score(fg_file):
        for motif, score in zip(motifs, scores):
            fg_total[motif.id].append(score)

    bg_total = dict((m.id, []) for m in motifs)
    for scores in s.best_score(bg_file):
        for motif, score in zip(motifs, scores):
            bg_total[motif.id].append(score)

    plot_x = []
    plot_y = []
    # Print the metrics (parenthesised form prints the same text under
    # Python 2 and 3)
    print("Motif\tROC AUC\tMNCP\tEnr. at 5% FDR\tMax enr.\tRecall at 10% FDR")
    for motif_id in ids:
        fg_vals = fg_total[motif_id]
        bg_vals = bg_total[motif_id]
        (x, y) = ROC_values(fg_vals, bg_vals)
        plot_x.append(x)
        plot_y.append(y)
        auc = ROC_AUC(fg_vals, bg_vals)
        mncp = MNCP(fg_vals, bg_vals)
        enr_fdr = enr_at_fdr(fg_vals, bg_vals)
        max_enr, _score = max_enrichment(fg_vals, bg_vals)
        recall = recall_at_fdr(fg_vals, bg_vals, 0.1)

        # MNCP used "%03f" (zero-padded width 3, six decimals), which looks
        # like a typo for the "%0.3f" used by the neighbouring AUC column.
        print("%s\t%0.3f\t%0.3f\t%0.2f\t%0.2f\t%0.4f" % (
            motif_id, auc, mncp, enr_fdr, max_enr, recall))

    # Plot the ROC curve
    if outputfile:
        roc_plot(outputfile, plot_x, plot_y, ids=ids)
def download(self, outdir=DEFAULT_OUT):
    """Download the IMAGE archive and store its motif collection.

    Only ``IMAGE/utils/Collection.motif`` is extracted from the tar archive
    at ``self.URL``. Its motifs are written, below a short header, to
    ``outdir/self.NAME``; that file is read back to create the annotation.

    Parameters
    ----------
    outdir : str, optional
        Directory that receives the motif file and its annotation.
    """
    tmpdir = mkdtemp()
    try:
        file_tmp = urlretrieve(self.URL, filename=None)[0]
        fname = "IMAGE/utils/Collection.motif"
        # Close the archive once the single member has been extracted.
        with tarfile.open(file_tmp) as tar:
            members = [tar.getmember(fname)]
            tar.extractall(tmpdir, members=members)

        outfile = os.path.join(outdir, self.NAME)
        motifs = read_motifs(os.path.join(tmpdir, fname))
        with open(outfile, "w") as f:
            print("# IMAGE motif database (v1.1)", file=f)
            print("# Retrieved from: {}".format(self.URL), file=f)
            print("# Date: {}".format(self.date), file=f)
            for motif in motifs:
                print(motif.to_pwm(), file=f)
    finally:
        # Remove the extracted files even when download or parsing fails.
        shutil.rmtree(tmpdir)

    motifs = read_motifs(outfile)
    anno = self.annotate_factors(motifs)
    self.create_annotation(os.path.join(outdir, self.NAME), anno)
def _run_program(self, bin, fastafile, savedir="", params=None):
    """Run Homer's de novo search and collect the predicted motifs.

    Parameters
    ----------
    bin : str
        Homer executable.
    fastafile : str
        Input sequences.
    savedir : str, optional
        Accepted for interface compatibility; not used by this method.
    params : dict, optional
        Overrides for ``single``, ``background`` (required), ``analysis``,
        ``number`` and ``width``.

    Returns
    -------
    tuple
        ``(motifs, stdout, stderr)``; ``stdout`` starts with the command
        that was run. Motifs are renamed to ``<name>_<width>_<n>``.

    The process exits with a non-zero status when no background is given.
    """
    default_params = {"single": False, "background": None,
                      "analysis": "medium", "number": 5, "width": 10}
    if params is not None:
        default_params.update(params)

    homer = bin

    fastafile = os.path.abspath(fastafile)

    # Background file is essential!
    if not default_params["background"]:
        # Parenthesised form prints the same text under Python 2 and 3.
        print("Background file needed!")
        # A missing required input is an error: do not exit with status 0.
        sys.exit(1)

    bgfile = os.path.abspath(default_params["background"])

    # Only the generated name is used; Homer writes the file itself.
    outfile = NamedTemporaryFile(
        dir=self.tmpdir,
        prefix="homer_w{}.".format(default_params["width"])
    ).name

    stderr = ""

    strand = ""
    if default_params["single"]:
        strand = " -strand + "

    cmd = "%s denovo -i %s -b %s -len %s -S %s %s -o %s -p 8" % (
        homer,
        fastafile,
        bgfile,
        default_params["width"],
        default_params["number"],
        strand,
        outfile)

    stdout = "Running command:\n{}\n".format(cmd)

    p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE, cwd=self.tmpdir)
    out, err = p.communicate()
    stdout += out
    stderr += err

    motifs = []

    if os.path.exists(outfile):
        # Close the result file explicitly instead of leaking the handle.
        with open(outfile) as handle:
            motifs = read_motifs(handle, fmt="pwm")
        for i, m in enumerate(motifs):
            m.id = "{}_{}_{}".format(self.name, default_params["width"], i + 1)

    return motifs, stdout, stderr
def _run_program(self, bin, fastafile, savedir="", params=None):
    """Run XXmotif on a FASTA file and collect the predicted motifs.

    Parameters
    ----------
    bin : str
        XXmotif executable.
    fastafile : str
        Input sequences.
    savedir : str, optional
        Accepted for interface compatibility; not used by this method.
    params : dict, optional
        Overrides for ``single``, ``background``, ``analysis``, ``number``
        and ``width``. ``background`` is optional; when given it is passed
        as ``--negSet``.

    Returns
    -------
    tuple
        ``(motifs, stdout, stderr)``. When the expected ``.pwm`` file is
        missing, ``motifs`` is empty and a note is appended to both streams.
    """
    if params is None:
        params = {}

    default_params = {"single": False, "background": None,
                      "analysis": "medium", "number": 5, "width": 10}
    default_params.update(params)

    program = bin

    fastafile = os.path.abspath(fastafile)

    # Resolve the background only when one is given: calling
    # os.path.abspath(None) would raise before the guard is reached.
    bgfile = default_params["background"]
    background = ""
    if bgfile:
        bgfile = os.path.abspath(bgfile)
        background = " --negSet {0} ".format(bgfile)

    outfile = os.path.join(
        self.tmpdir, os.path.basename(fastafile.replace(".fa", ".pwm")))

    stdout = ""
    stderr = ""

    strand = ""
    if not default_params["single"]:
        strand = " --revcomp "

    cmd = "%s %s %s --localization --batch --no-graphics %s %s" % (
        program,
        self.tmpdir,
        fastafile,
        background,
        strand
    )

    p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
    out, err = p.communicate()
    stdout += out
    stderr += err

    motifs = []

    if os.path.exists(outfile):
        # Close the result file explicitly instead of leaking the handle.
        with open(outfile) as handle:
            motifs = read_motifs(handle, fmt="xxmotifs")
        for m in motifs:
            m.id = "{0}_{1}".format(self.name, m.id)
    else:
        stdout += "\nMotif file {0} not found!\n".format(outfile)
        stderr += "\nMotif file {0} not found!\n".format(outfile)

    return motifs, stdout, stderr
def download(self, outdir=DEFAULT_OUT):
    """Download the ENCODE motifs and store them with an annotation.

    The file at ``self.URL`` is written below a short header to
    ``outdir/self.NAME``; tabs in ``>`` header lines become spaces.
    """
    target = os.path.join(outdir, self.NAME)

    with open(target, "w") as out:
        print("# ENCODE motif database", file=out)
        print("# Retrieved from: {}".format(self.URL), file=out)
        print("# Date: Dec. 2013", file=out)

        with urlopen(self.URL) as response:
            for raw in response:
                text = raw.decode().strip()
                is_header = text.startswith(">")
                if is_header:
                    text = text.replace("\t", " ")
                print(text, file=out)

    motifs = read_motifs(target)
    annotation = self.annotate_factors(motifs)
    self.create_annotation(os.path.join(outdir, self.NAME), annotation)
def create_roc_plots(self, pwm_file, fg_fasta, bg_fasta, name):
    """Create one ROC plot per motif in ``pwm_file``.

    A ``get_roc_values`` job is submitted to ``self.job_server()`` for every
    motif; the plots are written to ``self.imgdir`` as
    ``<motif id>_<name>_roc.png``.

    Parameters
    ----------
    pwm_file : str
        Motif file, read with ``fmt="pwm"``.
    fg_fasta, bg_fasta : str
        Foreground and background files handed to ``get_roc_values``.
    name : str
        Label used in the image file name.

    The process exits with status 1 when a job reports an error.
    """
    # Close the motif file explicitly instead of leaking the handle.
    with open(pwm_file) as handle:
        motifs = {m.id: m for m in read_motifs(handle, fmt="pwm")}

    jobs = {}
    for motif_id, motif in motifs.items():
        jobs[motif_id] = self.job_server().apply_async(
            get_roc_values, (motif, fg_fasta, bg_fasta,))

    roc_img_file = os.path.join(self.imgdir, "%s_%s_roc.png")

    for motif_id in motifs:
        error, x, y = jobs[motif_id].get()
        if error:
            self.logger.error("Error in thread: %s", error)
            sys.exit(1)

        roc_plot(roc_img_file % (motif_id, name), x, y)
def download(self, outdir=DEFAULT_OUT):
    """Download the Homer motifs and store them with an annotation.

    The file at ``self.URL`` is written below a short header to
    ``outdir/self.NAME``; each ``>`` header line is reduced to its first
    two tab-separated fields joined by ``_``.
    """
    ### Homer ###
    target = os.path.join(outdir, self.NAME)

    with open(target, "w") as out:
        print("# Homer motif database (v4.10)", file=out)
        print("# Retrieved from: {}".format(self.URL), file=out)
        print("# Date: {}".format(self.date), file=out)

        with urlopen(self.URL) as response:
            for raw in response:
                text = raw.decode().strip()
                if text.startswith(">"):
                    fields = text.split("\t")
                    text = "_".join(fields[:2])
                print(text, file=out)

    motifs = read_motifs(target)
    annotation = self.annotate_factors(motifs)
    self.create_annotation(os.path.join(outdir, self.NAME), annotation)
def download(self, outdir=DEFAULT_OUT):
    """Download the HUMAN and MOUSE HOCOMOCO sets with their annotation.

    Per group, the file at ``self.URL`` is written below a short header;
    each ``>`` header line is reduced to its first two tab-separated fields
    joined by ``_``. Factors are annotated from ``self.ANNO_URL``.
    """
    for group in ["HUMAN", "MOUSE"]:
        target = os.path.join(outdir, self.NAME.format(group))
        url = self.URL.format(group)

        with open(target, "w") as out:
            print("# HOCOMOCOv10_{} motif database".format(group), file=out)
            print("# Retrieved from: {}".format(url), file=out)
            print("# Date: {}".format(self.date), file=out)

            with urlopen(url) as response:
                for raw in response:
                    text = raw.decode().strip()
                    if text.startswith(">"):
                        fields = text.split("\t")
                        text = "_".join(fields[:2])
                    print(text, file=out)

        motifs = read_motifs(target)
        annotation = self.annotate_factors(motifs, self.ANNO_URL.format(group))
        self.create_annotation(
            os.path.join(outdir, self.NAME.format(group)), annotation
        )
def set_motifs(self, motifs):
    """Register the motifs to scan with.

    ``motifs`` is either a sequence of objects providing ``to_pwm()`` --
    these are written to a temporary file that is kept on disk -- or
    anything else, which is used as the motif file as-is. The resulting
    file is stored in ``self.motifs`` and its motif ids in
    ``self.motif_ids``. With ``self.use_cache`` set, an xxh64 digest of the
    sorted ids is stored in ``self.checksum``.
    """
    try:
        # Probe the first element for to_pwm() to see whether we were
        # handed motif instances rather than a file name.
        motifs[0].to_pwm()
        handle = NamedTemporaryFile(mode="w", delete=False)
        handle.writelines("{}\n".format(m.to_pwm()) for m in motifs)
        handle.close()
        path = handle.name
    except AttributeError:
        path = motifs

    self.motifs = path
    self.motif_ids = [motif.id for motif in read_motifs(path)]
    self.checksum = {}
    if not self.use_cache:
        return
    joined_ids = "\n".join(sorted(self.motif_ids))
    self.checksum[self.motifs] = xxhash.xxh64(joined_ids).digest()
def _create_text_report(self, pwm, background):
    """Write the tab-separated text report to ``self.text_report``.

    Each motif gets one line with its id, consensus, closest database match
    and that match's p-value, followed by enrichment, p-value, ROC AUC and
    MNCP for every background. Motifs are ordered by decreasing MNCP of the
    ``"gc"`` background when present, else of the first background.

    Parameters
    ----------
    pwm : str
        Motif file, read with ``fmt="pwm"``.
    background : list of str
        Background names, in column order.
    """
    self.logger.debug("Creating text report")
    # Close the motif file explicitly instead of leaking the handle.
    with open(pwm) as handle:
        motifs = read_motifs(handle, fmt="pwm")

    sort_key = background[0]
    if "gc" in background:
        sort_key = "gc"

    header = "ID\tconsensus\tBest match db\tp-value best match\t" + "\t".join(
        "Enrichment (%s)\tp-value (%s)\tROC AUC (%s)\tMNCP (%s)" % (b, b, b, b)
        for b in background)

    # key/reverse replaces the Python-2-only `cmp=` argument; a reversed
    # sort is still stable, so ties keep their original relative order.
    ranked = sorted(
        motifs, key=lambda m: self.mncp[sort_key][m.id], reverse=True)

    with open(self.text_report, "w") as f:
        f.write("%s\n" % header)
        for motif in ranked:
            vals = [motif.id, motif.to_consensus(),
                    self.closest_match[motif.id][0].id,
                    self.closest_match[motif.id][1]]
            for bg in background:
                vals += [self.e[bg][motif.id], self.p[bg][motif.id],
                         self.auc[bg][motif.id], self.mncp[bg][motif.id]]
            f.write("%s\n" % "\t".join([str(x) for x in vals]))
def scan_to_best_match(fname, motifs, ncpus=None, genome=None, score=False):
    """Scan a sequence file and collect the best hit per motif.

    Parameters
    ----------
    fname : str
        Filename of a sequence file in FASTA format.
    motifs : list or str
        List of motif instances, or the name of a motif file.
    ncpus : int, optional
        Passed to the ``Scanner`` constructor.
    genome : str, optional
        When given, set as the scanner's genome.
    score : bool, optional
        If true, results come from ``Scanner.best_score`` instead of
        ``Scanner.best_match``.

    Returns
    -------
    result : dict
        Maps each motif id to a list with one entry per scanned item.
    """
    # Set up the scanner with a zero threshold.
    scanner = Scanner(ncpus=ncpus)
    scanner.set_motifs(motifs)
    scanner.set_threshold(threshold=0.0)
    if genome:
        scanner.set_genome(genome)

    # A file name is resolved to motif instances only after it has been
    # handed to the scanner.
    if isinstance(motifs, six.string_types):
        motifs = read_motifs(motifs)

    logger.debug("scanning %s...", fname)
    result = {}
    for m in motifs:
        result[m.id] = []

    hits = scanner.best_score(fname) if score else scanner.best_match(fname)
    for per_motif in hits:
        for motif, value in zip(motifs, per_motif):
            result[motif.id].append(value)

    # Close the pool and reclaim memory
    del scanner

    return result
def create_denovo_motif_report(
    inputfile, pfmfile, fgfa, background, locfa, outdir, params, stats=None
):
    """Create text and graphical (.html) motif reports.

    Parameters
    ----------
    inputfile : str
        Passed through to the text and graphical report writers.
    pfmfile : str
        Motif file, read with ``fmt="pwm"``.
    fgfa : str
        Foreground file used for ROC plots and statistics.
    background : dict
        Maps a background name to its file.
    locfa : str
        FASTA file used for the localization plots.
    outdir : str
        Output directory.
    params : dict or None
        Optional settings; ``genome`` and ``cutoff_fpr`` (default 0.9) are
        read from it.
    stats : dict, optional
        Precomputed statistics; calculated per background when omitted.
    """
    logger.info("creating de novo reports")

    # Normalise `params` before its first use. The original subscripted
    # params["genome"] ahead of this guard, so a falsy `params` crashed.
    if not params:
        params = {}
    cutoff_fpr = params.get("cutoff_fpr", 0.9)

    motifs = read_motifs(pfmfile, fmt="pwm")

    # ROC plots
    # NOTE(review): a missing "genome" entry is now passed on as None
    # instead of raising KeyError -- confirm create_roc_plots accepts that.
    create_roc_plots(pfmfile, fgfa, background, outdir, params.get("genome"))

    # Closest match in database
    mc = MotifComparer()
    closest_match = mc.get_closest_match(motifs)

    if stats is None:
        stats = {}
        for bg, bgfa in background.items():
            for m, s in calc_stats(fg_file=fgfa, bg_file=bgfa, motifs=motifs).items():
                if m not in stats:
                    stats[m] = {}
                stats[m][bg] = s

    stats = add_star(stats)

    lsize = np.median([len(seq) for seq in Fasta(locfa).seqs])

    # Location plots
    logger.debug("Creating localization plots")
    for motif in motifs:
        logger.debug("  {} {}".format(motif.id, motif))
        outfile = os.path.join(outdir, "images/{}_histogram.svg".format(motif.id))
        motif_localization(locfa, motif, lsize, outfile, cutoff=cutoff_fpr)

    # Create reports
    _create_text_report(inputfile, motifs, closest_match, stats, outdir)
    _create_graphical_report(
        inputfile, pfmfile, background, closest_match, outdir, stats
    )
def get_gc_thresholds(self, seqs, motifs=None, zscore=False):
    """Return one score threshold per motif for the given sequences.

    If every column of ``self.threshold`` holds a single unique value, the
    first row is returned as a dict. Otherwise thresholds are resampled per
    GC bin, in proportion to how many of ``seqs`` fall into each bin, and
    the 0.99 quantile of the sample is used for each motif.

    Parameters
    ----------
    seqs : iterable
        Sequences; each is assigned a bin by ``self.get_seq_bin``.
    motifs : list, optional
        Motif instances; read from ``self.motifs`` when omitted.
    zscore : bool, optional
        If true, thresholds are passed through ``scale`` per index group
        before sampling.

    Returns
    -------
    dict
        Maps each motif column to its threshold, or to ``None`` when that
        value is not below the motif's maximum PWM score.
    """
    # Simple case, only one threshold
    if np.all(self.threshold.nunique(axis=0) == 1):
        return self.threshold.iloc[0].to_dict()

    if motifs is None:
        motifs = read_motifs(self.motifs)
    seq_gc_bins = [self.get_seq_bin(seq) for seq in seqs]

    gc_bin_count = Counter(seq_gc_bins)

    _threshold = self.threshold
    if zscore:
        grouped = _threshold.groupby(_threshold.index).apply(scale, axis=0)
        _threshold = pd.DataFrame(
            np.vstack(grouped.values),
            index=_threshold.index,
            columns=_threshold.columns,
        )

    # Samples drawn per input sequence, from a fixed budget of 20000.
    # NOTE(review): this divides by zero when `seqs` is empty -- confirm
    # callers never pass an empty collection.
    nseqs = int(20000 / np.sum(list(gc_bin_count.values())))
    t = {}
    # presumably `motifs` is in the same order as the threshold columns --
    # verify against the caller
    maxt = pd.Series([m.pwm_max_score() for m in motifs], index=_threshold.columns)
    # We do this in a loop as the DataFrame will get too big to fit in memory
    # when the difference between the number of sequences per gc_bin is very
    # high.
    # After reset_index the former index becomes the first column, which is
    # what the per-bin selection below compares against.
    _threshold = _threshold.reset_index()
    idx = np.hstack([
        _threshold[_threshold[_threshold.columns[0]] == gc_bin].sample(
            nseqs * count, replace=True, random_state=42).index.values
        for gc_bin, count in gc_bin_count.items()
    ])
    # Skip the first column: it is the former index, not a motif.
    for motif in _threshold.columns[1:]:
        val = _threshold.loc[idx, motif].quantile(0.99, interpolation="higher")
        if val < maxt.loc[motif]:
            t[motif] = val
        else:
            t[motif] = None
    return t
def logo(args):
    """Create a PNG logo for the selected motifs.

    Without both ``args.pfmfile`` and ``args.ids`` a usage hint is printed
    and the process exits with status 1. Otherwise the motifs are read,
    restricted to the comma-separated ``args.ids`` when given, and each is
    plotted to ``<motif id>.png``.
    """
    nothing_selected = args.pfmfile is None and args.ids is None
    if nothing_selected:
        default_pfm = pfmfile_location(None)
        name = os.path.splitext(os.path.split(default_pfm)[-1])[0]
        usage = (
            "Use the -i argument to specify which motif ids you want to use for logos.",
            "If you really want to create logos for all of the motifs in the default",
            "PFM file use the following command:",
            f"gimme logo -p {name}",
        )
        for message in usage:
            print(message)
        sys.exit(1)

    selected = read_motifs(args.pfmfile)
    if args.ids:
        wanted = args.ids.split(",")
        selected = [candidate for candidate in selected if candidate.id in wanted]

    for motif in selected:
        motif.plot_logo(
            fname=f"{motif.id}.png", kind=args.kind, title=args.title
        )
def create_roc_plots(self, pwm_file, fg_fasta, bg_fasta, name):
    """Plot a ROC curve for every motif found in ``pwm_file``.

    The ROC values are computed by ``get_roc_values`` jobs submitted to
    ``self.job_server()``; each plot is saved in ``self.imgdir`` as
    ``<motif id>_<name>_roc.png``.

    Parameters
    ----------
    pwm_file : str
        Motif file, read with ``fmt="pwm"``.
    fg_fasta, bg_fasta : str
        Foreground and background files handed to ``get_roc_values``.
    name : str
        Label used in the image file name.

    The process exits with status 1 when a job reports an error.
    """
    # Read inside a context manager so the file handle is not leaked.
    with open(pwm_file) as pwm_handle:
        motifs = {m.id: m for m in read_motifs(pwm_handle, fmt="pwm")}

    # `motif_id` avoids shadowing the builtin `id`.
    jobs = {
        motif_id: self.job_server().apply_async(
            get_roc_values, (motif, fg_fasta, bg_fasta,))
        for motif_id, motif in motifs.items()
    }

    roc_img_file = os.path.join(self.imgdir, "%s_%s_roc.png")

    for motif_id in motifs:
        error, x, y = jobs[motif_id].get()
        if error:
            self.logger.error("Error in thread: %s", error)
            sys.exit(1)

        roc_plot(roc_img_file % (motif_id, name), x, y)
def download(self, outdir=DEFAULT_OUT):
    """Download the RSAT non-redundant motifs for each taxon.

    For insects, plants and vertebrates the TRANSFAC file is fetched,
    parsed and written (with a header) to ``outdir``, followed by a factor
    annotation. The URL is printed before each download.
    """
    for tax in ["insects", "plants", "vertebrates"]:
        # The URL needs the name without its final letter, except for
        # names ending in "es".
        short = tax if tax.endswith("es") else tax[:-1]
        url = self.URL.format(tax.capitalize(), short)
        print(url)

        name = self.NAME.format(tax)
        downloaded = urlretrieve(url, filename=None)[0]
        motifs = read_motifs(downloaded, fmt="transfac")

        target = os.path.join(outdir, name)
        with open(target, "w") as out:
            print("# RSAT non-redundant {} motif database".format(tax), file=out)
            print("# Retrieved from: {}".format(url), file=out)
            print("# Date: {}".format(self.date), file=out)
            for motif in motifs:
                print(motif.to_pwm(), file=out)

        annotation = self.annotate_factors(motifs)
        self.create_annotation(
            os.path.join(outdir, self.NAME.format(tax)), annotation
        )
def threshold(args):
    """Calculate motif score threshold for a given FPR.

    Prints one tab-separated line per motif: id, score threshold, and that
    threshold as a fraction of the motif's score range. Exits with status 1
    when ``args.fpr`` lies outside 0..1.
    """
    fpr = args.fpr
    if fpr < 0 or fpr > 1:
        print("Please specify a FPR between 0 and 1")
        sys.exit(1)

    motif_list = read_motifs(args.pwmfile)

    scanner = Scanner()
    scanner.set_motifs(args.pwmfile)
    scanner.set_threshold(args.fpr, filename=args.inputfile)

    print("Motif\tScore\tCutoff")
    for motif in motif_list:
        lowest = motif.pwm_min_score()
        highest = motif.pwm_max_score()
        best = scanner.threshold[motif.id]
        # No threshold available: fall back to the maximum score.
        if best is None:
            best = motif.pwm_max_score()
        relative = (best - lowest) / (highest - lowest)
        print("{0}\t{1}\t{2}".format(motif.id, best, relative))
def _load_factor2motifs(self, pfmfile=None, indirect=True, factors=None):
    """Build a mapping from factor name to the motifs associated with it.

    Parameters
    ----------
    pfmfile : str, optional
        Motif file, read as a dict of motifs.
    indirect : bool, optional
        Passed to ``get_motif_factors``.
    factors : container, optional
        When given, only these factors are kept.

    Returns
    -------
    dict
        Maps a factor to the list of motif names. For a human genome,
        factors are upper-cased and limited to the known human factors.
    """
    motif_db = read_motifs(pfmfile, as_dict=True)
    factor_to_motifs = {}

    if self.is_human_genome():
        valid_factors = self._load_human_factors()

    for motif_name, motif in motif_db.items():
        for factor in get_motif_factors(motif, indirect=indirect):
            if factors is not None and factor not in factors:
                continue

            # TODO: this is temporary, while the motif database we use
            # not very clean...
            if self.is_human_genome():
                factor = factor.upper()

            if self.is_human_genome() and factor not in valid_factors:
                continue

            factor_to_motifs.setdefault(factor, [])
            factor_to_motifs[factor].append(motif_name)

    return factor_to_motifs
def threshold(args):
    """Calculate motif score threshold for a given FPR.

    Writes a header and one tab-separated line per motif to stdout: the
    motif id, its score threshold, and the threshold's position within the
    motif's minimum..maximum score range. An FPR outside 0..1 ends the
    process with status 1.
    """
    out_of_range = args.fpr < 0 or args.fpr > 1
    if out_of_range:
        print("Please specify a FPR between 0 and 1")
        sys.exit(1)

    motifs = read_motifs(args.pwmfile)

    s = Scanner()
    s.set_motifs(args.pwmfile)
    s.set_threshold(args.fpr, filename=args.inputfile)

    row = "{0}\t{1}\t{2}"
    print("Motif\tScore\tCutoff")
    for motif in motifs:
        score_min = motif.pwm_min_score()
        score_max = motif.pwm_max_score()
        score_opt = s.threshold[motif.id]
        if score_opt is None:
            # Use the maximum score when no threshold was determined.
            score_opt = motif.pwm_max_score()
        score_range = score_max - score_min
        fraction = (score_opt - score_min) / score_range
        print(row.format(motif.id, score_opt, fraction))
def test1_denovo(self):
    """ de novo motif prediction """
    run_params = {
        "tools": "BioProspector,Homer,MDmodule",
        "fraction": 0.5,
        "background": "random"
    }
    gimme_motifs("test/data/denovo/input.fa", self.outdir,
                 params=run_params,
                 filter_significant=True,
                 cluster=True)

    expected_files = ["motifs.pwm", "motif_report.html", "cluster_report.html",
                      "params.txt", "stats.random.txt"]

    with open(os.path.join(self.outdir, 'gimmemotifs.log')) as handle:
        log_text = handle.read()
    self.assertIn("clustering", log_text)

    # Every expected output file must have been written
    for expected in expected_files:
        path = os.path.join(self.outdir, expected)
        self.assertTrue(os.path.exists(path))

    # The AP1 consensus must be among the predicted motifs
    with open(os.path.join(self.outdir, "motifs.pwm")) as handle:
        predicted_motifs = read_motifs(handle)
    ap1 = motif_from_consensus("TGASTCA")

    mc = MotifComparer()
    # any() stops at the first sufficiently close motif, like the break did
    ap1_predicted = any(
        mc.get_closest_match(ap1, motif)["TGASTCA"][1][3] < 1e-5
        for motif in predicted_motifs
    )

    self.assertTrue(ap1_predicted)
def create_denovo_motif_report(inputfile, pwmfile, fgfa, background, locfa, outdir, params, stats=None):
    """Create text and graphical (.html) motif reports.

    ROC plots, closest database matches, per-background statistics (unless
    ``stats`` is supplied) and localization histograms are produced before
    both reports are written. ``params`` may supply ``cutoff_fpr``
    (default 0.9).
    """
    logger.info("creating reports")

    motif_list = read_motifs(pwmfile, fmt="pwm")

    # ROC plots
    create_roc_plots(pwmfile, fgfa, background, outdir)

    # Closest match in database
    comparer = MotifComparer()
    closest_match = comparer.get_closest_match(motif_list)

    if stats is None:
        stats = {}
        for bg_name, bg_fasta in background.items():
            per_motif = calc_stats(motif_list, fgfa, bg_fasta)
            for motif_key, motif_stats in per_motif.items():
                if motif_key not in stats:
                    stats[motif_key] = {}
                stats[motif_key][bg_name] = motif_stats

    stats = add_star(stats)

    if not params:
        params = {}
    cutoff_fpr = params.get('cutoff_fpr', 0.9)

    seq_lengths = [len(seq) for seq in Fasta(locfa).seqs]
    lwidth = np.median(seq_lengths)

    # Location plots
    logger.debug("Creating localization plots")
    for motif in motif_list:
        logger.debug("  {} {}".format(motif.id, motif))
        histogram = os.path.join(
            outdir, "images/{}_histogram.svg".format(motif.id)
        )
        motif_localization(locfa, motif, lwidth, histogram, cutoff=cutoff_fpr)

    # Create reports
    _create_text_report(inputfile, motif_list, closest_match, stats, outdir)
    _create_graphical_report(
        inputfile, pwmfile, background, closest_match, outdir, stats
    )
def motif_to_img_series(series, pfmfile=None, motifs=None, outdir=".", subdir="logos"):
    """Create logo images for motifs and return their relative paths.

    Parameters
    ----------
    series : pandas.Series or pandas.Index
        Motif names. For a Series, the names are its values.
    pfmfile : str, optional
        Motif file, only read when ``motifs`` is not given.
    motifs : dict, optional
        Maps a motif name to an object providing ``plot_logo(fname=...)``.
    outdir : str, optional
        Base output directory.
    subdir : str, optional
        Sub-directory of ``outdir`` that receives the images.

    Returns
    -------
    pandas.Series
        Image paths relative to ``outdir`` (``<subdir>/<name>.png``), with
        the index of ``series`` (or ``series`` itself if it is an Index).

    Raises
    ------
    ValueError
        If a motif name is not present in ``motifs``.
    """
    if motifs is None:
        motifs = read_motifs(pfmfile, as_dict=True)

    # Creates outdir as well when it does not exist yet.
    os.makedirs(os.path.join(outdir, subdir), exist_ok=True)

    img_series = []
    for motif in series:
        if motif not in motifs:
            raise ValueError(f"Motif {motif} does not occur in motif database")
        fname = subdir + "/{}.png".format(re.sub(r"[^a-zA-Z0-9\-]+", "_", motif))
        # Check the path the logo is written to. The original tested
        # `fname` relative to the current directory, so existing logos in
        # `outdir` were never detected.
        target = os.path.join(outdir, fname)
        if not os.path.exists(target):
            motifs[motif].plot_logo(fname=target)
        img_series.append(fname)

    index = series if isinstance(series, pd.Index) else series.index
    return pd.Series(data=img_series, index=index)
def _roc_metrics(self, pwm, sample_fa, bg_fa, roc_file):
    """Compute ROC metrics per motif and write them to ``roc_file``.

    A ``get_scores`` job is submitted to ``self.job_server()`` for every
    motif in ``pwm``. The output is a tab-separated table with ROC AUC,
    MNCP, max f-measure and the sensitivity at max f-measure.

    Parameters
    ----------
    pwm : str
        Motif file, read with ``fmt="pwm"``.
    sample_fa, bg_fa : str
        Sample and background files handed to ``get_scores``.
    roc_file : str
        Output file for the metrics table.

    Returns
    -------
    tuple of dict
        ``(all_auc, all_mncp)``, both keyed by motif id.

    The process exits with status 1 when a job reports an error.
    """
    # Close the motif file explicitly instead of leaking the handle.
    with open(pwm) as handle:
        motifs = {m.id: m for m in read_motifs(handle, fmt="pwm")}

    jobs = {}
    for motif_id, motif in motifs.items():
        jobs[motif_id] = self.job_server().apply_async(
            get_scores, (motif, sample_fa, bg_fa,))

    all_auc = {}
    all_mncp = {}
    # The context manager also closes the table on the sys.exit path.
    with open(roc_file, "w") as f:
        f.write("Motif\tROC AUC\tMNCP\tMax f-measure\tSens @ max f-measure\n")
        for motif_id in motifs:
            error, auc, mncp, max_f, y = jobs[motif_id].get()
            if error:
                self.logger.error("Error in thread: %s", error)
                sys.exit(1)
            f.write("%s\t%s\t%s\t%s\t%s\n" % (motif_id, auc, mncp, max_f, y))
            all_auc[motif_id] = auc
            all_mncp[motif_id] = mncp

    return all_auc, all_mncp
def load_motifs(motifs_name):
    """
    Load one of the motif collections that ship with the motif database.

    Args:
        motifs_name (str) : Name of motifs; must be listed in MOTIFS_LIST.

    Returns:
        list : List of gimmemotifs.motif object.

    Raises:
        ValueError : If motifs_name is not in MOTIFS_LIST.
    """
    # Known names are resolved through the path table and read directly.
    if motifs_name in MOTIFS_LIST:
        return read_motifs(MOTIFS_PATH_DICT[motifs_name])

    raise ValueError(
        "The motifs name was not in the list. Available motifs: ", MOTIFS_LIST)
def location(args):
    """
    Creates histogram of motif location.

    One ``motif_localization`` job is run per selected motif, writing to
    ``<motif id>_histogram``. When ``args.size`` is not set, the length of
    the first sequence in the FASTA file is used as the size.

    Parameters
    ----------
    args : argparse object
        Command line arguments; ``fastafile``, ``pfmfile``, ``size``,
        ``ids`` (comma-separated, optional) and ``cutoff`` are read.
    """
    fastafile = args.fastafile
    pfmfile = args.pfmfile

    lsize = args.size
    if not lsize:
        f = Fasta(fastafile)
        lsize = len(f.items()[0][1])
        f = None

    motifs = read_motifs(pfmfile)
    if args.ids:
        ids = set(args.ids.split(","))
    else:
        ids = set(motif.id for motif in motifs)

    n_cpus = int(MotifConfig().get_default_params()["ncpus"])
    pool = Pool(processes=n_cpus, maxtasksperchild=1000)
    try:
        jobs = []
        for motif in motifs:
            if motif.id in ids:
                # The original wrapped this in a single-argument
                # os.path.join, which returns the name unchanged.
                outfile = "%s_histogram" % motif.id
                jobs.append(
                    pool.apply_async(
                        motif_localization,
                        (fastafile, motif, lsize, outfile, args.cutoff)))

        for job in jobs:
            job.get()
    finally:
        # Release the workers whether or not a job failed; this mirrors
        # what `with Pool()` does (assumes multiprocessing.Pool).
        pool.terminate()
        pool.join()
def maelstrom_html_report(outdir, infile, pwmfile=None, threshold=2):
    """Write a sortable HTML report with logos for a motif score table.

    Parameters
    ----------
    outdir : str
        Output directory; receives ``gimme.maelstrom.report.html`` and a
        ``logos`` sub-directory with one PNG per reported motif.
    infile : str
        Tab-separated table with motif ids in the first column.
    pwmfile : str, optional
        Motif file passed to ``read_motifs``.
    threshold : number, optional
        Rows are kept when at least one absolute value reaches this.
    """
    df = pd.read_table(infile, index_col=0)
    df = df[np.any(abs(df) >= threshold, 1)]

    # Assigning None clears the index name on every pandas version;
    # `del df.index.name` is rejected by newer releases.
    df.index.name = None
    cols = df.columns

    # The original parsed the motif file twice; once is enough.
    motifs = read_motifs(pwmfile)
    idx = [motif.id for motif in motifs]
    direct = [
        ",".join(sorted(set([x.upper() for x in motif.factors[DIRECT_NAME]])))
        for motif in motifs
    ]
    indirect = [
        ",".join(sorted(set([x.upper() for x in motif.factors[INDIRECT_NAME]])))
        for motif in motifs
    ]
    m2f = pd.DataFrame({DIRECT_NAME: direct, INDIRECT_NAME: indirect}, index=idx)

    factor_cols = [DIRECT_NAME, INDIRECT_NAME]
    for factor_col in factor_cols:
        # Show at most 30 characters; the full list goes in the tooltip.
        f = m2f[factor_col].str.len() > 30
        m2f[factor_col] = (
            '<div title="' + m2f[factor_col] + '">'
            + m2f[factor_col].str.slice(0, 30)
        )
        m2f.loc[f, factor_col] += '(...)'
        m2f[factor_col] += '</div>'
    df = df.join(m2f)

    df["logo"] = [
        '<img src="logos/{}.png" height=40/>'.format(re.sub('[()/]', '_', x))
        for x in list(df.index)
    ]

    if not os.path.exists(outdir + "/logos"):
        os.makedirs(outdir + "/logos")
    for motif in motifs:
        if motif.id in df.index:
            motif.to_img(
                outdir + "/logos/{}.png".format(re.sub('[()/]', '_', motif.id)),
                fmt="PNG")

    # Read the template assets with context managers so they are closed.
    template_dir = MotifConfig().get_template_dir()
    with open(os.path.join(template_dir, "sortable/sortable.min.js"),
              encoding="utf-8") as handle:
        js = handle.read()
    with open(os.path.join(template_dir, "sortable/sortable-theme-slick.css"),
              encoding="utf-8") as handle:
        css = handle.read()

    df = df[factor_cols + ["logo"] + list(cols)]

    df_styled = df.style
    absmax = np.max((abs(df[cols].max().max()), abs(df[cols].min().min())))
    target = absmax * 1.75

    for col in cols:
        # Extend each column's gradient so all columns share one scale.
        smin = df[col].min()
        smax = df[col].max()
        diff = smax - smin
        low = abs((-target - smin) / diff)
        high = (target - smax) / diff
        df_styled = df_styled.background_gradient(
            cmap='RdBu_r', low=low, high=high, subset=[col])

    df_styled = df_styled.set_precision(3)
    df_styled = df_styled.set_table_attributes("data-sortable")
    df_styled = df_styled.render()
    df_styled = df_styled.replace(
        "data-sortable", 'class="sortable-theme-slick" data-sortable')

    with open(outdir + "/gimme.maelstrom.report.html", "w", encoding="utf-8") as f:
        f.write("<head>\n")
        f.write("<style>{}</style>\n".format(css))
        f.write("</head>\n")
        f.write("<body>\n")
        f.write(df_styled)
        f.write("<script>{}</script>\n".format(js))
        f.write("</body>\n")
def _create_report(self, pwm, background, stats=None, best_id=None):
    """Render the HTML motif report and write it to ``self.motif_report``.

    Parameters
    ----------
    pwm : str
        Filename of the motifs (read with ``fmt="pwm"``).
    background : list
        Background names; the report is sorted on the MNCP of the "gc"
        background when present, otherwise on the first entry.
    stats : dict, optional
        Looked up per motif under the key ``"<id>_<consensus>"`` for its
        ``"stars"`` value.
    best_id : dict, optional
        Looked up per motif id for the value shown as ``best``.
    """
    if stats is None:
        stats = {}
    if best_id is None:
        best_id = {}

    self.logger.debug("Creating graphical report")

    class ReportMotif:
        pass

    with open(pwm) as handle:
        motifs = read_motifs(handle, fmt="pwm")

    # Logo of the closest database match for every motif.
    for m, match in self.closest_match.items():
        match[0].to_img(
            os.path.join(self.imgdir, "%s.png" % match[0].id), format="PNG"
        )

    sort_key = background[0]
    if "gc" in background:
        sort_key = "gc"

    roc_img_file = "%s_%s_roc"
    report_motifs = []
    # Highest MNCP first. The original used sorted(cmp=...), which does not
    # exist on Python 3; key + reverse gives the same (stable) order.
    sorted_motifs = sorted(
        motifs, key=lambda mtf: self.mncp[sort_key][mtf.id], reverse=True
    )

    for motif in sorted_motifs:
        rm = ReportMotif()
        rm.id = motif.id
        rm.id_href = {"href": "#%s" % motif.id}
        rm.id_name = {"name": motif.id}
        rm.img = {"src": os.path.join("images", "%s.png" % motif.id)}

        rm.best = best_id[motif.id]

        rm.consensus = motif.to_consensus()
        rm.stars = stats["%s_%s" % (motif.id, motif.to_consensus())]["stars"]

        rm.bg = {}
        for bg in background:
            roc_png = (
                "images/" + os.path.basename(roc_img_file % (motif.id, bg)) + ".png"
            )
            rm.bg[bg] = {}
            rm.bg[bg]["e"] = "%0.2f" % self.e[bg].setdefault(motif.id, 0.0)
            rm.bg[bg]["p"] = "%0.2f" % self.p[bg].setdefault(motif.id, 1.0)
            rm.bg[bg]["auc"] = "%0.3f" % self.auc[bg][motif.id]
            rm.bg[bg]["mncp"] = "%0.3f" % self.mncp[bg][motif.id]
            rm.bg[bg]["roc_img"] = {"src": roc_png}
            rm.bg[bg]["roc_img_link"] = {"href": roc_png}

        rm.histogram_img = {"data": "images/%s_histogram.svg" % motif.id}
        rm.histogram_link = {"href": "images/%s_histogram.svg" % motif.id}
        rm.match_img = {
            "src": "images/%s.png" % self.closest_match[motif.id][0].id
        }
        rm.match_id = self.closest_match[motif.id][0].id
        rm.match_pval = "%0.2e" % self.closest_match[motif.id][1]

        report_motifs.append(rm)

    total_report = self.motif_report

    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader([self.config.get_template_dir()])
    )
    template = env.get_template("report_template.jinja.html")
    result = template.render(
        expname=self.basename,
        motifs=report_motifs,
        inputfile=self.inputfile,
        date=datetime.today().strftime("%d/%m/%Y"),
        version=GM_VERSION,
    )

    # The rendered page is encoded explicitly, so the file must be binary.
    with open(total_report, "wb") as f:
        f.write(result.encode("utf-8"))
def _create_graphical_report(inputfile, pwm, background, closest_match, outdir, stats, best_id=None):
    """Create main gimme_motifs output html report.

    Parameters
    ----------
    inputfile : str
        Passed to the template as ``inputfile``.
    pwm : str
        Motif file, read with ``fmt="pwm"``.
    background : dict
        Keyed by background name; the keys select the per-background
        statistics and are passed to the template as ``bg_types``.
    closest_match : dict
        Maps a motif id to a pair whose first item is the id of the closest
        database motif and whose second item is a sequence ending in the
        value reported as p-value.
    outdir : str
        Output directory; receives ``motif_report.html`` and ``images/``.
    stats : dict
        ``stats[str(motif)][bg]`` is a dict of statistics for that motif
        and background.
    best_id : dict, optional
        Currently unused (see the TODO in the body).
    """
    if best_id is None:
        best_id = {}

    logger.debug("Creating graphical report")

    class ReportMotif(object):
        """Placeholder for motif stats."""

    config = MotifConfig()

    imgdir = os.path.join(outdir, "images")
    # makedirs also works when `outdir` itself does not exist yet.
    os.makedirs(imgdir, exist_ok=True)

    motifs = read_motifs(pwm, fmt="pwm")

    roc_img_file = "%s_roc.%s"

    dbpwm = config.get_default_params()["motif_db"]
    pwmdir = config.get_motif_dir()
    dbmotifs = read_motifs(os.path.join(pwmdir, dbpwm), as_dict=True)

    report_motifs = []
    for motif in motifs:
        rm = ReportMotif()
        rm.id = motif.id
        rm.id_href = {"href": "#%s" % motif.id}
        rm.id_name = {"name": motif.id}
        rm.img = {"src": os.path.join("images", "%s.png" % motif.id)}
        motif.to_img(
            os.path.join(outdir, "images/{}.png".format(motif.id)), fmt="PNG"
        )

        # TODO: fix best ID
        rm.best = "Gimme"  # best_id[motif.id]

        rm.consensus = motif.to_consensus()

        # A single lookup serves both the stars and the per-background
        # numbers; a missing entry raises KeyError here instead of a
        # confusing AttributeError on None further down.
        motif_stats = stats[str(motif)]
        # Mean number of stars over all backgrounds, rounded half up.
        rm.stars = int(
            np.mean([motif_stats[bg].get("stars", 0) for bg in background]) + 0.5
        )

        rm.bg = {}
        for bg in background:
            this_stats = motif_stats[bg]
            roc_png = (
                "images/" + os.path.basename(roc_img_file % (motif.id, bg)) + ".png"
            )
            rm.bg[bg] = {}
            # TODO: fix these stats
            rm.bg[bg]["e"] = "%0.2f" % this_stats.get("enr_at_fpr", 1.0)
            rm.bg[bg]["p"] = "%0.2f" % this_stats.get("phyper_at_fpr", 1.0)
            rm.bg[bg]["auc"] = "%0.3f" % this_stats.get("roc_auc", 0.5)
            rm.bg[bg]["mncp"] = "%0.3f" % this_stats.get("mncp", 1.0)
            rm.bg[bg]["roc_img"] = {"src": roc_png}
            rm.bg[bg]["roc_img_link"] = {"href": roc_png}

        rm.histogram_img = {"data": "images/%s_histogram.svg" % motif.id}
        rm.histogram_link = {"href": "images/%s_histogram.svg" % motif.id}

        match_id = closest_match[motif.id][0]
        dbmotifs[match_id].to_img(
            os.path.join(outdir, "images/{}.png".format(match_id)), fmt="PNG"
        )

        rm.match_img = {"src": "images/{}.png".format(match_id)}
        rm.match_id = match_id
        rm.match_pval = "%0.2e" % closest_match[motif.id][1][-1]

        report_motifs.append(rm)

    total_report = os.path.join(outdir, "motif_report.html")

    star_img = os.path.join(config.get_template_dir(), "star.png")
    shutil.copyfile(star_img, os.path.join(outdir, "images", "star.png"))

    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader([config.get_template_dir()])
    )
    template = env.get_template("report_template.jinja.html")
    # TODO: title
    result = template.render(
        motifs=report_motifs,
        inputfile=inputfile,
        date=datetime.today().strftime("%d/%m/%Y"),
        version=__version__,
        bg_types=list(background.keys()),
    )

    with open(total_report, "wb") as f:
        f.write(result.encode("utf-8"))
def set_threshold(self, fpr=None, threshold=None, gc=False):
    """Set motif scanning threshold based on background sequences.

    Parameters
    ----------
    fpr : float, optional
        Desired FPR, between 0.0 and 1.0.

    threshold : float or str, optional
        Desired motif threshold, expressed as the fraction of the
        difference between minimum and maximum score of the PWM.
        Should either be a float between 0.0 and 1.0 or a filename
        with thresholds as created by 'gimme threshold'.

    gc : bool, optional
        Passed to ``set_background`` when no background was set yet.

    Raises
    ------
    ValueError
        If both or neither of `fpr` and `threshold` are given, if `fpr`
        is out of range, if no motifs were set, or if a background is
        needed and could not be created.
    """
    if threshold and fpr:
        raise ValueError("Need either fpr or threshold.")
    # Without this check the call failed much later with a TypeError while
    # formatting the cache key, after the lock had already been taken.
    if fpr is None and threshold is None:
        raise ValueError("Need either fpr or threshold.")

    if fpr:
        fpr = float(fpr)
        if not (0.0 < fpr < 1.0):
            raise ValueError("Parameter fpr should be between 0 and 1")

    if not self.motifs:
        raise ValueError("please run set_motifs() first")

    if threshold is not None:
        self.threshold = parse_threshold_values(self.motifs, threshold)
        return

    if not self.background:
        try:
            self.set_background(gc=gc)
        except Exception as err:
            raise ValueError("please run set_background() first") from err

    motifs = read_motifs(self.motifs)
    seqs = self.background.seqs
    thresholds = {}

    def cache_key(motif):
        return "{}|{}|{:.4f}".format(motif.hash(), self.background_hash, fpr)

    def store(motif, score):
        # A threshold at the maximum PWM score is stored as None and one
        # at the minimum score as 0.0; anything else is stored as is.
        if np.isclose(score, motif.pwm_max_score()):
            thresholds[motif.id] = None
        elif np.isclose(score, motif.pwm_min_score()):
            thresholds[motif.id] = 0.0
        else:
            thresholds[motif.id] = score

    lock.acquire()
    try:
        with Cache(CACHE_DIR) as cache:
            scan_motifs = []
            for motif in motifs:
                cached = cache.get(cache_key(motif))
                if cached is None:
                    scan_motifs.append(motif)
                else:
                    store(motif, cached)

            if len(scan_motifs) > 0:
                logger.info("determining FPR-based threshold")
                for motif, score in self._threshold_from_seqs(
                    scan_motifs, seqs, fpr
                ):
                    cache.set(cache_key(motif), score)
                    store(motif, score)
    finally:
        # Always release, otherwise one failure blocks every later call.
        lock.release()

    # `threshold` is the (None) argument here; the loops above no longer
    # overwrite it with the score of whichever motif came last.
    self.threshold_str = "{}_{}_{}".format(fpr, threshold, self.background_hash)
    self.threshold = thresholds
def scan_to_table(
    input_table, genome, scoring, pfmfile=None, ncpus=None, zscore=True, gc=True
):
    """Scan regions in input table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of input table. Can be either a text-separated tab file or
        a feather file.

    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.

    scoring : str
        "count" or "score"

    pfmfile : str, optional
        Specify a PFM file for scanning. When omitted, the default motif
        database from the configuration is used.

    ncpus : int, optional
        If defined this specifies the number of cores to use.

    zscore : bool, optional
        Passed to ``Scanner.best_score`` when scoring is not "count".

    gc : bool, optional
        Passed to ``Scanner.set_background`` and ``Scanner.best_score``.

    Returns
    -------
    table : pandas.DataFrame
        DataFrame with motif ids as column names and regions as index.
        Values are either counts or scores depending on the 'scoring'
        parameter.
    """
    config = MotifConfig()

    if pfmfile is None:
        default_db = config.get_default_params().get("motif_db", None)
        if default_db is None:
            raise ValueError("no pfmfile given and no default database specified")
        pfmfile = os.path.join(config.get_motif_dir(), default_db)

    logger.info("reading table")
    if input_table.endswith("feather"):
        df = pd.read_feather(input_table)
        idx = df.iloc[:, 0].values
    else:
        df = pd.read_table(input_table, index_col=0, comment="#")
        idx = df.index

    regions = list(idx)
    # The background size is estimated from at most 1000 random regions.
    check_regions = regions
    if len(regions) >= 1000:
        check_regions = np.random.choice(regions, size=1000, replace=False)

    seq_lengths = [len(seq) for seq in as_fasta(check_regions, genome=genome).seqs]
    size = int(np.median(seq_lengths))

    scanner = Scanner(ncpus=ncpus)
    scanner.set_motifs(pfmfile)
    scanner.set_genome(genome)
    scanner.set_background(genome=genome, gc=gc, size=size)

    if scoring == "count":
        logger.info("setting threshold")
        scanner.set_threshold(fpr=FPR)
        logger.info("creating count table")
        scores = list(scanner.count(regions))
        logger.info("done")
    else:
        scanner.set_threshold(threshold=0.0)
        if zscore:
            detail = " (z-score" + (", GC%" if gc else "") + ")"
        else:
            detail = " (logodds)"
        logger.info("creating score table" + detail)
        scores = list(scanner.best_score(regions, zscore=zscore, gc=gc))
        logger.info("done")

    motif_names = [motif.id for motif in read_motifs(pfmfile)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
def _cluster_motifs(self, pfm_file, cluster_pwm, dir, threshold):
    """Cluster motifs, draw their logos and write the cluster report.

    Parameters
    ----------
    pfm_file : str
        Filename of the motifs to cluster (read with ``fmt="pwm"``).
    cluster_pwm : str
        Output filename for the clustered motifs.
    dir : str
        Not used by this method.
    threshold : float or str
        Clustering threshold; converted with ``float()``.

    Returns
    -------
    clusters : list
        Pairs of (cluster motif, list of member motifs).
    """
    self.logger.info("clustering significant motifs.")

    trim_ic = 0.2
    clusters = []
    with open(pfm_file) as handle:
        motifs = read_motifs(handle, fmt="pwm")
    if len(motifs) == 1:
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(
            pfm_file,
            "total",
            "wic",
            "mean",
            True,
            threshold=float(threshold),
            include_bg=True,
            progress=False,
        )
        clusters = tree.getResult()

    ids = []
    mc = MotifComparer()

    for cluster, members in clusters:
        cluster.trim(trim_ic)
        cluster.to_img(
            os.path.join(self.imgdir, "%s.png" % cluster.id), format="PNG"
        )
        ids.append([cluster.id, {"src": "images/%s.png" % cluster.id}, []])
        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(
                    cluster, motif, "total", "wic", "mean", pval=True
                )
            # Smallest position among the members; every logo is shifted
            # relative to it. (sorted(cmp=...) is not valid on Python 3.)
            add_pos = min(scores.values(), key=lambda x: x[1])[1]
            for motif in members:
                score, pos, strand = scores[motif]
                add = pos - add_pos

                if strand not in [1, "+"]:
                    # Draw the reverse complement under the original id.
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc
                motif.to_img(
                    os.path.join(
                        self.imgdir, "%s.png" % motif.id.replace(" ", "_")
                    ),
                    format="PNG",
                    add_left=add,
                )
        ids[-1][2] = [
            dict(
                [
                    ("src", "images/%s.png" % motif.id.replace(" ", "_")),
                    ("alt", motif.id.replace(" ", "_")),
                ]
            )
            for motif in members
        ]

    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader([self.config.get_template_dir()])
    )
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(
        expname=self.basename,
        motifs=ids,
        inputfile=self.inputfile,
        date=datetime.today().strftime("%d/%m/%Y"),
        version=GM_VERSION,
    )

    # The page is encoded explicitly, so the file is opened in binary mode.
    with open(self.cluster_report, "wb") as f:
        f.write(result.encode("utf-8"))

    with open(cluster_pwm, "w") as f:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())

    self.logger.debug(
        "Clustering done. See the result in %s", self.cluster_report
    )
    return clusters
def select_nonredundant_motifs(roc_report, pfmfile, fg_table, bg_table, tolerance=0.001):
    """Select a set of non-redundant motifs with recursive feature elimination.

    Motifs with an "Enr. at 1% FPR" of at least 2 are ranked on the mean
    rank of four metrics. Going down the ranking, every motif similar
    (seqcor score >= 0.7) to an already kept motif is dropped. The kept
    motifs are then reduced further by recursive feature elimination with
    an L1-penalised logistic regression.

    Parameters
    ----------
    roc_report : str
        Tab-separated statistics report with motif ids in the first column.
    pfmfile : str
        Motif file; resolved with ``pfmfile_location``.
    fg_table : str
        Tab-separated motif score table of the input sequences.
    bg_table : str
        Tab-separated motif score table of the background sequences.
    tolerance : float, optional
        Adding a feature must improve the mean cross-validated average
        precision by at least ``tolerance`` times the score obtained with
        all kept motifs; otherwise selection stops.

    Returns
    -------
    selected_features : pandas.Index
        Ids of the selected motifs.
    """
    pfmfile = pfmfile_location(pfmfile)
    motifs = read_motifs(pfmfile)
    motif_dict = read_motifs(pfmfile, as_dict=True)

    mc = MotifComparer()

    df = pd.read_csv(roc_report, sep="\t", index_col=0)
    df = df[df["Enr. at 1% FPR"] >= 2]
    motifs = [m for m in motifs if m.id in df.index]

    cols = ["ROC AUC", "PR AUC", "Enr. at 1% FPR", "Recall at 10% FDR"]
    rank = df[cols].rank().mean(1).sort_values(ascending=False)

    redundant_motifs = set()
    keep = []
    while True:
        remaining = rank[~rank.index.isin(list(redundant_motifs))]
        if remaining.shape[0] == 0:
            break
        motif = remaining.head(1).index[0]
        keep.append(motif)

        result = mc.get_all_scores(
            [motif_dict[motif]],
            [m for m in motifs if m.id not in redundant_motifs],
            "partial",
            "seqcor",
            "mean",
        )
        result = result[motif]
        redundant_motifs.update(m for m in result if result[m][0] >= 0.7)
        # Guarantees progress even if the motif is not scored against itself.
        redundant_motifs.add(motif)

    logger.debug(f"Selected {len(keep)} motifs for feature elimination")

    # Read motif scan results
    fg_table = pd.read_csv(fg_table, index_col=0, comment="#", sep="\t")
    bg_table = pd.read_csv(bg_table, index_col=0, comment="#", sep="\t")

    X = pd.concat((fg_table, bg_table), axis=0)
    y = np.hstack((np.ones(fg_table.shape[0]), np.zeros(bg_table.shape[0])))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=2, shuffle=True,
    )

    X_keep = X_train[keep]

    model = LogisticRegression(solver="liblinear", max_iter=500, penalty="l1")

    # Reference score using all kept motifs.
    max_score = np.mean(
        cross_val_score(model, X_keep, y_train, cv=5, scoring="average_precision")
    )
    mean_scores = []

    logger.info("selecting non-redundant motifs")
    # n_features follows the last accepted step, so a run that never hits
    # the stop criterion no longer falls back to a single feature.
    n_features = 1
    for i in range(1, X_keep.shape[1]):
        # n_features_to_select is keyword-only in current scikit-learn.
        fit = RFE(model, n_features_to_select=i).fit(X_keep, y_train)
        mean_score = np.mean(
            cross_val_score(
                model,
                X_keep.loc[:, fit.support_],
                y_train,
                cv=5,
                scoring="average_precision",
            )
        )
        if i > 1 and mean_score - mean_scores[-1] < (max_score * tolerance):
            break
        mean_scores.append(mean_score)
        n_features = i

    fit = RFE(model, n_features_to_select=n_features).fit(X_keep, y_train)
    selected_features = X_keep.columns[fit.support_]

    # Report the performance of the final selection on the held-out data.
    model.fit(X_train.loc[:, selected_features], y_train)
    y_pred = model.predict_proba(X_test.loc[:, selected_features])[:, 1]
    pr_auc = average_precision_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    logger.info(
        f"selected {len(selected_features)} non-redundant motifs: ROC AUC {roc_auc:.3f}, PR AUC {pr_auc:.3f}"
    )
    return selected_features
def _create_graphical_report(inputfile, pwm, background, closest_match, outdir, stats, best_id=None):
    """Create main gimme_motifs output html report.

    Parameters
    ----------
    inputfile : str
        Passed to the template as ``inputfile``.
    pwm : str
        Motif file, read with ``fmt="pfm"``.
    background : dict
        Keyed by background name; the keys select the per-background
        statistics and are passed to the template as ``bg_types``.
    closest_match : dict
        Maps a motif id to a pair whose first item is the id of the closest
        database motif and whose second item is a sequence ending in the
        value reported as p-value.
    outdir : str
        Output directory; receives ``gimme.denovo.html`` and ``images/``.
    stats : dict
        ``stats[str(motif)][bg]`` is a dict of statistics for that motif
        and background.
    best_id : dict, optional
        Currently unused (see the TODO in the body).
    """
    if best_id is None:
        best_id = {}

    logger.debug("Creating graphical report")

    class ReportMotif(object):
        """Placeholder for motif stats."""

    config = MotifConfig()

    imgdir = os.path.join(outdir, "images")
    # makedirs also works when `outdir` itself does not exist yet.
    os.makedirs(imgdir, exist_ok=True)

    motifs = read_motifs(pwm, fmt="pfm")

    roc_img_file = "%s_roc.%s"

    dbpwm = config.get_default_params()["motif_db"]
    pwmdir = config.get_motif_dir()
    dbmotifs = read_motifs(os.path.join(pwmdir, dbpwm), as_dict=True)

    report_motifs = []
    for motif in motifs:
        rm = ReportMotif()
        rm.id = motif.id
        rm.id_href = {"href": "#%s" % motif.id}
        rm.id_name = {"name": motif.id}
        rm.img = {"src": os.path.join("images", "%s.png" % motif.id)}
        motif.plot_logo(
            fname=os.path.join(outdir, "images/{}.png".format(motif.id))
        )

        # TODO: fix best ID
        rm.best = "Gimme"  # best_id[motif.id]

        rm.consensus = motif.to_consensus()

        # A single lookup serves both the stars and the per-background
        # numbers; a missing entry raises KeyError here instead of a
        # confusing AttributeError on None further down.
        motif_stats = stats[str(motif)]
        # Mean number of stars over all backgrounds, rounded half up.
        rm.stars = int(
            np.mean([motif_stats[bg].get("stars", 0) for bg in background]) + 0.5
        )

        rm.bg = {}
        for bg in background:
            this_stats = motif_stats[bg]
            roc_png = (
                "images/" + os.path.basename(roc_img_file % (motif.id, bg)) + ".png"
            )
            rm.bg[bg] = {}
            # TODO: fix these stats
            rm.bg[bg]["e"] = "%0.2f" % this_stats.get("enr_at_fpr", 1.0)
            rm.bg[bg]["p"] = "%0.2f" % this_stats.get("phyper_at_fpr", 1.0)
            rm.bg[bg]["auc"] = "%0.3f" % this_stats.get("roc_auc", 0.5)
            rm.bg[bg]["mncp"] = "%0.3f" % this_stats.get("mncp", 1.0)
            rm.bg[bg]["roc_img"] = {"src": roc_png}
            rm.bg[bg]["roc_img_link"] = {"href": roc_png}

        rm.histogram_img = {"data": "images/%s_histogram.svg" % motif.id}
        rm.histogram_link = {"href": "images/%s_histogram.svg" % motif.id}

        match_id = closest_match[motif.id][0]
        dbmotifs[match_id].plot_logo(
            fname=os.path.join(outdir, "images/{}.png".format(match_id))
        )

        rm.match_img = {"src": "images/{}.png".format(match_id)}
        rm.match_id = match_id
        rm.match_pval = "%0.2e" % closest_match[motif.id][1][-1]

        report_motifs.append(rm)

    total_report = os.path.join(outdir, "gimme.denovo.html")

    star_img = os.path.join(config.get_template_dir(), "star.png")
    shutil.copyfile(star_img, os.path.join(outdir, "images", "star.png"))

    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader([config.get_template_dir()])
    )
    template = env.get_template("report_template.jinja.html")
    # TODO: title
    result = template.render(
        motifs=report_motifs,
        inputfile=inputfile,
        date=datetime.today().strftime("%d/%m/%Y"),
        version=__version__,
        bg_types=list(background.keys()),
    )

    with open(total_report, "wb") as f:
        f.write(result.encode("utf-8"))
def maelstrom_html_report(outdir, infile, pfmfile=None, threshold=2):
    """Create a sortable HTML report from a maelstrom result table.

    Parameters
    ----------
    outdir : str
        Directory that receives ``gimme.maelstrom.report.html`` and a
        ``logos`` sub-directory with one PNG per reported motif.
    infile : str
        Tab-separated table; the first column is used as index (motif ids)
        and the remaining columns hold numeric scores.
    pfmfile : str, optional
        Motif file passed to ``read_motifs``.
    threshold : float, optional
        A motif is reported only if the absolute value of at least one of
        its scores reaches this value.
    """
    df = pd.read_table(infile, index_col=0)
    # Keep rows where at least one column reaches the threshold.
    df = df[np.any(abs(df) >= threshold, 1)]
    # `del df.index.name` is rejected by recent pandas; assigning None
    # clears the name on every version.
    df.index.name = None
    cols = df.columns

    # The motif file is parsed once (the original parsed it twice).
    motifs = read_motifs(pfmfile)
    idx = [motif.id for motif in motifs]
    direct = [
        ",".join(sorted(set([x.upper() for x in motif.factors[DIRECT_NAME]])))
        for motif in motifs
    ]
    indirect = [
        ",".join(sorted(set([x.upper() for x in motif.factors[INDIRECT_NAME]])))
        for motif in motifs
    ]
    m2f = pd.DataFrame({DIRECT_NAME: direct, INDIRECT_NAME: indirect}, index=idx)

    factor_cols = [DIRECT_NAME, INDIRECT_NAME]
    for factor_col in factor_cols:
        # Show at most 30 characters; the full list goes in the tooltip.
        too_long = m2f[factor_col].str.len() > 30
        m2f[factor_col] = (
            '<div title="' + m2f[factor_col] + '">'
            + m2f[factor_col].str.slice(0, 30)
        )
        m2f.loc[too_long, factor_col] += "(...)"
        m2f[factor_col] += "</div>"

    df = df.join(m2f)

    df["logo"] = [
        '<img src="logos/{}.png" height=40/>'.format(re.sub("[()/]", "_", x))
        for x in list(df.index)
    ]

    os.makedirs(outdir + "/logos", exist_ok=True)
    for motif in motifs:
        if motif.id in df.index:
            motif.plot_logo(
                fname=outdir
                + "/logos/{}.png".format(re.sub("[()/]", "_", motif.id))
            )

    template_dir = MotifConfig().get_template_dir()
    with open(
        os.path.join(template_dir, "sortable/sortable.min.js"), encoding="utf-8"
    ) as handle:
        js = handle.read()
    with open(
        os.path.join(template_dir, "sortable/sortable-theme-slick.css"),
        encoding="utf-8",
    ) as handle:
        css = handle.read()

    df = df[factor_cols + ["logo"] + list(cols)]

    df_styled = df.style
    absmax = np.max((abs(df[cols].max().max()), abs(df[cols].min().min())))
    target = absmax * 1.75

    for col in cols:
        # Pad each column's colour range so that all columns share the
        # same symmetric scale of [-target, target].
        smin = df[col].min()
        smax = df[col].max()
        diff = smax - smin
        low = abs((-target - smin) / diff)
        high = (target - smax) / diff
        df_styled = df_styled.background_gradient(
            cmap="RdBu_r", low=low, high=high, subset=[col]
        )

    df_styled = df_styled.set_precision(3)
    df_styled = df_styled.set_table_attributes("data-sortable")
    df_styled = df_styled.render()
    df_styled = df_styled.replace(
        "data-sortable", 'class="sortable-theme-slick" data-sortable'
    )

    with open(outdir + "/gimme.maelstrom.report.html", "w", encoding="utf-8") as f:
        f.write("<head>\n")
        f.write("<style>{}</style>\n".format(css))
        f.write("</head>\n")
        f.write("<body>\n")
        f.write(df_styled)
        f.write("<script>{}</script>\n".format(js))
        f.write("</body>\n")
def roc_html_report(
    outdir,
    infile,
    pfmfile,
    outname="gimme.motifs.html",
    threshold=0.01,
    use_motifs=None,
    link_matches=False,
):
    """Create a sortable HTML report of motif enrichment statistics.

    Parameters
    ----------
    outdir : str
        Output directory; receives `outname` and a ``logos`` sub-directory.
    infile : str
        Tab-separated statistics table with motif ids in the first column.
        It must contain a "P-value" column and the metric columns shown in
        the report.
    pfmfile : str
        Motif file passed to ``read_motifs``.
    outname : str, optional
        File name of the report inside `outdir`.
    threshold : float, optional
        Only motifs with a Benjamini-Hochberg corrected p-value at or
        below this value are reported.
    use_motifs : container of str, optional
        If given, only motifs whose id is in this container are reported.
    link_matches : bool, optional
        If True, the "# matches" cell links to
        ``motif_scan_results/<motif id>.matches.bed``.
    """
    df = pd.read_table(infile, index_col=0)
    # `del df.index.name` is rejected by recent pandas; assigning None
    # clears the name on every version.
    df.index.name = None
    df["corrected P-value"] = multipletests(df["P-value"], method="fdr_bh")[1]

    cols = [
        "logo",
        "# matches",
        "# matches background",
        "P-value",
        "log10 P-value",
        "corrected P-value",
        "ROC AUC",
        "PR AUC",
        "Enr. at 1% FPR",
        "Recall at 10% FDR",
    ]

    motifs = read_motifs(pfmfile)
    if use_motifs is not None:
        motifs = [m for m in motifs if m.id in use_motifs]

    idx = [motif.id for motif in motifs]
    df = df.loc[idx]
    direct = [",".join(motif.factors[DIRECT_NAME]) for motif in motifs]
    indirect = [",".join(motif.factors[INDIRECT_NAME]) for motif in motifs]
    m2f = pd.DataFrame({DIRECT_NAME: direct, INDIRECT_NAME: indirect}, index=idx)

    factor_cols = [DIRECT_NAME, INDIRECT_NAME]
    for factor_col in factor_cols:
        # Show at most 30 characters; the full list goes in the tooltip.
        too_long = m2f[factor_col].str.len() > 30
        m2f[factor_col] = (
            '<div title="' + m2f[factor_col] + '">'
            + m2f[factor_col].str.slice(0, 30)
        )
        m2f.loc[too_long, factor_col] += "(...)"
        m2f[factor_col] += "</div>"
    df = df.join(m2f)
    cols = factor_cols + cols

    df = df[df["corrected P-value"] <= threshold]

    if link_matches:
        df["# matches"] = (
            "<a href=motif_scan_results/"
            + df.index.to_series()
            + ".matches.bed>"
            + df["# matches"].astype(str)
            + "</a>"
        )

    df["logo"] = [
        '<img src="logos/{}.png" height=40/>'.format(re.sub(r"[^-_\w]+", "_", x))
        for x in list(df.index)
    ]

    df = df[cols]
    os.makedirs(outdir + "/logos", exist_ok=True)
    for motif in motifs:
        if motif.id in df.index:
            motif.plot_logo(
                fname=outdir
                + "/logos/{}.png".format(re.sub(r"[^-_\w]+", "_", motif.id))
            )

    # Only columns present in the table can get a bar; "MNCP" is not among
    # the reported columns and made the styling fail with a KeyError.
    bar_cols = [
        col
        for col in (
            "log10 P-value",
            "ROC AUC",
            "PR AUC",
            "MNCP",
            "Enr. at 1% FPR",
            "Recall at 10% FDR",
        )
        if col in df.columns
    ]

    template_dir = MotifConfig().get_template_dir()
    with open(
        os.path.join(template_dir, "sortable/sortable.min.js"), encoding="utf-8"
    ) as handle:
        js = handle.read()
    with open(
        os.path.join(template_dir, "sortable/sortable-theme-slick.css"),
        encoding="utf-8",
    ) as handle:
        css = handle.read()

    with open(os.path.join(outdir, outname), "w", encoding="utf-8") as f:
        f.write("<head>\n")
        f.write("<style>{}</style>\n".format(css))
        f.write("</head>\n")
        f.write("<body>\n")
        if df.shape[0] > 0:
            f.write(
                df.sort_values("ROC AUC", ascending=False)
                .style.bar(subset=bar_cols)
                .set_precision(3)
                .set_table_attributes("data-sortable")
                .render()
                .replace(
                    "data-sortable", 'class="sortable-theme-slick" data-sortable'
                )
            )
        else:
            f.write("No enriched motifs found.")
        f.write("<script>{}</script>\n".format(js))
        f.write("</body>\n")
def cluster_motifs_with_report(infile, outfile, outdir, threshold, title=None):
    """Cluster motifs, draw their logos and write an HTML cluster report.

    Parameters
    ----------
    infile : str
        Filename of the motifs to cluster (read with ``fmt="pwm"``).
    outfile : str
        Output filename for the clustered motifs.
    outdir : str
        Output directory; receives ``cluster_report.html`` and ``images/``.
    threshold : float or str
        Clustering threshold; converted with ``float()``.
    title : str, optional
        Shown in the report as input file; defaults to `infile`.

    Returns
    -------
    clusters : list
        Pairs of (cluster motif, list of member motifs); empty when
        `infile` contains no motifs.
    """
    # Cluster significant motifs
    if title is None:
        title = infile

    motifs = read_motifs(infile, fmt="pwm")

    trim_ic = 0.2
    clusters = []
    if len(motifs) == 0:
        return []
    elif len(motifs) == 1:
        clusters = [[motifs[0], motifs]]
    else:
        logger.info("clustering %d motifs.", len(motifs))
        tree = cluster_motifs(
            infile,
            "total",
            "wic",
            "mean",
            True,
            threshold=float(threshold),
            include_bg=True,
            progress=False,
        )
        clusters = tree.getResult()

    ids = []
    mc = MotifComparer()

    img_dir = os.path.join(outdir, "images")
    # makedirs also works when `outdir` itself does not exist yet.
    os.makedirs(img_dir, exist_ok=True)

    for cluster, members in clusters:
        cluster.trim(trim_ic)
        png = "images/{}.png".format(cluster.id)
        cluster.to_img(os.path.join(outdir, png), fmt="PNG")
        ids.append([cluster.id, {"src": png}, []])
        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(
                    cluster, motif, "total", "wic", "mean", pval=True
                )
            # Smallest position among the members; every logo is shifted
            # relative to it.
            add_pos = min(scores.values(), key=lambda x: x[1])[1]
            for motif in members:
                score, pos, strand = scores[motif]
                add = pos - add_pos

                if strand not in [1, "+"]:
                    # Draw the reverse complement under the original id.
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc
                png = "images/{}.png".format(motif.id.replace(" ", "_"))
                motif.to_img(os.path.join(outdir, png), fmt="PNG", add_left=add)
        ids[-1][2] = [
            dict(
                [
                    ("src", "images/{}.png".format(motif.id.replace(" ", "_"))),
                    ("alt", motif.id.replace(" ", "_")),
                ]
            )
            for motif in members
        ]

    config = MotifConfig()
    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader([config.get_template_dir()])
    )
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(
        motifs=ids,
        inputfile=title,
        date=datetime.today().strftime("%d/%m/%Y"),
        version=__version__,
    )

    cluster_report = os.path.join(outdir, "cluster_report.html")
    with open(cluster_report, "wb") as f:
        f.write(result.encode("utf-8"))

    # The context manager closes the file even if writing a motif fails.
    with open(outfile, "w") as f:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())

    logger.debug("Clustering done. See the result in %s", cluster_report)
    return clusters
def command_scan(
    inputfile,
    pfmfile,
    nreport=1,
    fpr=0.01,
    cutoff=None,
    bed=False,
    scan_rc=True,
    table=False,
    score_table=False,
    moods=False,
    pvalue=None,
    bgfile=None,
    genome=None,
    ncpus=None,
    zscore=False,
    gcnorm=False,
):
    """Scan `inputfile` with the motifs in `pfmfile` and yield result rows.

    This is a generator. The output comes from ``scan_table`` when `table`
    is set, from ``scan_score_table`` when `score_table` is set, and from
    ``scan_normal`` otherwise. A threshold (from `fpr` or `cutoff`) is only
    set on the scanner when no score table is requested.
    """
    motifs = read_motifs(pfmfile)
    fa = as_fasta(inputfile, genome)

    # Configure the scanner.
    scanner = Scanner(ncpus=ncpus)
    scanner.set_motifs(pfmfile)

    if genome:
        scanner.set_genome(genome=genome)
        scanner.set_background(
            genome=genome, fname=bgfile, size=fa.median_length(), gc=gcnorm
        )
    if bgfile:
        scanner.set_background(
            genome=genome, fname=bgfile, size=fa.median_length()
        )

    if not score_table:
        scanner.set_threshold(fpr=fpr, threshold=cutoff)

    if table:
        rows = scan_table(
            scanner, inputfile, fa, motifs, cutoff, bgfile, nreport, scan_rc,
            pvalue, moods,
        )
    elif score_table:
        rows = scan_score_table(
            scanner, fa, motifs, scan_rc, zscore=zscore, gcnorm=gcnorm
        )
    else:
        rows = scan_normal(
            scanner,
            inputfile,
            fa,
            motifs,
            cutoff,
            bgfile,
            nreport,
            scan_rc,
            pvalue,
            moods,
            bed,
            zscore=zscore,
            gcnorm=gcnorm,
        )

    yield from rows
m2f = {} fnames = glob.glob(os.path.join(m2f_dir, "*.motif2factors.txt")) for fname in fnames: with open(fname) as f: for line in f: vals = line.strip().split("\t") if len(vals) == 4: m2f[vals[0]] = m2f.get(vals[0], []) + [vals[1:]] #print(m2f) # Read factor to family mapping from the CIS-BP databse anno = pd.read_table(tf_info) anno = anno[["TF_Name", "Family_Name"]].drop_duplicates().set_index("TF_Name") # read motifs motifs = dict([(m.id, m) for m in read_motifs(open(pfmfile))]) df_cluster = pd.read_table(clusterfile) ic_cutoff = 5 mc = MotifComparer() id_count = {} df = df_cluster.loc[k] sys.stderr.write(str(k) + "\n") seen_line = {} with open("{}.pfm".format(outname), "w") as out: with open("{}.motif2factors.txt".format(outname), "w") as m2f_out: print("Motif\tFactor\tEvidence\tCurated", file=m2f_out) for cluster in range(k): if cluster % 10 == 0: sys.stderr.write("{}\n".format(cluster)) out.flush()
def motifs(args):
    """Run the motif analysis: scan, compute statistics and write reports.

    Steps, in order:

    1. prepare the input (equal-sized BED file where applicable) and a
       background FASTA file unless an existing file is given;
    2. optionally run de novo motif prediction and combine the result with
       the known motifs;
    3. write best-score tables for input and background and compute the
       per-motif statistics into ``gimme.roc.report.txt``;
    4. select non-redundant motifs and write a BED file with matches for
       each of them;
    5. optionally create the HTML reports.

    Parameters
    ----------
    args : argparse.Namespace-like
        Reads ``outdir``, ``sample``, ``size``, ``genome``, ``background``,
        ``pfmfile``, ``known``, ``denovo``, ``tools``, ``analysis``,
        ``ncpus`` and ``report``. ``zscore`` and ``gc`` are set to False
        when no genome is given.

    Raises
    ------
    ValueError
        If ``args.outdir`` is None.
    """
    if args.outdir is None:
        raise ValueError("an output directory is required!")
    os.makedirs(args.outdir, exist_ok=True)
    scan_dir = os.path.join(args.outdir, "motif_scan_results")
    os.makedirs(scan_dir, exist_ok=True)

    report_file = args.outdir + "/gimme.roc.report.txt"

    file_type = determine_file_type(args.sample)
    outfile = os.path.join(args.outdir, f"input.w{args.size}.bed")
    sample = args.sample
    if file_type == "narrowpeak":
        narrowpeak_to_bed(args.sample, outfile, size=args.size)
        sample = outfile
    elif args.size and args.size > 0:
        if file_type == "fasta":
            logger.warning("size parameter will be ignored for FASTA input")
        elif file_type == "bed":
            write_equalsize_bedfile(args.sample, args.size, outfile)
            sample = outfile

    genome = args.genome
    if genome is None:
        # NOTE(review): these flags are reset here, but the scans below
        # pass zscore=True / gcnorm=True unconditionally - confirm intent.
        args.zscore = False
        args.gc = False

    bg = args.background
    if bg is None:
        bg = "random" if genome is None else "gc"

    if os.path.isfile(bg):
        bgfile = bg
        bg = "custom"
    else:
        # create background if not provided
        bgfile = os.path.join(args.outdir, "generated_background.{}.fa".format(bg))
        # A missing or non-positive size means "no fixed size"; the
        # original comparison raised TypeError when args.size was None.
        size = args.size if args.size and args.size > 0 else None
        if bg == "gc":
            logger.info("creating background (matched GC%)")
        else:
            logger.info("creating background (random)")

        create_background_file(
            bgfile,
            bg,
            fmt="fasta",
            genome=genome,
            inputfile=sample,
            size=size,
            number=10000,
        )

    pfmfile = args.pfmfile

    motifs = []
    if args.known:
        motifs = read_motifs(pfmfile, fmt="pfm")

    if args.denovo:
        gimme_motifs(
            sample,
            args.outdir,
            params={
                "tools": args.tools,
                "analysis": args.analysis,
                "background": bg,
                "custom_background": bgfile,
                "genome": args.genome,
                "size": args.size,
            },
        )
        denovo = read_motifs(os.path.join(args.outdir, "gimme.denovo.pfm"))
        mc = MotifComparer()
        result = mc.get_closest_match(denovo, dbmotifs=pfmfile, metric="seqcor")
        match_motifs = read_motifs(pfmfile, as_dict=True)
        new_map_file = os.path.join(args.outdir, "combined.motif2factors.txt")
        base = os.path.splitext(pfmfile)[0]
        map_file = base + ".motif2factors.txt"
        if os.path.exists(map_file):
            shutil.copyfile(map_file, new_map_file)

        # From here on the combined (known + de novo) motif file is used.
        motifs += denovo
        pfmfile = os.path.join(args.outdir, "combined.pfm")
        with open(pfmfile, "w") as f:
            for m in motifs:
                print(m.to_pwm(), file=f)

        with open(new_map_file, "a") as f:
            for m in denovo:
                print(
                    "{}\t{}\t{}\t{}".format(m.id, "de novo", "GimmeMotifs", "Y"),
                    file=f,
                )
                # De novo motifs inherit the direct factors of their
                # closest database match.
                if result[m.id][0] in match_motifs:
                    for factor in match_motifs[result[m.id][0]].factors["direct"]:
                        print(
                            "{}\t{}\t{}\t{}".format(
                                m.id, factor, "inferred (GimmeMotifs)", "N"
                            ),
                            file=f,
                        )
    else:
        logger.info("skipping de novo")

    stats = [
        "phyper_at_fpr",
        "roc_auc",
        "pr_auc",
        "enr_at_fpr",
        "recall_at_fdr",
        "roc_values",
        "matches_at_fpr",
    ]

    logger.info("creating motif scan tables")

    # Create a table with the best score per motif for all motifs.
    # This has three reasons:
    # * Can be used to calculate statistics;
    # * Can be used to select a set of non-redundant motifs;
    # * These files are included in the output and can be used for further analysis.
    score_table = os.path.join(scan_dir, "input.motif.score.txt")
    bg_score_table = os.path.join(scan_dir, "background.motif.score.txt")
    for infile, outfile in [(sample, score_table), (bgfile, bg_score_table)]:
        scan_to_file(
            infile,
            pfmfile,
            filepath_or_buffer=outfile,
            score_table=True,
            genome=args.genome,
            zscore=True,
            gcnorm=True,
        )

    n_input = pd.read_csv(score_table, comment="#", sep="\t").shape[0]
    n_background = pd.read_csv(bg_score_table, comment="#", sep="\t").shape[0]

    logger.info("calculating stats")
    # The report file is closed even when computing the statistics fails.
    with open(report_file, "w") as f_out:
        # Print the metrics
        f_out.write(
            "Motif\t# matches\t% matches input\t# matches background\t%matches background\tP-value\tlog10 P-value\tROC AUC\tPR AUC\tEnr. at 1% FPR\tRecall at 10% FDR\n"
        )
        for motif_stats in calc_stats_iterator(
            motifs=pfmfile,
            fg_table=score_table,
            bg_table=bg_score_table,
            stats=stats,
            ncpus=args.ncpus,
        ):
            for motif in motifs:
                if str(motif) not in motif_stats:
                    continue
                this_stats = motif_stats[str(motif)]
                # A p-value of 0 cannot be log-transformed; report infinity.
                log_pvalue = np.inf
                if this_stats["phyper_at_fpr"] > 0:
                    log_pvalue = -np.log10(this_stats["phyper_at_fpr"])
                f_out.write(
                    "{}\t{:d}\t{:.3f}\t{:d}\t{:.3f}\t{:.2e}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.2f}\t{:0.4f}\n".format(
                        motif.id,
                        this_stats["matches_at_fpr"][0],
                        this_stats["matches_at_fpr"][0] / n_input * 100,
                        this_stats["matches_at_fpr"][1],
                        this_stats["matches_at_fpr"][1] / n_background * 100,
                        this_stats["phyper_at_fpr"],
                        log_pvalue,
                        this_stats["roc_auc"],
                        this_stats["pr_auc"],
                        this_stats["enr_at_fpr"],
                        this_stats["recall_at_fdr"],
                    )
                )

    # Select a set of "non-redundant" motifs.
    # Using Recursive Feature Elimination, a set of motifs is selected that
    # best explains the peaks in comparison to the background sequences.
    nr_motifs = select_nonredundant_motifs(
        report_file,
        pfmfile,
        score_table,
        bg_score_table,
        tolerance=0.001,
    )

    # Provide BED files with motif scan results for the non-redundant motifs
    # At the moment this is not ideal, as scanning is now performed twice
    # for this set of non-redundant motifs.
    motif_dict = {m.id: m for m in motifs}
    for motif in nr_motifs:
        with NamedTemporaryFile(mode="w") as f:
            print(motif_dict[motif].to_pwm(), file=f)
            f.flush()
            safe_name = re.sub(r"[^a-zA-Z0-9\-]+", "_", motif)
            scan_to_file(
                sample,
                f.name,
                filepath_or_buffer=os.path.join(scan_dir, f"{safe_name}.matches.bed"),
                bed=True,
                fpr=0.01,
                genome=args.genome,
                zscore=True,
                gcnorm=True,
            )

    if args.report:
        logger.info("creating statistics report")
        roc_html_report(
            args.outdir,
            report_file,
            pfmfile,
            threshold=0.01,
            outname="gimme.motifs.redundant.html",
            link_matches=False,
        )
        roc_html_report(
            args.outdir,
            report_file,
            pfmfile,
            threshold=0.01,
            use_motifs=nr_motifs,
            link_matches=True,
        )
        logger.info(
            f"gimme motifs final report: {os.path.join(args.outdir, 'gimme.motifs.html')}"
        )
def best_motif_in_cluster(
    single_pwm,
    clus_pwm,
    clusters,
    fg_fa,
    background,
    genome,
    stats=None,
    metrics=("roc_auc", "recall_at_fdr"),
):
    """Return the best motif per cluster for a clustering results.

    The motif can be either the average motif or one of the clustered motifs.

    Parameters
    ----------
    single_pwm : str
        Filename of motifs.

    clus_pwm : str
        Filename of motifs.

    clusters :
        Motif clustering result.

    fg_fa : str
        Filename of FASTA file.

    background : dict
        Dictionary for background file names.

    genome : str
        Genome name.

    stats : dict, optional
        If statistics are not supplied they will be computed. A supplied
        dictionary is updated in place with the new statistics and with a
        "num_cluster" entry for every best motif.

    metrics : sequence, optional
        Metrics to use for motif evaluation. Default are "roc_auc" and
        "recall_at_fdr".

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    # The default used to crash on the first `in stats` test.
    if stats is None:
        stats = {}

    # combine original and clustered motifs
    motifs = read_motifs(single_pwm) + read_motifs(clus_pwm)
    motifs = {str(m): m for m in motifs}

    # get the statistics for those motifs that were not yet checked
    clustered_motifs = []
    for clus, singles in clusters:
        for motif in set([clus] + singles):
            if str(motif) not in stats:
                clustered_motifs.append(motifs[str(motif)])

    new_stats = {}
    for bg, bg_fa in background.items():
        for m, s in calc_stats(
            fg_file=fg_fa, bg_file=bg_fa, motifs=clustered_motifs, genome=genome
        ).items():
            if m not in new_stats:
                new_stats[m] = {}
            new_stats[m][bg] = s
    stats.update(new_stats)

    rank = rank_motifs(stats, metrics)

    # rank the motifs
    best_motifs = []
    for clus, singles in clusters:
        if len(singles) > 1:
            # Work on a copy: appending to `singles` itself changed the
            # caller's clustering and inflated "num_cluster" by one.
            eval_motifs = list(singles)
            if clus not in motifs:
                eval_motifs.append(clus)
            eval_motifs = [motifs[str(e)] for e in eval_motifs]
            best_motif = sorted(eval_motifs, key=lambda x: rank[str(x)])[-1]
            best_motifs.append(best_motif)
        else:
            best_motifs.append(clus)
        for bg in background:
            stats[str(best_motifs[-1])][bg]["num_cluster"] = len(singles)

    best_motifs = sorted(best_motifs, key=lambda x: rank[str(x)], reverse=True)
    return best_motifs
def run_full_analysis(self, inputfile, user_params=None):
    """Run the full analysis: from input file to motifs and report.

    Covers input preparation, background creation, motif prediction,
    statistics, clustering, ROC and location plots and the html/text report.

    Parameters
    ----------
    inputfile : str
        Input file. It is treated as FASTA when ``Fasta(inputfile)`` can
        load it and as BED otherwise.

    user_params : dict, optional
        Parameters that override the defaults returned by
        ``self.config.get_default_params()``.

    Returns
    -------
    str or None
        ``self.motif_report`` when the report was created, ``None`` when no
        significant motifs were found. The process exits via ``sys.exit``
        on invalid input/backgrounds or when no motifs are predicted.
    """
    self.logger.info("starting full motif analysis")
    self.logger.debug("Using temporary directory {0}".format(mytmpdir()))

    if user_params is None:
        user_params = {}
    params = self.config.get_default_params()
    params.update(user_params)

    if params["torque"]:
        from gimmemotifs.prediction_torque import pp_predict_motifs, PredictionResult
        self.logger.debug("Using torque")
    else:
        from gimmemotifs.prediction import pp_predict_motifs, PredictionResult
        self.logger.debug("Using multiprocessing")

    self.params = params
    background = [x.strip() for x in params["background"].split(",")]

    self.logger.debug("Parameters:")
    for param, value in params.items():
        self.logger.debug(" %s: %s", param, value)

    # Checking input
    self.input_type = "BED"
    # If we can load it as fasta then it is a fasta
    try:
        Fasta(inputfile)
        self.logger.debug("Inputfile is a FASTA file")
        self.input_type = "FASTA"
    except Exception:
        # Deliberate best-effort probe: anything that is not loadable as
        # FASTA is handled as BED.
        pass

    index_msg = ("No index found for genome {}! "
                 "Has GimmeMotifs been configured correctly and is the "
                 "genome indexed?").format(params["genome"])
    index_dir = os.path.join(self.config.get_index_dir(), params["genome"])

    if self.input_type == "FASTA":
        for bg in background:
            if bg not in FA_VALID_BGS:
                self.logger.info(
                    "Input type is FASTA, can't use background type '%s'", bg)
            if bg == "genomic":
                if not os.path.exists(index_dir):
                    self.logger.error(index_msg)
                    sys.exit(1)
        background = [bg for bg in background if bg in FA_VALID_BGS]

    elif self.input_type == "BED":
        # Does the index_dir exist?  #bed-specific
        if not os.path.exists(index_dir):
            self.logger.error(index_msg)
            sys.exit(1)

        # is it a valid bed-file etc.
        self._check_input(inputfile)  # bed-specific

        # Check for valid background
        for bg in background:
            if bg not in BED_VALID_BGS:
                self.logger.info(
                    "Input type is BED, can't use background type '%s'", bg)
        background = [bg for bg in background if bg in BED_VALID_BGS]

    if len(background) == 0:
        self.logger.error("No valid backgrounds specified!")
        sys.exit(1)

    # Maximum time?
    self.max_time = None
    max_time = None
    if params["max_time"]:
        try:
            max_time = float(params["max_time"])
        except (TypeError, ValueError):
            self.logger.debug(
                "Could not parse max_time value, setting to no limit")
            max_time = None

        # Only compare when parsing succeeded; None > 0 raises on Python 3.
        if max_time is not None:
            if max_time > 0:
                self.logger.debug(
                    "Time limit for motif prediction: %0.2f hours" % max_time)
                max_time = 3600 * max_time
                self.max_time = max_time
                self.logger.debug(
                    "Max_time in seconds %0.0f" % self.max_time)
            else:
                self.logger.debug(
                    "Invalid time limit for motif prediction, setting to no limit"
                )
                self.max_time = None
    else:
        self.logger.debug("No time limit for motif prediction")

    if "random" in background:
        self.markov_model = params["markov_model"]

    # Create the necessary files for motif prediction and validation
    if self.input_type == "BED":
        self.prepare_input_bed(inputfile, params["genome"], params["width"],
                               params["fraction"], params["abs_max"],
                               params["use_strand"])

        # Create file for location plots
        index_dir = os.path.join(self.config.get_index_dir(),
                                 params["genome"])
        lwidth = int(params["lwidth"])
        width = int(params["width"])
        # Floor division keeps the extension an integer number of bases.
        extend = (lwidth - width) // 2

        genome_index.track2fasta(index_dir,
                                 self.validation_bed,
                                 self.location_fa,
                                 extend_up=extend,
                                 extend_down=extend,
                                 use_strand=params["use_strand"],
                                 ignore_missing=True)

    elif self.input_type == "FASTA":
        self.prepare_input_fa(inputfile, params["width"], params["fraction"],
                              params["abs_max"])

        # File for location plots
        self.location_fa = self.validation_fa
        fa = Fasta(self.location_fa)
        seqs = fa.seqs
        lwidth = len(seqs[0])
        all_same_width = all(len(seq) == lwidth for seq in seqs)
        if not all_same_width:
            self.logger.warning(
                "PLEASE NOTE: FASTA file contains sequences of different lengths. Positional preference plots will be incorrect!"
            )

    else:
        self.logger.error("Unknown input type, shouldn't happen")
        sys.exit(1)

    # Map every available tool to whether it was selected. Both sides are
    # stripped so "a, b" style lists match regardless of spacing.
    selected_tools = {t.strip() for t in params["tools"].split(",")}
    tools = {}
    for tool in params["available_tools"].split(","):
        tool = tool.strip()
        tools[tool] = tool in selected_tools

    self.create_background(background, params["genome"], params["width"])

    # Predict the motifs, input is a FASTA-file
    analysis = params["analysis"]
    self.logger.info("starting motif prediction (%s)", analysis)
    self.logger.info("tools: %s",
                     ", ".join([x for x in tools.keys() if tools[x]]))

    # Background with the lowest BG_RANK is used for significance; min()
    # returns the first of equals, like a stable sort followed by [0].
    bg_file = self.bg_file["fa"][min(background, key=lambda x: BG_RANK[x])]
    self.logger.debug("Using bg_file %s for significance" % bg_file)
    result = pp_predict_motifs(self.prediction_fa,
                               self.predicted_pfm,
                               analysis,
                               params["genome"],
                               params["use_strand"],
                               self.prediction_bg,
                               tools,
                               self.job_server(),
                               logger=self.logger,
                               max_time=self.max_time,
                               fg_file=self.validation_fa,
                               bg_file=bg_file)

    motifs = result.motifs
    self.logger.info("predicted %s motifs", len(motifs))
    self.logger.debug("written to %s", self.predicted_pfm)

    if len(motifs) == 0:
        self.logger.info("no motifs found")
        sys.exit()

    # Write stats output to file
    stat_keys = list(next(iter(result.stats.values())).keys())
    with open(self.stats_file, "w") as f:
        f.write("%s\t%s\n" % ("Motif", "\t".join(stat_keys)))

        self.logger.debug(result.stats)

        # Iterate over a copy: motifs without stats are removed from the
        # original list, which would otherwise skip the following element.
        for motif in list(motifs):
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if stats:
                f.write("%s\t%s\n" %
                        (motif.id,
                         "\t".join([str(stats[k]) for k in stat_keys])))
            else:
                self.logger.error(
                    "No stats for motif {0}, skipping this motif!".format(
                        motif.id))
                motifs.remove(motif)

    self.motifs_with_stats = motifs

    # Per-tool best value and rank for a few metrics; the tool name is the
    # part of the motif id before the first underscore.
    tool_names = list(dict((m.id.split("_")[0], 1) for m in motifs))
    with open(self.ranks_file, "w") as f:
        f.write("Metric\tType\t%s\n" % ("\t".join(tool_names)))
        for stat in ["mncp", "roc_auc", "maxenr"]:
            best_motif = {}
            for motif in self.motifs_with_stats:
                val = result.stats["%s_%s" % (motif.id,
                                              motif.to_consensus())][stat]
                name = motif.id.split("_")[0]
                if val > best_motif.setdefault(name, 0):
                    best_motif[name] = val
            names = list(best_motif.keys())
            vals = [best_motif[name] for name in names]
            rank = rankdata(vals)
            ind = [names.index(x) for x in tool_names]

            f.write("%s\t%s\t%s\n" %
                    (stat, "value", "\t".join([str(vals[i]) for i in ind])))
            f.write("%s\t%s\t%s\n" %
                    (stat, "rank", "\t".join([str(rank[i]) for i in ind])))

    # Determine significant motifs
    nsig = 0
    with open(self.significant_pfm, "w") as f:
        for motif in motifs:
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if (stats["maxenr"] >= 3 and stats["roc_auc"] >= 0.55
                    and stats["enr_fdr"] >= 2):
                f.write("%s\n" % motif.to_pfm())
                nsig += 1
    self.logger.info("%s motifs are significant", nsig)
    self.logger.debug("written to %s", self.significant_pfm)

    if nsig == 0:
        self.logger.info("no significant motifs found")
        return

    # ROC metrics of significant motifs
    for bg in background:
        self._roc_metrics(self.significant_pfm, self.validation_fa,
                          self.bg_file["fa"][bg], self.bg_file["roc"][bg])

    # Cluster significant motifs
    clusters = self._cluster_motifs(self.significant_pfm,
                                    self.cluster_pwm, self.outdir,
                                    params["cluster_threshold"])

    # Determine best motif in cluster
    num_cluster, best_id = self._determine_best_motif_in_cluster(
        clusters, self.final_pwm, self.validation_fa, bg_file, self.imgdir)

    # Only the generated name is used here; the temporary file object itself
    # is not kept.
    tmp = NamedTemporaryFile(dir=mytmpdir()).name
    p = PredictionResult(tmp,
                         logger=self.logger,
                         job_server=self.server,
                         fg_file=self.validation_fa,
                         bg_file=bg_file,
                         do_counter=False)
    with open(self.final_pwm) as pwm_handle:
        final_motifs = read_motifs(pwm_handle)
    p.add_motifs(("clustering", (final_motifs, "", "")))
    # Wait until statistics are available for every added motif.
    while len(p.stats.keys()) < len(p.motifs):
        sleep(5)

    for mid, num in num_cluster.items():
        p.stats[mid]["numcluster"] = num

    # Per-statistic cutoffs handed to star() for the star rating.
    all_stats = {
        "mncp": [2, 5, 8],
        "roc_auc": [0.6, 0.75, 0.9],
        "maxenr": [10, 20, 30],
        "enr_fdr": [4, 8, 12],
        "fraction": [0.4, 0.6, 0.8],
        "ks_sig": [4, 7, 10],
        "numcluster": [3, 6, 9],
    }

    self.logger.info("creating report")

    # ROC plots
    for bg in background:
        self.create_roc_plots(self.final_pwm, self.validation_fa,
                              self.bg_file["fa"][bg], bg)

    # Location plots
    self.logger.debug("Creating localization plots")
    with open(self.final_pwm) as pwm_handle:
        motifs = read_motifs(pwm_handle, fmt="pwm")
    for motif in motifs:
        m = "%s_%s" % (motif.id, motif.to_consensus())
        s = p.stats[m]
        outfile = os.path.join(self.imgdir, "%s_histogram.svg" % motif.id)
        motif_localization(self.location_fa,
                           motif,
                           lwidth,
                           outfile,
                           cutoff=s["cutoff_fdr"])

        # Mean star score over all statistics, rounded half up.
        s["stars"] = int(
            mean([star(s[x], all_stats[x]) for x in all_stats.keys()]) + 0.5)
        self.logger.debug("Motif %s: %s stars" % (m, s["stars"]))

    # Calculate enrichment of final, clustered motifs
    self.calculate_cluster_enrichment(self.final_pwm, background)

    # Create report
    self.print_params()
    self._calc_report_values(self.final_pwm, background)
    self._create_report(self.final_pwm,
                        background,
                        stats=p.stats,
                        best_id=best_id)
    self._create_text_report(self.final_pwm, background)
    self.logger.info("finished")
    self.logger.info("output dir: %s", os.path.split(self.motif_report)[0])
    self.logger.info("report: %s", os.path.split(self.motif_report)[-1])

    if not params["keep_intermediate"]:
        self.logger.debug(
            "Deleting intermediate files. Please specifify the -k option if you want to keep these files."
        )
        shutil.rmtree(self.tmpdir)

    self.logger.debug("Done")

    return self.motif_report
def cluster_motifs(motifs, match="total", metric="wic", combine="mean",
                   pval=True, threshold=0.95, trim_edges=False,
                   edge_ic_cutoff=0.2, include_bg=True, progress=True):
    """
    Clusters a set of sequence motifs. Required arg 'motifs' is a file
    containing positional frequency matrices or a list with motifs.

    Optional args:

    'match', 'metric' and 'combine' specify the method used to compare and
    score the motifs. By default the WIC score is used (metric='wic'), using
    the score over the whole alignment (match='total'), with the total motif
    score calculated as the mean score of all positions (combine='mean').
    'match' can be either 'total' for the total alignment or 'subtotal' for
    the maximum scoring subsequence of the alignment.
    'metric' can be any metric defined in MotifComparer, currently: 'pcc',
    'ed', 'distance', 'wic' or 'chisq'
    'combine' determines how the total score is calculated from the score of
    individual positions and can be either 'sum' or 'mean'

    'pval' can be True or False and determines if the score should be
    converted to an empirical p-value

    'threshold' determines the score (or p-value) cutoff

    If 'trim_edges' is set to True, all motif edges with an IC below
    'edge_ic_cutoff' will be removed before clustering. Averaged motifs are
    always trimmed with 'edge_ic_cutoff'.

    When computing the average of two motifs 'include_bg' determines if, at
    a position only present in one motif, the information in that motif
    should be kept, or if it should be averaged with background frequencies.
    Should probably be left set to True.

    'progress' determines if progress information is written to stderr.

    Returns the root MotifTree node of the clustering.
    """
    # First read pfm or pfm formatted motiffile
    if not isinstance(motifs, list):
        with open(motifs) as handle:
            motifs = read_motifs(handle, fmt="pwm")

    mc = MotifComparer()

    # Trim edges with low information content
    if trim_edges:
        for motif in motifs:
            motif.trim(edge_ic_cutoff)

    # Make a MotifTree node for every motif
    nodes = [MotifTree(m) for m in motifs]

    # Determine all pairwise scores and maxscore per motif
    scores = {}
    motif_nodes = {n.motif.id: n for n in nodes}
    motifs = [n.motif for n in nodes]

    if progress:
        sys.stderr.write("Calculating initial scores\n")
    result = mc.get_all_scores(motifs, motifs, match, metric, combine, pval,
                               parallel=True)

    for m1, other_motifs in result.items():
        for m2, score in other_motifs.items():
            if m1 == m2:
                # Self-comparison gives the maximum attainable score.
                if pval:
                    motif_nodes[m1].maxscore = 1 - score[0]
                else:
                    motif_nodes[m1].maxscore = score[0]
            else:
                if pval:
                    score = [1 - score[0]] + score[1:]
                scores[(motif_nodes[m1], motif_nodes[m2])] = score

    cluster_nodes = list(nodes)
    ave_count = 1

    total = len(cluster_nodes)

    while len(cluster_nodes) > 1:
        # Take the highest scoring pair of which both nodes are still
        # unmerged; stale pairs stay in `scores` and are skipped here.
        ranked_pairs = sorted(scores.keys(), key=lambda x: scores[x][0])
        i = -1
        (n1, n2) = ranked_pairs[i]
        while n1 not in cluster_nodes or n2 not in cluster_nodes:
            i -= 1
            (n1, n2) = ranked_pairs[i]

        (score, pos, orientation) = scores[(n1, n2)]
        ave_motif = n1.motif.average_motifs(n2.motif, pos, orientation,
                                            include_bg=include_bg)

        ave_motif.trim(edge_ic_cutoff)
        ave_motif.id = "Average_%s" % ave_count
        ave_count += 1

        new_node = MotifTree(ave_motif)
        self_score = mc.compare_motifs(new_node.motif, new_node.motif, match,
                                       metric, combine, pval)[0]
        new_node.maxscore = 1 - self_score if pval else self_score

        new_node.mergescore = score
        n1.parent = new_node
        n2.parent = new_node
        new_node.left = n1
        new_node.right = n2

        # All remaining unmerged nodes (the new node is not in `nodes` yet).
        cmp_nodes = {node.motif: node for node in nodes if not node.parent}

        if progress:
            # Separate name so the `progress` flag itself is not clobbered.
            pct = int((1 - len(cmp_nodes) / float(total)) * 100)
            filled = pct // 10
            sys.stderr.write('\rClustering [{0}{1}] {2}%'.format(
                '#' * filled, " " * (10 - filled), pct))

        result = mc.get_all_scores([new_node.motif], list(cmp_nodes), match,
                                   metric, combine, pval, parallel=True)

        for motif, n in cmp_nodes.items():
            x = result[new_node.motif.id][motif.id]
            if pval:
                x = [1 - x[0]] + x[1:]
            scores[(new_node, n)] = x

        nodes.append(new_node)

        cluster_nodes = [node for node in nodes if not node.parent]

    if progress:
        sys.stderr.write("\n")
    root = nodes[-1]
    for node in [node for node in nodes if not node.left]:
        # With a single input motif the only leaf is the root itself and
        # has no parent to check.
        if node.parent:
            node.parent.checkMerge(root, threshold)

    return root
def moap(
    inputfile,
    method="hypergeom",
    scoring=None,
    outfile=None,
    motiffile=None,
    pfmfile=None,
    genome=None,
    fpr=0.01,
    ncpus=None,
    subsample=None,
    zscore=True,
    gc=True,
):
    """Run a single motif activity prediction algorithm.

    Parameters
    ----------
    inputfile : str
        File with regions (chr:start-end) in first column and either cluster
        name in second column or a table with values. Files ending in
        "feather" are read with ``pandas.read_feather``.

    method : str, optional
        Motif activity method to use. Any of 'hypergeom', 'lasso',
        'bayesianridge', 'rf', 'xgboost'. Default is 'hypergeom'.

    scoring: str, optional
        Either 'score' or 'count'

    outfile : str, optional
        Name of outputfile to save the fitted activity values. If it already
        exists with the expected shape it is returned without refitting.

    motiffile : str or pandas.DataFrame, optional
        Table with motif scan results. First column should be exactly the
        same regions as in the inputfile.

    pfmfile : str, optional
        File with motifs in pwm format. Required when motiffile is not
        supplied.

    genome : str, optional
        Genome name, as indexed by gimme. Required when motiffile is not
        supplied

    fpr : float, optional
        FPR for motif scanning. NOTE(review): not forwarded to the scanner
        in this function.

    ncpus : int, optional
        Number of threads to use. Default is the number specified in the
        config.

    subsample : float, optional
        Fraction of the regions to randomly sample before fitting.

    zscore : bool, optional
        Use z-score normalized motif scores.

    gc : bool optional
        Use GC% bins for z-score.

    Returns
    -------
    pandas DataFrame with motif activity

    Raises
    ------
    ValueError
        For an invalid `scoring` value, input that does not match the
        method's problem type, or a missing genome when scanning is needed.
    """
    if scoring and scoring not in ["score", "count"]:
        raise ValueError("valid values are 'score' and 'count'")

    if inputfile.endswith("feather"):
        df = pd.read_feather(inputfile)
        df = df.set_index(df.columns[0])
    else:
        # read data
        df = pd.read_table(inputfile, index_col=0, comment="#")

    clf = Moap.create(method, ncpus=ncpus)

    if clf.ptype == "classification":
        if df.shape[1] != 1:
            raise ValueError("1 column expected for {}".format(method))
    else:
        if np.dtype("object") in set(df.dtypes):
            raise ValueError(
                "columns should all be numeric for {}".format(method))

    if motiffile is None:
        if genome is None:
            raise ValueError("need a genome")

        pfmfile = pfmfile_location(pfmfile)
        try:
            pfm_motifs = read_motifs(pfmfile)
        except Exception:
            sys.stderr.write("can't read motifs from {}".format(pfmfile))
            raise

        # scan for motifs; the motifs parsed above supply the column names
        motif_names = [m.id for m in pfm_motifs]
        if method == "classic" or scoring == "count":
            scan_type = "count"
            logger.info("motif scanning (counts)")
        else:
            scan_type = "score"
            logger.info("motif scanning (scores)")
        scores = scan_regionfile_to_table(
            inputfile,
            genome,
            scan_type,
            pfmfile=pfmfile,
            ncpus=ncpus,
            zscore=zscore,
            gc=gc,
        )
        motifs = pd.DataFrame(scores, index=df.index, columns=motif_names)

    elif isinstance(motiffile, pd.DataFrame):
        motifs = motiffile
    else:
        motifs = pd.read_table(motiffile, index_col=0, comment="#")

    if outfile and os.path.exists(outfile):
        out = pd.read_table(outfile, index_col=0, comment="#")
        ncols = df.shape[1]
        if ncols == 1:
            # Single-column input: one output column per unique value.
            ncols = len(df.iloc[:, 0].unique())

        if out.shape[0] == motifs.shape[1] and out.shape[1] == ncols:
            logger.warning("%s output already exists... skipping", method)
            return out

    if subsample is not None:
        n = int(subsample * df.shape[0])
        logger.debug("Subsampling %d regions", n)
        df = df.sample(n)

    # Align the motif table with the (possibly subsampled) regions.
    motifs = motifs.loc[df.index]

    clf.fit(motifs, df)

    if outfile:
        with open(outfile, "w") as f:
            f.write(
                "# maelstrom - GimmeMotifs version {}\n".format(__version__))
            f.write("# method: {} with motif {}\n".format(method, scoring))
            if genome:
                f.write("# genome: {}\n".format(genome))
            if isinstance(motiffile, str):
                f.write("# motif table: {}\n".format(motiffile))
            f.write("# {}\n".format(clf.act_description))
            # Activity table follows the comment header in the same file.
            clf.act_.to_csv(f, sep="\t")

    return clf.act_
def set_threshold(self, fpr=None, threshold=None, genome=None, length=200,
                  filename=None):
    """Set motif scanning threshold based on background sequences.

    Parameters
    ----------
    fpr : float, optional
        Desired FPR, between 0.0 and 1.0.

    threshold : float or str, optional
        Desired motif threshold, expressed as the fraction of the
        difference between minimum and maximum score of the PWM.
        Should either be a float between 0.0 and 1.0 or a filename
        with thresholds as created by 'gimme threshold'.

    genome : str, optional
        Genome used to draw random background sequences when `fpr` is
        given. Mutually exclusive with `filename`.

    length : int, optional
        Length of the random genomic background sequences. Default is 200.

    filename : str, optional
        FASTA file with background sequences to use when `fpr` is given.
        Mutually exclusive with `genome`.

    Raises
    ------
    ValueError
        For conflicting or missing arguments, an fpr outside (0, 1), or
        when no motifs were set.
    IOError
        When `filename` does not exist.
    """
    if threshold:
        if fpr:
            raise ValueError("Need either fpr or threshold.")
        if genome:
            sys.stderr.write(
                "Parameter genome ignored when threshold is specified.\n"
                "Did you want to use fpr?\n")
        if filename:
            sys.stderr.write(
                "Parameter filename ignored when threshold is specified.\n"
                "Did you want to use fpr?\n")

    if genome and filename:
        raise ValueError("Need either genome or filename.")

    if fpr:
        fpr = float(fpr)
        if not (0.0 < fpr < 1.0):
            raise ValueError("Parameter fpr should be between 0 and 1")

    if not self.motifs:
        raise ValueError("please run set_motifs() first")

    thresholds = {}
    with open(self.motifs) as f:
        motifs = read_motifs(f)

    if threshold is not None:
        self.threshold = parse_threshold_values(self.motifs, threshold)
        return

    # Without this the cache key formatting below fails with an opaque
    # TypeError on None.
    if fpr is None:
        raise ValueError("Need either fpr or threshold.")

    if filename:
        if not os.path.exists(filename):
            raise IOError("File {} does not exist.".format(filename))

        bg_hash = file_checksum(filename)
        seqs = Fasta(filename).seqs
    elif genome:
        # Literal backslash separator, escaped explicitly; the resulting
        # string is unchanged so existing cache keys still match.
        bg_hash = "{}\\{}".format(genome, int(length))
    else:
        raise ValueError("Need genome or filename")

    def cache_key(motif):
        # One cache entry per motif, background and fpr.
        return "{}|{}|{:.4f}".format(motif.hash(), bg_hash, fpr)

    def store(motif, value):
        # A threshold equal to the maximum PWM score is recorded as None.
        if np.isclose(value, motif.pwm_max_score()):
            thresholds[motif.id] = None
        else:
            thresholds[motif.id] = value

    # NOTE(review): the loops below rebind `threshold`, so `threshold_str`
    # ends up containing the last per-motif threshold instead of the (None)
    # argument. Kept as-is since the string may be relied on elsewhere;
    # confirm before changing.
    with Cache(CACHE_DIR) as cache:
        scan_motifs = []
        for motif in motifs:
            threshold = cache.get(cache_key(motif))
            if threshold is None:
                scan_motifs.append(motif)
            else:
                store(motif, threshold)

        if scan_motifs:
            if genome:
                # Instance is discarded; presumably this checks that the
                # genome is available - TODO confirm.
                Genome(genome)
                sys.stderr.write(
                    "Determining threshold for fpr {} and length {} based on {}\n"
                    .format(fpr, int(length), genome))
                fa = RandomGenomicFasta(genome, length, 10000)
                seqs = fa.seqs
            else:
                sys.stderr.write(
                    "Determining threshold for fpr {} based on {}\n".format(
                        fpr, filename))
            for motif, threshold in self._threshold_from_seqs(
                    scan_motifs, seqs, fpr):
                cache.set(cache_key(motif), threshold)
                store(motif, threshold)

    self.threshold_str = "{}_{}_{}_{}_{}".format(fpr, threshold, genome,
                                                 length, filename)
    self.threshold = thresholds