def _determine_best_motif_in_cluster(self, clusters, pwm, sample_fa, bg_fa, imgdir=None):
    """Select one representative motif per cluster and write it to ``pwm``.

    For every ``(cluster_motif, member_motifs)`` pair all candidates are
    written to a temporary file and scored with ``self._roc_metrics``.
    The candidate with the highest value in the second returned mapping
    (``mncp``) wins; on ties the candidate that comes last in
    ``[cluster_motif] + member_motifs`` is kept. The winner is renamed to
    ``GimmeMotifs_<n>`` (``n`` is the 1-based cluster index) and appended
    to ``pwm``.

    Parameters
    ----------
    clusters : iterable of (motif, list of motif)
        Cluster motif together with the individual motifs it contains.
    pwm : str
        Name of the output file; it is overwritten.
    sample_fa, bg_fa :
        Passed unchanged to ``self._roc_metrics``.
    imgdir : str, optional
        When given, a PNG logo of every selected motif is written there
        through ``motif.to_img``.

    Returns
    -------
    (dict, dict)
        ``num_cluster`` maps ``"<new id>_<consensus>"`` to the number of
        member motifs; ``best_id`` maps the new id to the part of the old
        id before the first underscore.
    """
    num_cluster = {}
    best_id = {}

    with open(pwm, "w") as out:
        for i, (clus, singles) in enumerate(clusters):
            motifs = [clus] + singles

            # Text mode: the motifs are written as str.
            with NamedTemporaryFile(mode="w", dir=mytmpdir()) as tmp, \
                    NamedTemporaryFile(dir=mytmpdir()) as tmp2:
                for m in motifs:
                    tmp.write("%s\n" % m.to_pwm())
                tmp.flush()
                auc, mncp = self._roc_metrics(tmp.name, sample_fa, bg_fa, tmp2.name)

            # key= replaces the Python-2-only cmp=; sorting is stable in
            # both forms, so the resulting order (and the winner) is the same.
            ranked = sorted(motifs, key=lambda m: mncp[m.id])
            for m in ranked:
                self.logger.debug("sorted: %s %s %s", str(m), mncp[m.id], auc[m.id])
            self.logger.debug("end list")

            best_motif = ranked[-1]
            old_id = best_motif.id
            best_motif.id = "GimmeMotifs_%d" % (i + 1)
            best_id[best_motif.id] = old_id.split("_")[0]
            num_cluster["%s_%s" % (best_motif.id, best_motif.to_consensus())] = len(singles)
            if imgdir:
                best_motif.to_img(os.path.join(imgdir, best_motif.id), format="PNG")
            out.write("%s\n" % best_motif.to_pwm())

    return num_cluster, best_id
def match_plot(plotdata, outfile):
    """Plot a list of motifs next to their database match and p-value.

    Every entry of ``plotdata`` becomes one row: the left cell stacks the
    logo of the motif on top of the logo of the database motif, the right
    cell shows their ids and the p-value as text.

    :param plotdata: list of (motif, dbmotif, pval)
    :param outfile: file name handed to ``plt.savefig``
    """
    fig_h = 2
    fig_w = 7

    nrows = len(plotdata)
    ncols = 2
    fig = plt.figure(figsize=(fig_w, nrows * fig_h))

    try:
        for i, (motif, dbmotif, pval) in enumerate(plotdata):
            text = "Motif: %s\nBest match: %s\np-value: %0.2e" % (motif.id, dbmotif.id, pval)

            grid = ImageGrid(fig, (nrows, ncols, i * 2 + 1),
                             nrows_ncols=(2, 1),
                             axes_pad=0)

            for j in range(2):
                axes_off(grid[j])

            # The logos are only needed until imread() has loaded them, so
            # the temporary files are released right away instead of
            # whenever the garbage collector gets to them.
            with NamedTemporaryFile(dir=mytmpdir(), suffix=".png") as tmp:
                motif.to_img(tmp.name, format="PNG", height=6)
                grid[0].imshow(plt.imread(tmp.name), interpolation="none")

            with NamedTemporaryFile(dir=mytmpdir(), suffix=".png") as tmp:
                dbmotif.to_img(tmp.name, format="PNG")
                grid[1].imshow(plt.imread(tmp.name), interpolation="none")

            ax = plt.subplot(nrows, ncols, i * 2 + 2)
            axes_off(ax)
            ax.text(0, 0.5, text,
                    horizontalalignment="left",
                    verticalalignment="center")

        plt.savefig(outfile, dpi=300, bbox_inches="tight")
    finally:
        # Always release the figure, also when rendering or saving failed.
        plt.close(fig)
def _run_tool(job_name, t, fastafile, params):
    """Run one motif tool and return ``(job_name, result)``.

    ``t.run`` is called with the FASTA file, ``"."``, the parameters and
    the temporary directory. Any exception is converted into the result
    ``([], "", "<job_name> failed to run: <error>")`` instead of being
    raised.
    """
    try:
        outcome = t.run(fastafile, ".", params, mytmpdir())
    except Exception as err:
        message = "{} failed to run: {}".format(job_name, err)
        outcome = ([], "", message)
    return (job_name, outcome)
def _run_tool(job_name, t, fastafile, params):
    """Parallel motif prediction: run a single tool, never raise.

    Returns ``(job_name, result)``. ``result`` is whatever
    ``t.run(fastafile, params, <tmpdir>)`` returns or, when that call
    raises, the triple ``([], "", "<job_name> failed to run: <error>")``.
    """
    failure_template = "{} failed to run: {}"
    try:
        return job_name, t.run(fastafile, params, mytmpdir())
    except Exception as exc:
        return job_name, ([], "", failure_template.format(job_name, exc))
def __init__(self, matchfile, genome="hg19", number=None, size=None):
    """Initialize the FASTA object with sequences produced by ``matched_gc_bedfile``.

    A BED file is generated with ``matched_gc_bedfile`` from ``matchfile``,
    converted to FASTA with ``Genome(genome).track2fasta`` and loaded
    through ``Fasta.__init__``. Both intermediate files are removed
    afterwards, also when one of the steps fails.

    ``matchfile``, ``genome``, ``number`` and ``size`` are passed on to
    ``matched_gc_bedfile``; ``genome`` is also used to build the ``Genome``
    object.
    """
    # Only the unique names are kept; the NamedTemporaryFile objects
    # themselves are discarded immediately.
    tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
    tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name

    try:
        # Create bed-file with coordinates of random sequences
        matched_gc_bedfile(tmpbed, matchfile, genome, number, size=size)

        # Convert track to fasta
        Genome(genome).track2fasta(tmpbed, fastafile=tmpfasta)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)
    finally:
        # Delete the temporary files, whatever happened above. A file may
        # be missing if an earlier step failed before creating it.
        for fname in (tmpbed, tmpfasta):
            if os.path.exists(fname):
                os.remove(fname)
def __init__(self, genome, size=None, n=None):
    """Initialize the FASTA object with sequences from ``create_random_genomic_bedfile``.

    A BED file is generated with ``create_random_genomic_bedfile``,
    converted to FASTA with ``Genome(genome).track2fasta(..., stranded=True)``
    and loaded through ``Fasta.__init__``. Both intermediate files are
    removed afterwards, also when one of the steps fails.

    ``size`` is converted with ``int()`` first, so despite its default of
    ``None`` a value has to be supplied (``int(None)`` raises TypeError).
    ``genome``, ``size`` and ``n`` are passed on to
    ``create_random_genomic_bedfile``.
    """
    size = int(size)

    # Only the unique names are kept; the NamedTemporaryFile objects
    # themselves are discarded immediately.
    tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
    tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name

    try:
        # Create bed-file with coordinates of random sequences
        create_random_genomic_bedfile(tmpbed, genome, size, n)

        # Convert track to fasta
        Genome(genome).track2fasta(tmpbed, fastafile=tmpfasta, stranded=True)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)
    finally:
        # Delete the temporary files, whatever happened above. A file may
        # be missing if an earlier step failed before creating it.
        for fname in (tmpbed, tmpfasta):
            if os.path.exists(fname):
                os.remove(fname)
def _determine_best_motif_in_cluster(self, clusters, pwm, sample_fa, bg_fa, imgdir=None):
    """Select one representative motif per cluster and write it to ``pwm``.

    A cluster with at most one member motif is represented by its cluster
    motif. Otherwise the cluster motif and all members are written to a
    temporary file and scored with ``self._roc_metrics``; the candidate
    with the highest value in the second returned mapping (``mncp``) wins
    (on ties the one that comes last in ``[cluster_motif] + members``).
    The winner is renamed to ``GimmeMotifs_<n>`` (``n`` is the 1-based
    cluster index) and appended to ``pwm``.

    Parameters
    ----------
    clusters : iterable of (motif, list of motif)
        Cluster motif together with the individual motifs it contains.
    pwm : str
        Name of the output file; it is overwritten.
    sample_fa, bg_fa :
        Passed unchanged to ``self._roc_metrics``.
    imgdir : str, optional
        When given, a PNG logo of every selected motif is written there
        through ``motif.to_img``.

    Returns
    -------
    (dict, dict)
        ``num_cluster`` maps ``"<new id>_<consensus>"`` to the number of
        member motifs; ``best_id`` maps the new id to the part of the old
        id before the first underscore.
    """
    num_cluster = {}
    best_id = {}

    with open(pwm, "w") as out:
        for i, (clus, singles) in enumerate(clusters):
            best_motif = clus
            if len(singles) > 1:
                motifs = [clus] + singles

                # Text mode: the motifs are written as str.
                with NamedTemporaryFile(mode="w", dir=mytmpdir()) as tmp, \
                        NamedTemporaryFile(dir=mytmpdir()) as tmp2:
                    for m in motifs:
                        tmp.write("%s\n" % m.to_pwm())
                    tmp.flush()
                    auc, mncp = self._roc_metrics(tmp.name, sample_fa, bg_fa, tmp2.name)

                # key= replaces the Python-2-only cmp=; sorting is stable
                # in both forms, so order and winner are unchanged.
                ranked = sorted(motifs, key=lambda m: mncp[m.id])
                for m in ranked:
                    self.logger.debug("sorted: %s %s %s", str(m), mncp[m.id], auc[m.id])
                self.logger.debug("end list")
                best_motif = ranked[-1]

            old_id = best_motif.id
            best_motif.id = "GimmeMotifs_%d" % (i + 1)
            best_id[best_motif.id] = old_id.split("_")[0]
            num_cluster["%s_%s" % (best_motif.id, best_motif.to_consensus())] = len(singles)
            if imgdir:
                best_motif.to_img(os.path.join(imgdir, best_motif.id), format="PNG")
            out.write("%s\n" % best_motif.to_pwm())

    return num_cluster, best_id
def match_plot(plotdata, outfile):
    """Plot a list of motifs next to their database match and p-value.

    Every entry of ``plotdata`` becomes one row: the left cell stacks the
    logo of the motif on top of the logo of the database motif, the right
    cell shows their ids and the p-value as text.

    :param plotdata: list of (motif, dbmotif, pval)
    :param outfile: file name handed to ``plt.savefig``
    """

    def _logo_image(m):
        # Reserve a unique .png name and close the handle immediately so
        # no open descriptors pile up; plot_logo() then writes the file.
        # The file itself is left in place, exactly as before
        # (delete=False).
        with NamedTemporaryFile(dir=mytmpdir(), suffix=".png", delete=False) as tmp:
            logo_file = tmp.name
        m.plot_logo(fname=logo_file, title=False)
        return plt.imread(logo_file)

    fig_h = 2
    fig_w = 7

    nrows = len(plotdata)
    ncols = 2
    fig = plt.figure(figsize=(fig_w, nrows * fig_h))

    try:
        for i, (motif, dbmotif, pval) in enumerate(plotdata):
            text = "Motif: %s\nBest match: %s\np-value: %0.2e" % (
                motif.id,
                dbmotif.id,
                pval,
            )

            grid = ImageGrid(fig, (nrows, ncols, i * 2 + 1),
                             nrows_ncols=(2, 1),
                             axes_pad=0)

            for j in range(2):
                axes_off(grid[j])

            grid[0].imshow(_logo_image(motif), interpolation="none")
            grid[1].imshow(_logo_image(dbmotif), interpolation="none")

            ax = plt.subplot(nrows, ncols, i * 2 + 2)
            axes_off(ax)
            ax.text(0, 0.5, text,
                    horizontalalignment="left",
                    verticalalignment="center")

        plt.savefig(outfile, dpi=300, bbox_inches="tight")
    finally:
        # Always release the figure, also when rendering or saving failed.
        plt.close(fig)
def __init__(self, index="/usr/share/gimmemotifs/genome_index/hg18", length=None, n=None):
    """Initialize the FASTA object with sequences from ``create_random_genomic_bedfile``.

    A BED file is generated with ``create_random_genomic_bedfile``,
    converted to FASTA with ``track2fasta(..., use_strand=True)`` and
    loaded through ``Fasta.__init__``. Both intermediate files are removed
    afterwards, also when one of the steps fails.

    ``length`` is converted with ``int()`` first, so despite its default
    of ``None`` a value has to be supplied (``int(None)`` raises
    TypeError). ``index``, ``length`` and ``n`` are passed on to
    ``create_random_genomic_bedfile``; ``index`` is also given to
    ``track2fasta``.
    """
    length = int(length)

    # Only the unique names are kept; the NamedTemporaryFile objects
    # themselves are discarded immediately.
    tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
    tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name

    try:
        # Create bed-file with coordinates of random sequences
        create_random_genomic_bedfile(tmpbed, index, length, n)

        # Convert track to fasta
        track2fasta(index, tmpbed, tmpfasta, use_strand=True)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)
    finally:
        # Delete the temporary files, whatever happened above. A file may
        # be missing if an earlier step failed before creating it.
        for fname in (tmpbed, tmpfasta):
            if os.path.exists(fname):
                os.remove(fname)
def to_img(self, fname, fmt="EPS", add_left=0, seqlogo=None, height=6):
    """Create a sequence logo of this motif by calling seqlogo.

    The matrix in ``self.pwm`` is approximated by 1000 artificial
    sequences whose per-position nucleotide counts follow the matrix
    (columns are read in the order A, C, G, T). The sequences are written
    to a temporary file that is handed to the seqlogo executable.

    Valid formats EPS, GIF, PDF, PNG

    Parameters
    ----------
    fname : str
        Output file name; a trailing ``.<fmt>`` extension is stripped
        before it is passed to seqlogo.
    fmt : str, optional
        Output format (case-insensitive). An unknown format writes a
        message to stderr and returns without creating anything.
    add_left : int, optional
        Number of padding positions added to the left of the motif.
    seqlogo : str, optional
        seqlogo executable; defaults to ``self.seqlogo``.
    height : optional
        Passed to seqlogo as ``-h``.

    Raises
    ------
    ValueError
        If neither ``seqlogo`` nor ``self.seqlogo`` is set.
    """
    if not seqlogo:
        seqlogo = self.seqlogo
    if not seqlogo:
        raise ValueError("seqlogo not specified or configured")

    #TODO: split to_align function

    VALID_FORMATS = ["EPS", "GIF", "PDF", "PNG"]
    N = 1000
    fmt = fmt.upper()
    if fmt not in VALID_FORMATS:
        sys.stderr.write("Invalid motif format\n")
        return

    if fname[-4:].upper() == (".%s" % fmt):
        fname = fname[:-4]

    if add_left == 0:
        seqs = ["" for _ in range(N)]
    else:
        seqs = []
        for nuc in ["A", "C", "T", "G"]:
            seqs += [nuc * add_left for _ in range(N // 4)]

    for pos in range(len(self.pwm)):
        # Cumulative upper bounds (in number of sequences) for A, C, G, T.
        vals = [self.pwm[pos][0] * N]
        for i in range(1, 4):
            vals.append(vals[i - 1] + self.pwm[pos][i] * N)
        # Weights that don't add up to 1 are absorbed by the last bound.
        vals[3] = N

        for i in range(N):
            # Strict "<": nucleotide k gets the sequences with index in
            # [vals[k-1], vals[k]). The former "<=" moved every boundary
            # by one, so a column with A frequency 0 still received one
            # "A" at the expense of "T".
            for nuc, upper in zip("ACGT", vals):
                if i < upper:
                    seqs[i] += nuc
                    break

    # The context manager removes the temporary alignment as soon as
    # seqlogo is done with it.
    with NamedTemporaryFile(mode="w", dir=mytmpdir()) as f:
        for seq in seqs:
            f.write("%s\n" % seq)
        f.flush()
        makelogo = "{0} -f {1} -F {2} -c -a -h {3} -w {4} -o {5} -b -n -Y"
        cmd = makelogo.format(
            seqlogo,
            f.name,
            fmt,
            height,
            len(self) + add_left,
            fname)
        sp.call(cmd, shell=True)
def to_img(self, fname, format="EPS", add_left=0, seqlogo=None, height=6):
    """Create a sequence logo of this motif by calling seqlogo.

    The matrix in ``self.pwm`` is approximated by 1000 artificial
    sequences whose per-position nucleotide counts follow the matrix
    (columns are read in the order A, C, G, T). The sequences are written
    to a temporary file that is handed to the seqlogo executable.

    Valid formats EPS, GIF, PDF, PNG

    Parameters
    ----------
    fname : str
        Output file name; a trailing ``.<format>`` extension is stripped
        before it is passed to seqlogo.
    format : str, optional
        Output format (case-insensitive). An unknown format writes a
        message to stderr and returns without creating anything. The
        name shadows the builtin but is kept because callers pass it as
        a keyword.
    add_left : int, optional
        Number of padding positions added to the left of the motif.
    seqlogo : str, optional
        seqlogo executable; defaults to ``self.seqlogo``.
    height : optional
        Passed to seqlogo as ``-h``.

    Raises
    ------
    ValueError
        If neither ``seqlogo`` nor ``self.seqlogo`` is set.
    """
    if not seqlogo:
        seqlogo = self.seqlogo
    if not seqlogo:
        raise ValueError("seqlogo not specified or configured")

    #TODO: split to_align function

    VALID_FORMATS = ["EPS", "GIF", "PDF", "PNG"]
    N = 1000
    format = format.upper()
    if format not in VALID_FORMATS:
        sys.stderr.write("Invalid motif format\n")
        return

    if fname[-4:].upper() == (".%s" % format):
        fname = fname[:-4]

    if add_left == 0:
        seqs = ["" for _ in range(N)]
    else:
        seqs = []
        for nuc in ["A", "C", "T", "G"]:
            # Floor division: range() needs an int on Python 3 as well.
            seqs += [nuc * add_left for _ in range(N // 4)]

    for pos in range(len(self.pwm)):
        # Cumulative upper bounds (in number of sequences) for A, C, G, T.
        vals = [self.pwm[pos][0] * N]
        for i in range(1, 4):
            vals.append(vals[i - 1] + self.pwm[pos][i] * N)
        # Weights that don't add up to 1 are absorbed by the last bound.
        vals[3] = N

        for i in range(N):
            # Strict "<": nucleotide k gets the sequences with index in
            # [vals[k-1], vals[k]). The former "<=" moved every boundary
            # by one, so a column with A frequency 0 still received one
            # "A" at the expense of "T".
            for nuc, upper in zip("ACGT", vals):
                if i < upper:
                    seqs[i] += nuc
                    break

    # Text mode, because str is written. The context manager removes the
    # temporary alignment as soon as seqlogo is done with it.
    with NamedTemporaryFile(mode="w", dir=mytmpdir()) as f:
        for seq in seqs:
            f.write("%s\n" % seq)
        f.flush()
        makelogo = "{0} -f {1} -F {2} -c -a -h {3} -w {4} -o {5} -b -n -Y"
        cmd = makelogo.format(
            seqlogo,
            f.name,
            format,
            height,
            len(self) + add_left,
            fname)
        sp.call(cmd, shell=True)
def __init__(self, matchfile, genome="hg19", number=None):
    """Initialize the FASTA object with sequences produced by ``matched_gc_bedfile``.

    The genome index directory is looked up through ``MotifConfig``. A BED
    file is generated with ``matched_gc_bedfile`` from ``matchfile``,
    converted to FASTA with ``track2fasta`` and loaded through
    ``Fasta.__init__``. Both intermediate files are removed afterwards,
    also when one of the steps fails.

    ``matchfile``, ``genome`` and ``number`` are passed on to
    ``matched_gc_bedfile``.
    """
    config = MotifConfig()
    index = os.path.join(config.get_index_dir(), genome)

    # Only the unique names are kept; the NamedTemporaryFile objects
    # themselves are discarded immediately.
    tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
    tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name

    try:
        # Create bed-file with coordinates of random sequences
        matched_gc_bedfile(tmpbed, matchfile, genome, number)

        # Convert track to fasta
        track2fasta(index, tmpbed, tmpfasta)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)
    finally:
        # Delete the temporary files, whatever happened above. A file may
        # be missing if an earlier step failed before creating it.
        for fname in (tmpbed, tmpfasta):
            if os.path.exists(fname):
                os.remove(fname)
def pp_predict_motifs(fastafile, outfile, analysis="small", organism="hg18", single=False, background="", tools=None, job_server="", ncpus=8, logger=None, max_time=None, fg_file=None, bg_file=None):
    """Run the selected motif prediction tools as parallel jobs.

    One job is submitted per tool or, for tools with ``use_width`` set,
    one job per motif width. Finished jobs report to a
    ``PredictionResult`` through its ``add_motifs`` callback. The function
    waits until all jobs are finished, ``max_time`` seconds have passed or
    the user interrupts; in the latter two cases the job server is
    destroyed.

    Parameters
    ----------
    fastafile : str
        Input file handed to every tool.
    outfile : str
        Passed to ``PredictionResult``.
    analysis : str, optional
        One of "xs", "small", "medium", "large", "xl"; determines the
        range of widths. "xs" is passed to the tools as "small".
    organism, single, background :
        Passed to the tools in the parameter dictionary.
    tools : dict, optional
        Maps tool name to a true value for every tool to run. When empty
        the tools from the default configuration are used.
    job_server : optional
        Existing job server; a ``pp.Server`` with ``ncpus`` workers is
        created when omitted.
    logger : optional
        Logger to use.
    max_time : number, optional
        Maximum waiting time in seconds; no limit when false.
    fg_file, bg_file :
        Passed to ``PredictionResult``.

    Returns
    -------
    PredictionResult
        The result object that collected the job output.
    """
    # Local import: only needed for the polling loop below.
    from time import sleep

    config = MotifConfig()

    if not tools:
        # get_default_params is a method and has to be called.
        tools = dict([(x, 1) for x in config.get_default_params()["tools"].split(",")])

    if not logger:
        logger = logging.getLogger('prediction.pp_predict_motifs')

    wmin = 5
    step = 1
    if analysis in ["large", "xl"]:
        step = 2
        wmin = 6

    analysis_max = {"xs": 5, "small": 8, "medium": 10, "large": 14, "xl": 20}
    wmax = analysis_max[analysis]

    # wmax has been taken from "xs" already; the tools only know "small".
    if analysis == "xs":
        sys.stderr.write("Setting analysis xs to small")
        analysis = "small"

    if not job_server:
        job_server = pp.Server(ncpus, secret='pumpkinrisotto')

    jobs = {}

    result = PredictionResult(outfile, logger=logger, fg_file=fg_file, bg_file=bg_file, job_server=job_server)

    # Dynamically load all tools
    toolio = [x[1]() for x in inspect.getmembers(
        tool_classes,
        lambda x: inspect.isclass(x) and issubclass(x, tool_classes.MotifProgram)
    ) if x[0] != 'MotifProgram']

    # TODO:
    # Add warnings for running time: Weeder, GADEM

    ### Add all jobs to the job_server ###
    params = {'analysis': analysis, 'background': background, "single": single, "organism": organism}
    for t in toolio:
        if t.name in tools and tools[t.name]:
            if t.use_width:
                for i in range(wmin, wmax + 1, step):
                    logger.debug("Starting %s job, width %s", t.name, i)
                    job_name = "%s_width_%s" % (t.name, i)
                    # Every job gets its own copy, so a later width can
                    # never leak into a job that was submitted earlier.
                    job_params = dict(params, width=i)
                    jobs[job_name] = job_server.submit(
                        t.run,
                        (fastafile, ".", job_params, mytmpdir()),
                        (tool_classes.MotifProgram,),
                        ("gimmemotifs.config",),
                        result.add_motifs,
                        (job_name,))
            else:
                logger.debug("Starting %s job", t.name)
                job_name = t.name
                jobs[job_name] = job_server.submit(
                    t.run,
                    (fastafile, ".", dict(params), mytmpdir()),
                    (tool_classes.MotifProgram,),
                    ("gimmemotifs.config",),
                    result.add_motifs,
                    (job_name,))
        else:
            logger.debug("Skipping %s", t.name)

    ### Wait until all jobs are finished or the time runs out ###
    start_time = time()
    try:
        # Run until all jobs are finished. Sleep between checks instead of
        # spinning, which kept a CPU fully busy.
        while len(result.finished) < len(jobs) and (not max_time or time() - start_time < max_time):
            sleep(0.1)
        if len(result.finished) < len(jobs):
            logger.info("Maximum allowed running time reached, destroying remaining jobs")
            job_server.destroy()
            result.get_remaining_stats()
    ### Or the user gets impatient... ###
    except KeyboardInterrupt:
        # Destroy all running jobs
        logger.info("Caught interrupt, destroying all running jobs")
        job_server.destroy()
        result.get_remaining_stats()

    # Callers read the motifs and statistics from the returned object.
    return result
def run_full_analysis(self, inputfile, user_params=None):
    """Full analysis: from bed-file to motifs (including clustering,
    ROC-curves, location plots and html report).

    The steps are: merge ``user_params`` into the default parameters,
    detect the input type (FASTA if ``Fasta(inputfile)`` succeeds, BED
    otherwise), prepare input and background files, predict motifs, write
    statistics and ranks, keep the significant motifs, cluster them, pick
    the best motif per cluster and create the reports.

    Parameters
    ----------
    inputfile : str
        BED or FASTA file.
    user_params : dict, optional
        Parameters overriding the defaults from ``self.config``.

    Returns
    -------
    str or None
        ``self.motif_report``; ``None`` when no significant motifs are
        found. The process exits through ``sys.exit`` on invalid input,
        when no valid background remains and when no motifs are predicted.
    """
    self.logger.info("starting full motif analysis")
    self.logger.debug("Using temporary directory {0}".format(mytmpdir()))

    if user_params is None:
        user_params = {}
    params = self.config.get_default_params()
    params.update(user_params)

    if params["torque"]:
        from gimmemotifs.prediction_torque import pp_predict_motifs, PredictionResult
        self.logger.debug("Using torque")
    else:
        from gimmemotifs.prediction import pp_predict_motifs, PredictionResult
        self.logger.debug("Using multiprocessing")

    self.params = params
    #self.weird = params["weird_option"]

    background = [x.strip() for x in params["background"].split(",")]

    self.logger.debug("Parameters:")
    for param, value in params.items():
        self.logger.debug(" %s: %s", param, value)

    # Checking input
    self.input_type = "BED"
    # If we can load it as fasta then it is a fasta, yeh?
    try:
        Fasta(inputfile)
        self.logger.debug("Inputfile is a FASTA file")
        self.input_type = "FASTA"
    except Exception:
        # Leave it to BED
        pass

    index_msg = ("No index found for genome {}! "
                 "Has GimmeMotifs been configured correctly and is the "
                 "genome indexed?").format(params["genome"])
    index_dir = os.path.join(self.config.get_index_dir(), params["genome"])

    if self.input_type == "FASTA":
        for bg in background:
            if bg not in FA_VALID_BGS:
                self.logger.info(
                    "Input type is FASTA, can't use background type '%s'", bg)
            if bg == "genomic":
                if not os.path.exists(index_dir):
                    self.logger.error(index_msg)
                    sys.exit(1)
        background = [bg for bg in background if bg in FA_VALID_BGS]

    elif self.input_type == "BED":
        # Does the index_dir exist? #bed-specific
        if not os.path.exists(index_dir):
            self.logger.error(index_msg)
            sys.exit(1)

        # is it a valid bed-file etc.
        self._check_input(inputfile)  # bed-specific

        # Check for valid background
        for bg in background:
            if bg not in BED_VALID_BGS:
                self.logger.info(
                    "Input type is BED, can't use background type '%s'", bg)
        background = [bg for bg in background if bg in BED_VALID_BGS]

    if not background:
        self.logger.error("No valid backgrounds specified!")
        sys.exit(1)

    self.max_time = None
    max_time = None
    # Maximum time?
    if params["max_time"]:
        try:
            max_time = float(params["max_time"])
        except Exception:
            self.logger.debug(
                "Could not parse max_time value, setting to no limit")
            self.max_time = None

        # max_time is still None when parsing failed; comparing None with
        # a number raises on Python 3, so check for it explicitly.
        if max_time is not None and max_time > 0:
            self.logger.debug(
                "Time limit for motif prediction: %0.2f hours" % max_time)
            max_time = 3600 * max_time
            self.max_time = max_time
            self.logger.debug("Max_time in seconds %0.0f" % self.max_time)
        else:
            self.logger.debug(
                "Invalid time limit for motif prediction, setting to no limit")
            self.max_time = None
    else:
        self.logger.debug("No time limit for motif prediction")

    if "random" in background:
        self.markov_model = params["markov_model"]

    # Create the necessary files for motif prediction and validation
    if self.input_type == "BED":
        self.prepare_input_bed(inputfile, params["genome"], params["width"],
                               params["fraction"], params["abs_max"],
                               params["use_strand"])

        # Create file for location plots
        index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
        lwidth = int(params["lwidth"])
        width = int(params["width"])
        # Floor division keeps the extension an integer on Python 3 too.
        extend = (lwidth - width) // 2
        genome_index.track2fasta(index_dir, self.validation_bed,
                                 self.location_fa, extend_up=extend,
                                 extend_down=extend,
                                 use_strand=params["use_strand"],
                                 ignore_missing=True)

    elif self.input_type == "FASTA":
        self.prepare_input_fa(inputfile, params["width"], params["fraction"],
                              params["abs_max"])

        # File for location plots
        self.location_fa = self.validation_fa
        fa = Fasta(self.location_fa)
        seqs = fa.seqs
        lwidth = len(seqs[0])
        all_same_width = all(len(seq) == lwidth for seq in seqs)
        if not all_same_width:
            self.logger.warning(
                "PLEASE NOTE: FASTA file contains sequences of different lengths. Positional preference plots will be incorrect!"
            )

    else:
        self.logger.error("Unknown input type, shouldn't happen")
        sys.exit(1)

    # Compare stripped names on both sides; testing the unstripped name
    # silently disabled tools listed as "A, B" (with a space).
    selected_tools = [y.strip() for y in params["tools"].split(",")]
    tools = dict([(x.strip(), x.strip() in selected_tools)
                  for x in params["available_tools"].split(",")])

    self.create_background(background, params["genome"], params["width"])

    # Predict the motifs, input is a FASTA-file
    analysis = params["analysis"]
    self.logger.info("starting motif prediction (%s)", analysis)
    self.logger.info("tools: %s",
                     ", ".join([x for x in tools.keys() if tools[x]]))

    # key= replaces the Python-2-only cmp function; the order is the same.
    bg_file = self.bg_file["fa"][sorted(background, key=lambda x: BG_RANK[x])[0]]
    self.logger.debug("Using bg_file %s for significance" % bg_file)
    result = pp_predict_motifs(self.prediction_fa, self.predicted_pfm,
                               analysis, params["genome"],
                               params["use_strand"], self.prediction_bg,
                               tools, self.job_server(),
                               logger=self.logger, max_time=self.max_time,
                               fg_file=self.validation_fa, bg_file=bg_file)

    motifs = result.motifs
    self.logger.info("predicted %s motifs", len(motifs))
    self.logger.debug("written to %s", self.predicted_pfm)

    if len(motifs) == 0:
        self.logger.info("no motifs found")
        sys.exit()

    # Write stats output to file
    stat_keys = list(next(iter(result.stats.values())).keys())
    with open(self.stats_file, "w") as f:
        f.write("%s\t%s\n" % ("Motif", "\t".join(stat_keys)))

        self.logger.debug(result.stats)

        # Iterate over a copy: removing from the list that is being
        # iterated would skip the motif following each removed one.
        for motif in list(motifs):
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if stats:
                f.write("%s\t%s\n" % (
                    motif.id, "\t".join([str(stats[k]) for k in stat_keys])))
            else:
                self.logger.error(
                    "No stats for motif {0}, skipping this motif!".format(
                        motif.id))
                motifs.remove(motif)

    self.motifs_with_stats = motifs

    tools = list(dict((m.id.split("_")[0], 1) for m in motifs).keys())
    with open(self.ranks_file, "w") as f:
        f.write("Metric\tType\t%s\n" % ("\t".join(tools)))
        for stat in ["mncp", "roc_auc", "maxenr"]:
            best_motif = {}
            for motif in self.motifs_with_stats:
                val = result.stats["%s_%s" % (motif.id, motif.to_consensus())][stat]
                name = motif.id.split("_")[0]
                if val > best_motif.setdefault(name, 0):
                    best_motif[name] = val
            names = list(best_motif.keys())
            vals = [best_motif[name] for name in names]
            rank = rankdata(vals)
            ind = [names.index(x) for x in tools]

            f.write("%s\t%s\t%s\n" % (
                stat, "value", "\t".join([str(vals[i]) for i in ind])))
            f.write("%s\t%s\t%s\n" % (
                stat, "rank", "\t".join([str(rank[i]) for i in ind])))

    # Determine significant motifs
    nsig = 0
    with open(self.significant_pfm, "w") as f:
        for motif in motifs:
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if (stats["maxenr"] >= 3 and stats["roc_auc"] >= 0.55
                    and stats['enr_fdr'] >= 2):
                f.write("%s\n" % motif.to_pfm())
                nsig += 1
    self.logger.info("%s motifs are significant", nsig)
    self.logger.debug("written to %s", self.significant_pfm)

    if nsig == 0:
        self.logger.info("no significant motifs found")
        return

    # ROC metrics of significant motifs
    for bg in background:
        self._roc_metrics(self.significant_pfm, self.validation_fa,
                          self.bg_file["fa"][bg], self.bg_file["roc"][bg])

    # Cluster significant motifs
    clusters = self._cluster_motifs(self.significant_pfm, self.cluster_pwm,
                                    self.outdir, params["cluster_threshold"])

    # Determine best motif in cluster
    num_cluster, best_id = self._determine_best_motif_in_cluster(
        clusters, self.final_pwm, self.validation_fa, bg_file, self.imgdir)

    ### Enable parallel and modular evaluation of results
    # Scan (multiple) files with motifs
    # Define callback functions once scanning is finished:
    #    - ROC plot
    #    - Statistics
    #    - Location plots (histogram)
    #    -

    # Stars
    tmp = NamedTemporaryFile(dir=mytmpdir()).name
    p = PredictionResult(tmp, logger=self.logger, job_server=self.server,
                         fg_file=self.validation_fa, bg_file=bg_file,
                         do_counter=False)
    p.add_motifs(
        ("clustering", (read_motifs(open(self.final_pwm)), "", "")))
    # Wait until statistics are available for every motif.
    while len(p.stats.keys()) < len(p.motifs):
        sleep(5)

    for mid, num in num_cluster.items():
        p.stats[mid]["numcluster"] = num

    all_stats = {
        "mncp": [2, 5, 8],
        "roc_auc": [0.6, 0.75, 0.9],
        "maxenr": [10, 20, 30],
        "enr_fdr": [4, 8, 12],
        "fraction": [0.4, 0.6, 0.8],
        "ks_sig": [4, 7, 10],
        "numcluster": [3, 6, 9],
    }

    self.logger.info("creating report")

    # ROC plots
    for bg in background:
        self.create_roc_plots(self.final_pwm, self.validation_fa,
                              self.bg_file["fa"][bg], bg)

    # Location plots
    self.logger.debug("Creating localization plots")
    motifs = read_motifs(open(self.final_pwm), fmt="pwm")
    for motif in motifs:
        m = "%s_%s" % (motif.id, motif.to_consensus())
        s = p.stats[m]
        outfile = os.path.join(self.imgdir, "%s_histogram.svg" % motif.id)
        motif_localization(self.location_fa, motif, lwidth, outfile,
                           cutoff=s["cutoff_fdr"])

        # Mean star score over all statistics, rounded half up.
        s["stars"] = int(
            mean([star(s[x], all_stats[x]) for x in all_stats.keys()]) + 0.5)
        self.logger.debug("Motif %s: %s stars" % (m, s["stars"]))

    # Calculate enrichment of final, clustered motifs
    self.calculate_cluster_enrichment(self.final_pwm, background)

    # Create report
    self.print_params()
    self._calc_report_values(self.final_pwm, background)
    self._create_report(self.final_pwm, background, stats=p.stats,
                        best_id=best_id)
    self._create_text_report(self.final_pwm, background)
    self.logger.info("finished")
    self.logger.info("output dir: %s", os.path.split(self.motif_report)[0])
    self.logger.info("report: %s", os.path.split(self.motif_report)[-1])

    if not params["keep_intermediate"]:
        self.logger.debug(
            "Deleting intermediate files. Please specifify the -k option if you want to keep these files."
        )
        shutil.rmtree(self.tmpdir)

    self.logger.debug("Done")

    return self.motif_report
def to_img(self, fname, fmt="PNG", add_left=0, seqlogo=None, height=6):
    """Create a sequence logo using seqlogo.

    Create a sequence logo and save it to a file. Valid formats are: PNG,
    EPS, GIF and PDF.

    The matrix in ``self.pwm`` is approximated by 1000 artificial
    sequences whose per-position nucleotide counts follow the matrix
    (columns are read in the order A, C, G, T); these sequences are the
    input for seqlogo.

    Parameters
    ----------
    fname : str
        Output filename. A trailing ``.<fmt>`` extension is stripped
        before the name is passed to seqlogo.
    fmt : str , optional
        Output format (case-insensitive). Valid formats are PNG, EPS, GIF
        and PDF. An unknown format writes a message to stderr and returns
        without creating anything.
    add_left : int , optional
        Pad motif with empty positions on the left side.
    seqlogo : str
        Location of the seqlogo executable. By default the seqlogo version
        that is included with GimmeMotifs is used.
    height : float
        Height of the image

    Raises
    ------
    ValueError
        If neither ``seqlogo`` nor ``self.seqlogo`` is set.
    """
    if not seqlogo:
        seqlogo = self.seqlogo
    if not seqlogo:
        raise ValueError("seqlogo not specified or configured")

    #TODO: split to_align function

    VALID_FORMATS = ["EPS", "GIF", "PDF", "PNG"]
    N = 1000
    fmt = fmt.upper()
    if fmt not in VALID_FORMATS:
        sys.stderr.write("Invalid motif format\n")
        return

    if fname[-4:].upper() == (".%s" % fmt):
        fname = fname[:-4]

    if add_left == 0:
        seqs = ["" for _ in range(N)]
    else:
        seqs = []
        for nuc in ["A", "C", "T", "G"]:
            seqs += [nuc * add_left for _ in range(N // 4)]

    for pos in range(len(self.pwm)):
        # Cumulative upper bounds (in number of sequences) for A, C, G, T.
        vals = [self.pwm[pos][0] * N]
        for i in range(1, 4):
            vals.append(vals[i - 1] + self.pwm[pos][i] * N)
        # Weights that don't add up to 1 are absorbed by the last bound.
        vals[3] = N

        for i in range(N):
            # Strict "<": nucleotide k gets the sequences with index in
            # [vals[k-1], vals[k]). The former "<=" moved every boundary
            # by one, so a column with A frequency 0 still received one
            # "A" at the expense of "T".
            for nuc, upper in zip("ACGT", vals):
                if i < upper:
                    seqs[i] += nuc
                    break

    # The context manager removes the temporary alignment as soon as
    # seqlogo is done with it.
    with NamedTemporaryFile(mode="w", dir=mytmpdir()) as f:
        for seq in seqs:
            f.write("%s\n" % seq)
        f.flush()
        makelogo = "{0} -f {1} -F {2} -c -a -h {3} -w {4} -o {5} -b -n -Y"
        cmd = makelogo.format(
            seqlogo,
            f.name,
            fmt,
            height,
            len(self) + add_left,
            fname)
        sp.call(cmd, shell=True)
def run_full_analysis(self, inputfile, user_params=None):
    """Full analysis: from bed-file to motifs (including clustering,
    ROC-curves, location plots and html report).

    The steps are: merge ``user_params`` into the default parameters,
    detect the input type (FASTA if ``Fasta(inputfile)`` succeeds, BED
    otherwise), prepare input and background files, predict motifs, write
    statistics and ranks, keep the significant motifs, cluster them, pick
    the best motif per cluster and create the reports.

    Parameters
    ----------
    inputfile : str
        BED or FASTA file.
    user_params : dict, optional
        Parameters overriding the defaults from ``self.config``.

    Returns nothing. The process exits through ``sys.exit`` on invalid
    input, when no valid background remains, when no motifs are predicted
    and when none of them is significant.
    """
    self.logger.info("Starting full motif analysis")
    self.logger.info("Using temporary directory {0}".format(mytmpdir()))

    if user_params is None:
        user_params = {}
    params = self.config.get_default_params()
    params.update(user_params)

    if params["torque"]:
        from gimmemotifs.prediction_torque import pp_predict_motifs, PredictionResult
        self.logger.info("Using torque")
    else:
        from gimmemotifs.prediction import pp_predict_motifs, PredictionResult
        self.logger.info("Using multiprocessing")

    self.params = params
    #self.weird = params["weird_option"]

    background = [x.strip() for x in params["background"].split(",")]

    self.logger.info("Parameters:")
    for param, value in params.items():
        self.logger.info(" %s: %s" % (param, value))

    # Checking input
    self.input_type = "BED"
    # If we can load it as fasta then it is a fasta, yeh?
    try:
        Fasta(inputfile)
        self.logger.info("Inputfile is a FASTA file")
        self.input_type = "FASTA"
    except Exception:
        # Leave it to BED. (A bare except would also have swallowed
        # KeyboardInterrupt and SystemExit.)
        pass

    if self.input_type == "FASTA":
        for bg in background:
            if bg not in FA_VALID_BGS:
                self.logger.info("Input type is FASTA, can't use background type '%s'" % bg)
        background = [bg for bg in background if bg in FA_VALID_BGS]

    elif self.input_type == "BED":
        # Does the index_dir exist? #bed-specific
        index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
        if not os.path.exists(index_dir):
            self.logger.error("No index found for genome %s! Has GimmeMotifs been configured correctly and is the genome indexed?" % params["genome"])
            sys.exit(1)

        # is it a valid bed-file etc.
        self._check_input(inputfile)  # bed-specific

        # Check for valid background
        for bg in background:
            if bg not in BED_VALID_BGS:
                self.logger.info("Input type is BED, can't use background type '%s'" % bg)
        background = [bg for bg in background if bg in BED_VALID_BGS]

    if not background:
        self.logger.error("No valid backgrounds specified!")
        sys.exit(1)

    self.max_time = None
    max_time = None
    # Maximum time?
    if params["max_time"]:
        try:
            max_time = float(params["max_time"])
        except (TypeError, ValueError):
            self.logger.info("Could not parse max_time value, setting to no limit")
            self.max_time = None

        # max_time is still None when parsing failed; comparing None with
        # a number raises on Python 3, so check for it explicitly.
        if max_time is not None and max_time > 0:
            self.logger.info("Time limit for motif prediction: %0.2f hours" % max_time)
            max_time = 3600 * max_time
            self.max_time = max_time
            self.logger.debug("Max_time in seconds %0.0f" % self.max_time)
        else:
            self.logger.info("Invalid time limit for motif prediction, setting to no limit")
            self.max_time = None
    else:
        self.logger.info("No time limit for motif prediction")

    if "random" in background:
        self.markov_model = params["markov_model"]

    # Create the necessary files for motif prediction and validation
    if self.input_type == "BED":
        self.prepare_input_bed(inputfile, params["genome"], params["width"],
                               params["fraction"], params["abs_max"],
                               params["use_strand"])

        # Create file for location plots
        index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
        lwidth = int(params["lwidth"])
        width = int(params["width"])
        # Floor division keeps the extension an integer on Python 3 too.
        extend = (lwidth - width) // 2
        genome_index.track2fasta(index_dir, self.validation_bed,
                                 self.location_fa, extend_up=extend,
                                 extend_down=extend,
                                 use_strand=params["use_strand"],
                                 ignore_missing=True)

    elif self.input_type == "FASTA":
        self.prepare_input_fa(inputfile, params["width"], params["fraction"],
                              params["abs_max"])

        # File for location plots
        self.location_fa = self.validation_fa
        fa = Fasta(self.location_fa)
        seqs = fa.seqs
        lwidth = len(seqs[0])
        all_same_width = all(len(seq) == lwidth for seq in seqs)
        if not all_same_width:
            self.logger.warning("PLEASE NOTE: FASTA file contains sequences of different lengths. Positional preference plots will be incorrect!")

    else:
        self.logger.error("Unknown input type, shouldn't happen")
        sys.exit(1)

    # Compare stripped names on both sides; testing the unstripped name
    # silently disabled tools listed as "A, B" (with a space).
    selected_tools = [y.strip() for y in params["tools"].split(",")]
    tools = dict([(x.strip(), x.strip() in selected_tools)
                  for x in params["available_tools"].split(",")])

    self.create_background(background, params["genome"], params["width"])

    # Predict the motifs, input is a FASTA-file
    analysis = params["analysis"]
    self.logger.info("Starting motif prediction (%s) using %s" % (
        analysis, ", ".join([x for x in tools.keys() if tools[x]])))

    # key= replaces the Python-2-only cmp function; the order is the same.
    bg_file = self.bg_file["fa"][sorted(background, key=lambda x: BG_RANK[x])[0]]
    self.logger.info("Using bg_file %s for significance" % bg_file)
    result = pp_predict_motifs(self.prediction_fa, self.predicted_pfm,
                               analysis, params["genome"],
                               params["use_strand"], self.prediction_bg,
                               tools, self.job_server(),
                               logger=self.logger, max_time=self.max_time,
                               fg_file=self.validation_fa, bg_file=bg_file)

    motifs = result.motifs
    self.logger.info("Predicted %s motifs, written to %s" % (len(motifs), self.predicted_pfm))

    if len(motifs) == 0:
        self.logger.info("No motifs found. Done.")
        sys.exit()

    # Write stats output to file
    stat_keys = list(next(iter(result.stats.values())).keys())
    with open(self.stats_file, "w") as f:
        f.write("%s\t%s\n" % ("Motif", "\t".join(stat_keys)))

        # Debug output goes to the log instead of being printed to stdout.
        self.logger.debug(result.stats)

        # Iterate over a copy: removing from the list that is being
        # iterated would skip the motif following each removed one.
        for motif in list(motifs):
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if stats:
                f.write("%s\t%s\n" % (motif.id, "\t".join([str(stats[k]) for k in stat_keys])))
            else:
                self.logger.error("No stats for motif {0}, skipping this motif!".format(motif.id))
                motifs.remove(motif)

    self.motifs_with_stats = motifs

    tools = list(dict((m.id.split("_")[0], 1) for m in motifs).keys())
    with open(self.ranks_file, "w") as f:
        f.write("Metric\tType\t%s\n" % ("\t".join(tools)))
        for stat in ["mncp", "roc_auc", "maxenr"]:
            best_motif = {}
            for motif in self.motifs_with_stats:
                val = result.stats["%s_%s" % (motif.id, motif.to_consensus())][stat]
                name = motif.id.split("_")[0]
                if val > best_motif.setdefault(name, 0):
                    best_motif[name] = val
            names = list(best_motif.keys())
            vals = [best_motif[name] for name in names]
            rank = rankdata(vals)
            ind = [names.index(x) for x in tools]

            f.write("%s\t%s\t%s\n" % (stat, "value", "\t".join([str(vals[i]) for i in ind])))
            f.write("%s\t%s\t%s\n" % (stat, "rank", "\t".join([str(rank[i]) for i in ind])))

    # Determine significant motifs
    nsig = 0
    with open(self.significant_pfm, "w") as f:
        for motif in motifs:
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if (stats["maxenr"] >= 3 and stats["roc_auc"] >= 0.55
                    and stats['enr_fdr'] >= 2):
                f.write("%s\n" % motif.to_pfm())
                nsig += 1
    self.logger.info("%s motifs are significant, written to %s" % (nsig, self.significant_pfm))

    if nsig == 0:
        self.logger.info("No significant motifs found. Done.")
        sys.exit()

    # ROC metrics of significant motifs
    for bg in background:
        self._roc_metrics(self.significant_pfm, self.validation_fa,
                          self.bg_file["fa"][bg], self.bg_file["roc"][bg])

    # Cluster significant motifs
    clusters = self._cluster_motifs(self.significant_pfm, self.cluster_pwm,
                                    self.outdir, params["cluster_threshold"])

    # Determine best motif in cluster
    num_cluster, best_id = self._determine_best_motif_in_cluster(
        clusters, self.final_pwm, self.validation_fa, bg_file, self.imgdir)

    ### Enable parallel and modular evaluation of results
    # Scan (multiple) files with motifs
    # Define callback functions once scanning is finished:
    #    - ROC plot
    #    - Statistics
    #    - Location plots (histogram)
    #    -

    # Stars
    tmp = NamedTemporaryFile(dir=mytmpdir()).name
    p = PredictionResult(tmp, logger=self.logger, job_server=self.server,
                         fg_file=self.validation_fa, bg_file=bg_file)
    p.add_motifs(("Clustering", (pwmfile_to_motifs(self.final_pwm), "", "")))
    # Wait until statistics are available for every motif.
    while len(p.stats.keys()) < len(p.motifs):
        sleep(5)

    for mid, num in num_cluster.items():
        p.stats[mid]["numcluster"] = num

    all_stats = {
        "mncp": [2, 5, 8],
        "roc_auc": [0.6, 0.75, 0.9],
        "maxenr": [10, 20, 30],
        "enr_fdr": [4, 8, 12],
        "fraction": [0.4, 0.6, 0.8],
        "ks_sig": [4, 7, 10],
        "numcluster": [3, 6, 9],
    }

    # ROC plots
    for bg in background:
        self.create_roc_plots(self.final_pwm, self.validation_fa,
                              self.bg_file["fa"][bg], bg)

    # Location plots
    self.logger.info("Creating localization plots")
    motifs = pwmfile_to_motifs(self.final_pwm)
    for motif in motifs:
        m = "%s_%s" % (motif.id, motif.to_consensus())
        s = p.stats[m]
        outfile = os.path.join(self.imgdir, "%s_histogram.svg" % motif.id)
        motif_localization(self.location_fa, motif, lwidth, outfile,
                           cutoff=s["cutoff_fdr"])

        # Mean star score over all statistics, rounded half up.
        s["stars"] = int(mean([star(s[x], all_stats[x]) for x in all_stats.keys()]) + 0.5)
        self.logger.debug("Motif %s: %s stars" % (m, s["stars"]))

    # Calculate enrichment of final, clustered motifs
    self.calculate_cluster_enrichment(self.final_pwm, background)

    # Create report
    self.print_params()
    self._calc_report_values(self.final_pwm, background)
    self._create_report(self.final_pwm, background, stats=p.stats,
                        best_id=best_id)
    self._create_text_report(self.final_pwm, background)
    self.logger.info("Open %s in your browser to see your results." % (self.motif_report))

    if not params["keep_intermediate"]:
        self.logger.info("Deleting intermediate files. Please specifify the -k option if you want to keep these files.")
        shutil.rmtree(self.tmpdir)

    self.logger.info("Done")
def gimme_motifs(inputfile, outdir, params=None, filter_significant=True, cluster=True, create_report=True):
    """De novo motif prediction based on an ensemble of different tools.

    Parameters
    ----------
    inputfile : str
        Filename of input. Can be either BED, narrowPeak or FASTA.

    outdir : str
        Name of output directory. If None, a directory named
        "gimmemotifs_<dd_mm_YYYY>" (today's date) is used.

    params : dict, optional
        Optional parameters.

    filter_significant : bool, optional
        Filter motifs for significance using the validation set.

    cluster : bool, optional
        Cluster similar predicted (and significant) motifs.

    create_report : bool, optional
        Create output reports (both .txt and .html).

    Returns
    -------
    motifs : list
        List of predicted motifs. An empty list is returned when no motifs
        are predicted or when none pass the significance filter.

    Examples
    --------
    >>> from gimmemotifs.denovo import gimme_motifs
    >>> gimme_motifs("input.fa", "motifs.out")
    """
    if outdir is None:
        outdir = "gimmemotifs_{}".format(datetime.date.today().strftime("%d_%m_%Y"))

    # Create output directories
    tmpdir = os.path.join(outdir, "intermediate")
    for d in [outdir, tmpdir]:
        if not os.path.exists(d):
            os.mkdir(d)

    # Log to file: a DEBUG-level handler is attached to the "gimme" logger.
    # NOTE(review): the handler is never removed, so repeated calls appear to
    # stack handlers on the same logger -- confirm whether that is intended.
    logger = logging.getLogger("gimme")
    logfile = os.path.join(outdir, "gimmemotifs.log")
    fh = logging.FileHandler(logfile, "w")
    fh.setLevel(logging.DEBUG)
    file_formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    fh.setFormatter(file_formatter)
    logger.addHandler(fh)
    logger = logging.getLogger("gimme.denovo.gimme_motifs")

    # Initialize parameters
    params = parse_denovo_params(params)

    # Check the input files
    input_type, background = check_denovo_input(inputfile, params)

    logger.info("starting full motif analysis")
    logger.debug("Using temporary directory %s", mytmpdir())

    # Create the necessary files for motif prediction and validation
    if input_type == "bed":
        prepare_denovo_input_bed(inputfile, params, tmpdir)
    elif input_type == "narrowpeak":
        prepare_denovo_input_narrowpeak(inputfile, params, tmpdir)
    elif input_type == "fasta":
        prepare_denovo_input_fa(inputfile, params, tmpdir)
    else:
        logger.error("Unknown input file.")
        sys.exit(1)

    # Create the background FASTA files
    background = create_backgrounds(
        tmpdir,
        background,
        params.get("genome", None),
        params["width"],
        params.get("custom_background", None)
    )

    # Predict de novo motifs
    result = predict_motifs(
        os.path.join(tmpdir, "prediction.fa"),
        os.path.join(tmpdir, "prediction.bg.fa"),
        os.path.join(tmpdir, "all_motifs.pfm"),
        params=params,
        stats_fg=os.path.join(tmpdir, 'validation.fa'),
        stats_bg=background,
    )

    if len(result.motifs) == 0:
        logger.info("finished")
        return []

    # Write statistics
    stats_file = os.path.join(tmpdir, "stats.{}.txt")
    write_stats(result.stats, stats_file)

    # Background with the lowest BG_RANK value is used for filtering.
    bg = sorted(background, key=lambda x: BG_RANK[x])[0]
    if filter_significant:
        motifs = filter_significant_motifs(
            os.path.join(tmpdir, "significant_motifs.pfm"),
            result,
            bg)
        if len(motifs) == 0:
            logger.info("no significant motifs")
            # Return an empty list (not None) so the return type matches the
            # documented contract and the "no motifs predicted" branch above.
            return []

        pwmfile = os.path.join(tmpdir, "significant_motifs.pfm")
    else:
        logger.info("not filtering for significance")
        motifs = result.motifs
        pwmfile = os.path.join(tmpdir, "all_motifs.pfm")

    if cluster:
        clusters = cluster_motifs_with_report(
            pwmfile,
            os.path.join(tmpdir, "clustered_motifs.pfm"),
            outdir,
            0.95,
            title=inputfile)

        # Determine best motif in cluster
        best_motifs = best_motif_in_cluster(
            pwmfile,
            os.path.join(tmpdir, "clustered_motifs.pfm"),
            clusters,
            os.path.join(tmpdir, 'validation.fa'),
            background,
            result.stats)

        final_motifs, stats = rename_motifs(best_motifs, result.stats)
    else:
        logger.info("not clustering")
        rank = rank_motifs(result.stats)
        sorted_motifs = sorted(motifs, key=lambda x: rank[str(x)], reverse=True)
        final_motifs, stats = rename_motifs(sorted_motifs, result.stats)

    with open(os.path.join(outdir, "motifs.pwm"), "w") as f:
        for m in final_motifs:
            f.write("{}\n".format(m.to_pwm()))

    if create_report:
        # Map every background name to its FASTA file in the intermediate dir.
        bg_files = {
            b: os.path.join(tmpdir, "bg.{}.fa".format(b)) for b in background
        }

        create_denovo_motif_report(
            inputfile,
            os.path.join(outdir, "motifs.pwm"),
            os.path.join(tmpdir, "validation.fa"),
            bg_files,
            os.path.join(tmpdir, "localization.fa"),
            outdir,
            params,
            stats,
        )

    with open(os.path.join(outdir, "params.txt"), "w") as f:
        for k, v in params.items():
            f.write("{}\t{}\n".format(k, v))

    if not params.get("keep_intermediate"):
        logger.debug(
            "Deleting intermediate files. "
            "Please specifify the -k option if you want to keep these files.")
        shutil.rmtree(tmpdir)

    logger.info("finished")
    logger.info("output dir: %s", outdir)
    if cluster:
        logger.info("report: %s", os.path.join(outdir, "motif_report.html"))

    return final_motifs
def gimme_motifs(
    inputfile,
    outdir,
    params=None,
    filter_significant=True,
    cluster=True,
    create_report=True,
):
    """De novo motif prediction based on an ensemble of different tools.

    Parameters
    ----------
    inputfile : str
        Filename of input. Can be either BED, narrowPeak or FASTA.

    outdir : str
        Name of output directory. If None, a directory named
        "gimmemotifs_<dd_mm_YYYY>" (today's date) is used.

    params : dict, optional
        Optional parameters.

    filter_significant : bool, optional
        Filter motifs for significance using the validation set.

    cluster : bool, optional
        Cluster similar predicted (and significant) motifs.

    create_report : bool, optional
        Create output reports (both .txt and .html).

    Returns
    -------
    motifs : list
        List of predicted motifs. An empty list is returned when no motifs
        are predicted or when none pass the significance filter.

    Examples
    --------
    >>> from gimmemotifs.denovo import gimme_motifs
    >>> gimme_motifs("input.fa", "motifs.out")
    """
    if outdir is None:
        outdir = "gimmemotifs_{}".format(
            datetime.date.today().strftime("%d_%m_%Y"))

    # Create output directories
    tmpdir = os.path.join(outdir, "intermediate")
    for d in [outdir, tmpdir]:
        if not os.path.exists(d):
            os.mkdir(d)

    # Log to file: a DEBUG-level handler is attached to the "gimme" logger.
    # NOTE(review): the handler is never removed, so repeated calls appear to
    # stack handlers on the same logger -- confirm whether that is intended.
    logger = logging.getLogger("gimme")
    logfile = os.path.join(outdir, "gimmemotifs.log")
    fh = logging.FileHandler(logfile, "w")
    fh.setLevel(logging.DEBUG)
    file_formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    fh.setFormatter(file_formatter)
    logger.addHandler(fh)
    logger = logging.getLogger("gimme.denovo")

    # Initialize parameters
    params = parse_denovo_params(params)

    # Check the input files
    input_type, background = check_denovo_input(inputfile, params)

    logger.info("starting full motif analysis")
    logger.debug("Using temporary directory %s", mytmpdir())

    params["size"] = int(params["size"])
    if params["size"] > 0:
        logger.info(
            "using size of %s, set size to 0 to use original region size",
            params["size"])
    else:
        logger.info("using original size")

    # Create the necessary files for motif prediction and validation
    if input_type == "bed":
        logger.info("preparing input from BED")
        prepare_denovo_input_bed(inputfile, params, tmpdir)
    elif input_type == "narrowpeak":
        logger.info("preparing input from narrowPeak")
        prepare_denovo_input_narrowpeak(inputfile, params, tmpdir)
    elif input_type == "fasta":
        logger.info("preparing input from FASTA")
        prepare_denovo_input_fa(inputfile, params, tmpdir)
    else:
        logger.error("unknown input file format!")
        sys.exit(1)

    # Create the background FASTA files
    background = create_backgrounds(
        tmpdir,
        background,
        params.get("genome", None),
        params["size"],
        params.get("custom_background", None),
    )

    # Predict de novo motifs
    result = predict_motifs(
        os.path.join(tmpdir, "prediction.fa"),
        os.path.join(tmpdir, "prediction.bg.fa"),
        os.path.join(tmpdir, "all_motifs.pfm"),
        params=params,
        stats_fg=os.path.join(tmpdir, "validation.fa"),
        stats_bg=background,
    )

    if len(result.motifs) == 0:
        logger.info("finished")
        return []

    # Write statistics
    stats_file = os.path.join(tmpdir, "stats.{}.txt")
    write_stats(result.stats, stats_file)

    # Background with the lowest BG_RANK value is used for filtering.
    bg = sorted(background, key=lambda x: BG_RANK[x])[0]
    if filter_significant:
        motifs = filter_significant_motifs(
            os.path.join(tmpdir, "significant_motifs.pfm"), result, bg)
        if len(motifs) == 0:
            logger.info("no significant motifs")
            # Return an empty list (not None) so the return type matches the
            # documented contract and the "no motifs predicted" branch above.
            return []

        pfmfile = os.path.join(tmpdir, "significant_motifs.pfm")
    else:
        logger.info("not filtering for significance")
        motifs = result.motifs
        pfmfile = os.path.join(tmpdir, "all_motifs.pfm")

    if cluster:
        clusters = cluster_motifs_with_report(
            pfmfile,
            os.path.join(tmpdir, "clustered_motifs.pfm"),
            outdir,
            0.95,
            title=inputfile,
        )

        # Determine best motif in cluster.
        # "genome" is read with .get() to match the create_backgrounds call
        # above, which already treats it as optional.
        # NOTE(review): confirm best_motif_in_cluster accepts genome=None.
        best_motifs = best_motif_in_cluster(
            pfmfile,
            os.path.join(tmpdir, "clustered_motifs.pfm"),
            clusters,
            os.path.join(tmpdir, "validation.fa"),
            background,
            params.get("genome", None),
            result.stats,
        )

        final_motifs, stats = rename_motifs(best_motifs, result.stats)
    else:
        logger.info("not clustering")
        rank = rank_motifs(result.stats)
        sorted_motifs = sorted(motifs, key=lambda x: rank[str(x)], reverse=True)
        final_motifs, stats = rename_motifs(sorted_motifs, result.stats)

    with open(os.path.join(outdir, "gimme.denovo.pfm"), "w") as f:
        for m in final_motifs:
            f.write("{}\n".format(m.to_pwm()))

    if create_report:
        # Map every background name to its FASTA file in the intermediate dir.
        bg_files = {
            b: os.path.join(tmpdir, "bg.{}.fa".format(b)) for b in background
        }

        create_denovo_motif_report(
            inputfile,
            os.path.join(outdir, "gimme.denovo.pfm"),
            os.path.join(tmpdir, "validation.fa"),
            bg_files,
            os.path.join(tmpdir, "localization.fa"),
            outdir,
            params,
            stats,
        )

    with open(os.path.join(outdir, "params.txt"), "w") as f:
        for k, v in params.items():
            f.write("{}\t{}\n".format(k, v))

    if not params.get("keep_intermediate"):
        logger.debug(
            "Deleting intermediate files. "
            "Please specifify the -k option if you want to keep these files.")
        shutil.rmtree(tmpdir)

    logger.info("finished")
    logger.info("output dir: %s", outdir)
    if cluster:
        logger.info("de novo report: %s",
                    os.path.join(outdir, "gimme.denovo.html"))

    return final_motifs
def diff_plot(
    motifs,
    pwms,
    names,
    freq,
    counts,
    bgfreq,
    bgcounts,
    outfile,
    mindiff=0,
    minenr=3,
    minfreq=0.01,
):
    """Plot frequency and enrichment heatmaps for differential motifs.

    Motifs are filtered first; a motif (row) is kept only if, in at least one
    column, its log2 enrichment exceeds ``minenr``, its frequency exceeds
    ``minfreq`` and its count exceeds 2, and if the spread between its
    highest and lowest log2 enrichment exceeds ``mindiff``. One tab-separated
    line per retained motif is written to stderr. If nothing is retained a
    message is written to stderr and no figure is produced.

    Parameters
    ----------
    motifs : sequence
        Motif identifiers, one per row of ``freq``; used as keys into
        ``pwms`` and as y-axis labels.
    pwms : mapping
        Maps a motif identifier to an object providing
        ``plot_logo(fname=..., title=...)``.
    names : sequence of str
        Column labels, one per column of ``freq``.
    freq : array-like, 2-D
        Motif frequency per motif (rows) and sample (columns).
    counts : array-like, 2-D
        Motif counts, filtered together with ``freq``.
    bgfreq : sequence
        Background frequency, one value per motif.
    bgcounts : object
        Not used by this function; kept for interface compatibility.
    outfile : str
        Path of the figure to write (saved at 300 dpi).
    mindiff : float, optional
        Minimum spread in log2 enrichment across columns.
    minenr : float, optional
        Minimum log2 enrichment in at least one column.
    minfreq : float, optional
        Minimum frequency in at least one column.

    Returns
    -------
    None
    """
    # Accept any sequence of names; a list is needed for concatenation below.
    names = list(names)

    w_ratio = np.array([14, len(names), len(names) + 1])
    plot_order = [0, 1, 2]

    nbar = 5

    freq = np.array(freq)
    counts = np.array(counts)
    # Column vector so it broadcasts against freq and can be hstack-ed later.
    bgfreq = np.array([[x] for x in bgfreq])

    enr = np.log2(np.divide(freq, bgfreq))

    filt = np.ones(len(enr), dtype="bool")
    filters = [
        np.sum(enr > minenr, 1) > 0,
        np.sum(freq > minfreq, 1) > 0,
        (np.max(enr, 1) - np.min(enr, 1)) > mindiff,
        np.sum(counts > 2, 1) > 0,
    ]
    for f in filters:
        filt = np.logical_and(filt, f)

    motifs = np.array(motifs)[filt]
    freq = freq[filt]
    bgfreq = bgfreq[filt]
    enr = enr[filt]

    for m, f, b, e in zip(motifs, freq, bgfreq, enr):
        sys.stderr.write("{0}\t{1}\t{2}\t{3}\n".format(
            m,
            "\t".join(str(x) for x in e),
            "\t".join(str(x) for x in f),
            b[0]))

    if len(freq) == 0:
        sys.stderr.write("No enriched and/or differential motifs found.\n")
        return
    elif len(freq) >= 3:
        # Order rows by hierarchical clustering of their frequency profiles.
        z = hier.linkage(freq, method="complete", metric="correlation")
        ind = hier.leaves_list(z)
    else:
        ind = np.arange(len(freq))

    fig = plt.figure(figsize=((5 + 0.75 * len(names)) * 3,
                              (0.3 * len(motifs) + 1.5) * 3))

    gs = GridSpec(
        len(motifs) + 3 + nbar,
        3,
        height_ratios=[1] * nbar + [3] * (len(motifs) + 3),
        width_ratios=w_ratio[plot_order],
    )

    # Colormaps
    c1 = mpl.cm.RdBu
    c2 = mpl.cm.Blues

    # Frequency plot #

    # Create axis
    ax = plt.subplot(gs[nbar:-3, plot_order[2]])

    # Plot frequencies
    vmin = 0
    vmax = 0.3

    pfreq = np.hstack((freq, bgfreq))
    ax.pcolormesh(pfreq[ind], cmap=c2, vmin=vmin, vmax=vmax)

    sm = plt.cm.ScalarMappable(cmap=c2, norm=Normalize(vmin=vmin, vmax=vmax))

    # Show percentages; the text takes the colour of the opposite end of the
    # colormap so it stays readable on the cell background.
    for y, row in enumerate(pfreq[ind]):
        for x, val in enumerate(row):
            v = vmax
            if val >= (vmin + ((vmax - vmin) / 2)):
                v = vmin
            plt.text(
                x + 0.5,
                y + 0.5,
                "{:.1%}".format(val),
                ha="center",
                va="center",
                color=sm.to_rgba(v),
            )

    # Hide most labels
    plt.setp(ax.get_xticklines(), visible=False)
    plt.setp(ax.get_yticklines(), visible=False)
    plt.setp(ax.get_yticklabels(), visible=False)

    # Set the X labels
    ticks = np.arange(len(names) + 1) + 0.5
    plt.xticks(ticks, names + ["background"], rotation=30, ha="right")

    ax.set_ylim(0, len(motifs))

    # Title
    plt.title("Frequency")

    # Colorbar
    # pylint: disable=protected-access
    sm._A = []
    cax = plt.subplot(gs[0, plot_order[2]])
    cb = fig.colorbar(sm, cax=cax, ticks=[0, 0.3], orientation="horizontal")
    cb.ax.set_xticklabels(["0%", "30%"])

    # Enrichment plot
    ax = plt.subplot(gs[nbar:-3, plot_order[1]])
    vmin = -10
    vmax = 10
    ax.pcolormesh(enr[ind], cmap=c1, vmin=vmin, vmax=vmax)
    for y, row in enumerate(enr[ind]):
        for x, val in enumerate(row):
            # White text on the outer eighths of the colour range.
            col = "black"
            if val >= (vmin + ((vmax - vmin) / 8.0 * 7)):
                col = "white"
            elif val <= (vmin + ((vmax - vmin) / 8.0)):
                col = "white"
            plt.text(
                x + 0.5,
                y + 0.5,
                "{:.1f}".format(val),
                ha="center",
                va="center",
                color=col,
            )

    ticks = np.arange(len(names)) + 0.5
    plt.xticks(ticks, names, rotation=30, ha="right")
    ticks = np.arange(len(motifs)) + 0.5
    plt.yticks(ticks, motifs[ind])
    plt.setp(ax.get_xticklines(), visible=False)
    plt.setp(ax.get_yticklines(), visible=False)

    ax.set_ylim(0, len(motifs))

    # Title
    plt.title("Enrichment (log2)")

    # Colorbar
    sm = plt.cm.ScalarMappable(cmap=c1, norm=Normalize(vmin=vmin, vmax=vmax))
    sm._A = []
    cax = plt.subplot(gs[0, plot_order[1]])
    cb = fig.colorbar(sm, cax=cax, ticks=[vmin, 0, vmax],
                      orientation="horizontal")
    cb.ax.set_xticklabels([vmin, 0, vmax])

    # Motif logos; the order is reversed because grid rows run top-down
    # while the heatmap rows are drawn bottom-up.
    for i, motif in enumerate(motifs[ind][::-1]):
        ax = plt.subplot(gs[i + nbar, plot_order[0]])
        axes_off(ax)
        # The image is read into memory before the temp file is removed.
        with NamedTemporaryFile(dir=mytmpdir(), suffix=".png") as tmp:
            pwms[motif].plot_logo(fname=tmp.name, title=False)
            ax.imshow(plt.imread(tmp.name), interpolation="none")

    plt.savefig(outfile, dpi=300, bbox_inches="tight")
    plt.close(fig)
def diff_plot(motifs, pwms, names, freq, counts, bgfreq, bgcounts, outfile, mindiff=0, minenr=3, minfreq=0.01):
    """Plot frequency and enrichment heatmaps for differential motifs.

    Motifs are filtered first; a motif (row) is kept only if, in at least one
    column, its log2 enrichment exceeds ``minenr``, its frequency exceeds
    ``minfreq`` and its count exceeds 2, and if the spread between its
    highest and lowest log2 enrichment exceeds ``mindiff``. The number of
    retained motifs is written to stdout and one tab-separated line per
    retained motif is written to stderr. If nothing is retained a message is
    written to stderr and no figure is produced.

    Parameters
    ----------
    motifs : sequence
        Motif identifiers, one per row of ``freq``; used as keys into
        ``pwms`` and as y-axis labels.
    pwms : mapping
        Maps a motif identifier to an object providing ``to_img``.
    names : sequence of str
        Column labels, one per column of ``freq``.
    freq : array-like, 2-D
        Motif frequency per motif (rows) and sample (columns).
    counts : array-like, 2-D
        Motif counts, filtered together with ``freq``.
    bgfreq : sequence
        Background frequency, one value per motif.
    bgcounts : object
        Not used by this function; kept for interface compatibility.
    outfile : str
        Path of the figure to write (saved at 300 dpi).
    mindiff : float, optional
        Minimum spread in log2 enrichment across columns.
    minenr : float, optional
        Minimum log2 enrichment in at least one column.
    minfreq : float, optional
        Minimum frequency in at least one column.

    Returns
    -------
    None
    """
    # Accept any sequence of names; a list is needed for concatenation below.
    names = list(names)

    w_ratio = np.array([14, len(names), len(names) + 1])
    plot_order = [0, 1, 2]

    nbar = 5

    freq = np.array(freq)
    counts = np.array(counts)
    # Column vector so it broadcasts against freq and can be hstack-ed later.
    bgfreq = np.array([[x] for x in bgfreq])

    enr = np.log2(np.divide(freq, bgfreq))

    filt = np.ones(len(enr), dtype="bool")
    filters = [
        np.sum(enr > minenr, 1) > 0,
        np.sum(freq > minfreq, 1) > 0,
        (np.max(enr, 1) - np.min(enr, 1)) > mindiff,
        np.sum(counts > 2, 1) > 0
    ]
    for f in filters:
        filt = np.logical_and(filt, f)

    # Same bytes as the former Python 2 statement `print "Filter: ", n`,
    # written so the module also parses under Python 3.
    sys.stdout.write("Filter:  %s\n" % sum(filt))

    motifs = np.array(motifs)[filt]
    freq = freq[filt]
    bgfreq = bgfreq[filt]
    enr = enr[filt]

    for m, f, b, e in zip(motifs, freq, bgfreq, enr):
        sys.stderr.write("{0}\t{1}\t{2}\t{3}\n".format(m, f, b, e))

    if len(freq) == 0:
        sys.stderr.write("No enriched and/or differential motifs found.\n")
        return
    elif len(freq) >= 3:
        # Order rows by hierarchical clustering of their frequency profiles.
        z = hier.linkage(freq, method="complete", metric="correlation")
        ind = hier.leaves_list(z)
    else:
        ind = np.arange(len(freq))

    fig = plt.figure(figsize=(
        (5 + 0.75 * len(names)) * 3,
        (0.3 * len(motifs) + 1.5) * 3
    ))

    gs = GridSpec(len(motifs) + 3 + nbar, 3,
                  height_ratios=[1] * nbar + [3] * (len(motifs) + 3),
                  width_ratios=w_ratio[plot_order],
                  )

    # Colormaps
    c1 = mpl.cm.RdBu
    c2 = mpl.cm.Blues

    ### Frequency plot ###

    # Create axis
    ax = plt.subplot(gs[nbar:-3, plot_order[2]])

    # Plot frequencies
    vmin = 0
    vmax = 0.3

    pfreq = np.hstack((freq, bgfreq))
    ax.pcolormesh(pfreq[ind], cmap=c2, vmin=vmin, vmax=vmax)

    sm = plt.cm.ScalarMappable(cmap=c2, norm=mpl.colors.Normalize(vmin=vmin, vmax=vmax))

    # Show percentages; the text takes the colour of the opposite end of the
    # colormap so it stays readable on the cell background.
    for y, row in enumerate(pfreq[ind]):
        for x, val in enumerate(row):
            v = vmax
            if val >= (vmin + ((vmax - vmin) / 2)):
                v = vmin
            plt.text(x + 0.5, y + 0.5, "{:.1%}".format(val), ha='center', va='center', color=sm.to_rgba(v))

    # Hide most labels
    plt.setp(ax.get_xticklines(), visible=False)
    plt.setp(ax.get_yticklines(), visible=False)
    plt.setp(ax.get_yticklabels(), visible=False)

    # Set the X labels
    ticks = np.arange(len(names) + 1) + 0.5
    plt.xticks(ticks, names + ["background"], rotation=30, ha="right")

    ax.set_ylim(0, len(motifs))

    # Title
    plt.title('Frequency')

    # Colorbar
    sm._A = []
    cax = plt.subplot(gs[0, plot_order[2]])
    cb = fig.colorbar(sm, cax=cax, ticks=[0, 0.3], orientation='horizontal')
    cb.ax.set_xticklabels(["0%", "30%"])

    #### Enrichment plot
    ax = plt.subplot(gs[nbar:-3, plot_order[1]])
    vmin = -10
    vmax = 10
    ax.pcolormesh(enr[ind], cmap=c1, vmin=vmin, vmax=vmax)
    for y, row in enumerate(enr[ind]):
        for x, val in enumerate(row):
            # White text on the outer eighths of the colour range.
            col = "black"
            if val >= (vmin + ((vmax - vmin) / 8.0 * 7)):
                col = "white"
            elif val <= (vmin + ((vmax - vmin) / 8.0)):
                col = "white"
            plt.text(x + 0.5, y + 0.5, "{:.1f}".format(val), ha='center', va='center', color=col)

    ticks = np.arange(len(names)) + 0.5
    plt.xticks(ticks, names, rotation=30, ha="right")
    ticks = np.arange(len(motifs)) + 0.5
    plt.yticks(ticks, motifs[ind])
    plt.setp(ax.get_xticklines(), visible=False)
    plt.setp(ax.get_yticklines(), visible=False)

    ax.set_ylim(0, len(motifs))

    # Title
    plt.title('Enrichment (log2)')

    # Colorbar
    sm = plt.cm.ScalarMappable(cmap=c1, norm=mpl.colors.Normalize(vmin=vmin, vmax=vmax))
    sm._A = []
    cax = plt.subplot(gs[0, plot_order[1]])
    cb = fig.colorbar(sm, cax=cax, ticks=[vmin, 0, vmax], orientation='horizontal')
    cb.ax.set_xticklabels([vmin, 0, vmax])

    #### Motif logos; the order is reversed because grid rows run top-down
    #### while the heatmap rows are drawn bottom-up.
    for i, motif in enumerate(motifs[ind][::-1]):
        ax = plt.subplot(gs[i + nbar, plot_order[0]])
        axes_off(ax)
        # The image is read into memory before the temp file is removed.
        with NamedTemporaryFile(dir=mytmpdir(), suffix=".png") as tmp:
            # The image format is passed positionally so the call works
            # whether to_img names its second parameter `fmt` or `format`.
            pwms[motif].to_img(tmp.name, "PNG", height=6)
            ax.imshow(plt.imread(tmp.name), interpolation="none")

    plt.savefig(outfile, dpi=300, bbox_inches='tight')
    plt.close(fig)
def to_img(self, fname, fmt="PNG", add_left=0, seqlogo=None, height=6):
    """Create a sequence logo using seqlogo.

    Create a sequence logo and save it to a file. Valid formats are: PNG,
    EPS, GIF and PDF.

    The motif is converted to a fixed number of pseudo-sequences whose
    per-position nucleotide counts follow the motif frequencies; these are
    written to a temporary file and handed to the seqlogo executable.

    Parameters
    ----------
    fname : str
        Output filename. A trailing extension equal to the format
        (e.g. ".png") is stripped before it is passed to seqlogo.
    fmt : str , optional
        Output format (case-insensitive). Valid formats are PNG, EPS, GIF
        and PDF. For any other value a message is written to stderr and
        nothing is created.
    add_left : int , optional
        Pad motif with empty positions on the left side.
    seqlogo : str
        Location of the seqlogo executable. By default the seqlogo version
        that is included with GimmeMotifs is used.
    height : float
        Height of the image

    Raises
    ------
    ValueError
        If no seqlogo executable is given and none is configured.
    """
    if not seqlogo:
        seqlogo = self.seqlogo
    if not seqlogo:
        raise ValueError("seqlogo not specified or configured")

    # TODO: split to_align function

    VALID_FORMATS = ["EPS", "GIF", "PDF", "PNG"]
    N = 1000  # number of pseudo-sequences used to represent the motif
    fmt = fmt.upper()
    if fmt not in VALID_FORMATS:
        sys.stderr.write("Invalid motif format\n")
        return

    if fname[-4:].upper() == (".%s" % fmt):
        fname = fname[:-4]

    if add_left == 0:
        seqs = ["" for _ in range(N)]
    else:
        # Uniform padding: a quarter of the sequences per nucleotide.
        seqs = []
        for nuc in ["A", "C", "T", "G"]:
            seqs += [nuc * add_left for _ in range(N // 4)]

    for pos in range(len(self.pwm)):
        # Cumulative sequence counts for A, C, G, T at this position.
        vals = [self.pwm[pos][0] * N]
        for i in range(1, 4):
            vals.append(vals[i - 1] + self.pwm[pos][i] * N)
        if vals[3] - N != 0:
            # Weights do not add up to exactly 1; let T absorb the remainder.
            vals[3] = N

        for i in range(N):
            # Strict comparison: sequence indices start at 0, so `i < count`
            # assigns exactly `count` sequences. With `<=` the first
            # nucleotide always received one extra sequence, even when its
            # frequency was 0.
            if i < vals[0]:
                seqs[i] += "A"
            elif i < vals[1]:
                seqs[i] += "C"
            elif i < vals[2]:
                seqs[i] += "G"
            elif i < vals[3]:
                seqs[i] += "T"

    # The temp file has to exist while seqlogo runs; it is removed on exit.
    with NamedTemporaryFile(mode="w", dir=mytmpdir()) as f:
        for seq in seqs:
            f.write("%s\n" % seq)
        f.flush()

        # Argument list instead of a shell string, so file names containing
        # spaces or shell metacharacters are passed through unchanged.
        cmd = [
            seqlogo,
            "-f", f.name,
            "-F", fmt,
            "-c",
            "-a",
            "-h", str(height),
            "-w", str(len(self) + add_left),
            "-o", fname,
            "-b",
            "-n",
            "-Y",
        ]
        sp.call(cmd)