def prepare_input_bed(self, inputfile, organism="hg18", width=200, fraction=0.2, abs_max=1000, use_strand=False):
    """Generate the BED and FASTA files used for motif prediction and validation.

    The input peaks are rewritten to ``self.input_bed`` with a fixed width,
    split over ``self.prediction_bed`` and ``self.validation_bed``, and both
    sets are converted to FASTA (``self.prediction_fa`` and
    ``self.validation_fa``) using the genome index of ``organism``.

    Parameters
    ----------
    inputfile : path of the input BED file; also stored as ``self.inputfile``.
    organism : name of the index sub-directory below the configured index dir.
    width : region width handed to ``write_equalwidth_bedfile`` (cast to int).
    fraction, abs_max : split settings handed to ``divide_file``
        (cast to float and int respectively).
    use_strand : forwarded to ``genome_index.track2fasta`` (cast to bool).
    """
    self.inputfile = inputfile

    # Normalise the option types before handing them to the helpers.
    peak_width = int(width)
    split_fraction = float(fraction)
    split_max = int(abs_max)
    stranded = bool(use_strand)

    log = self.logger
    log.info("Preparing input (BED)")

    # Give every peak the same width.
    log.debug("Creating inputfile %s, width %s" % (self.input_bed, peak_width))
    write_equalwidth_bedfile(inputfile, peak_width, self.input_bed)

    # Divide the equal-width peaks over a prediction and a validation set.
    split_msg = "Splitting %s into prediction set (%s) and validation set (%s)"
    log.debug(split_msg % (self.input_bed, self.prediction_bed, self.validation_bed))
    counts = divide_file(self.input_bed, self.prediction_bed, self.validation_bed, split_fraction, split_max)
    self.prediction_num, self.validation_num = counts

    # Extract the sequences of both sets from the genome index.
    index_dir = os.path.join(self.config.get_index_dir(), organism)
    bed_fasta_pairs = (
        (self.prediction_bed, self.prediction_fa),
        (self.validation_bed, self.validation_fa),
    )
    for bed_path, fasta_path in bed_fasta_pairs:
        log.debug("Creating %s" % (fasta_path))
        genome_index.track2fasta(index_dir, bed_path, fasta_path, use_strand=stranded, ignore_missing=True)
def as_fasta(seqs, index_dir=None):
    """Return ``seqs`` as FASTA, converting from BED or regions when needed.

    The kind of input is determined by ``get_seqs_type``:

    * ``"fasta"``     -- ``seqs`` is returned unchanged.
    * ``"fastafile"`` -- ``Fasta(seqs)`` is returned.
    * ``"bedfile"``   -- the BED file is converted with ``track2fasta``.
    * ``"regionfile"``-- one region per line is read from the file.
    * anything else   -- ``seqs`` is iterated as region strings.

    Regions are split on ``:`` and ``-`` and their first three fields are
    written as a BED line (presumably ``chrom:start-end`` -- confirm against
    callers).  Empty region strings are skipped.

    Parameters
    ----------
    seqs : FASTA object, file name, or iterable of region strings.
    index_dir : genome index directory; required for every conversion.

    Raises
    ------
    ValueError
        If a conversion is needed and ``index_dir`` is None.
    """
    ftype = get_seqs_type(seqs)
    if ftype == "fasta":
        return seqs
    if ftype == "fastafile":
        return Fasta(seqs)

    if index_dir is None:
        raise ValueError("need index_dir / genome to convert to FASTA")

    # The temporary files are removed deterministically when the ``with``
    # blocks exit, so the Fasta object must be built while tmpfa still exists.
    with NamedTemporaryFile() as tmpfa:
        if ftype == "bedfile":
            track2fasta(index_dir, seqs, tmpfa.name)
        else:
            if ftype == "regionfile":
                with open(seqs) as region_handle:
                    seqs = [line.strip() for line in region_handle]

            # Text mode: the BED lines are written as str.
            with NamedTemporaryFile(mode="w") as tmpbed:
                for seq in seqs:
                    if not seq:
                        # Blank lines carry no region.
                        continue
                    vals = re.split(r'[:-]', seq)
                    tmpbed.write("{}\t{}\t{}\n".format(*vals))
                # Make sure everything is on disk before it is read by name.
                tmpbed.flush()
                track2fasta(index_dir, tmpbed.name, tmpfa.name)
        return Fasta(tmpfa.name)
def prepare_input_bed(self, inputfile, organism="hg18", width=200, fraction=0.2, abs_max=1000, use_strand=False):
    """Prepare the BED and FASTA files for motif prediction and validation.

    Unless ``self.weird`` is set, the input peaks are written with a fixed
    width to ``self.input_bed`` and split over ``self.prediction_bed`` and
    ``self.validation_bed``.  Both BED files are then always converted to
    FASTA with the genome index of ``organism``.

    Parameters
    ----------
    inputfile : path of the input BED file; also stored as ``self.inputfile``.
    organism : name of the index sub-directory below the configured index dir.
    width : region width for ``write_equalwidth_bedfile`` (cast to int).
    fraction, abs_max : split settings for ``divide_file``
        (cast to float and int respectively).
    use_strand : forwarded to ``genome_index.track2fasta`` (cast to bool).
    """
    self.inputfile = inputfile

    # Coerce the options up front.
    n_width = int(width)
    n_fraction = float(fraction)
    n_abs_max = int(abs_max)
    with_strand = bool(use_strand)

    self.logger.info("Preparing input (BED)")

    # Step 1: equal-width peaks.
    width_msg = "Creating inputfile %s, width %s" % (self.input_bed, n_width)
    self.logger.debug(width_msg)
    if not self.weird:
        write_equalwidth_bedfile(inputfile, n_width, self.input_bed)

    # Step 2: prediction / validation split.
    split_names = (self.input_bed, self.prediction_bed, self.validation_bed)
    self.logger.debug("Splitting %s into prediction set (%s) and validation set (%s)" % split_names)
    if not self.weird:
        split_result = divide_file(
            self.input_bed,
            self.prediction_bed,
            self.validation_bed,
            n_fraction,
            n_abs_max,
        )
        self.prediction_num, self.validation_num = split_result

    # Step 3: FASTA files for both sets.
    genome_dir = os.path.join(self.config.get_index_dir(), organism)

    self.logger.debug("Creating %s" % (self.prediction_fa))
    genome_index.track2fasta(
        genome_dir, self.prediction_bed, self.prediction_fa, use_strand=with_strand
    )

    self.logger.debug("Creating %s" % (self.validation_fa))
    genome_index.track2fasta(
        genome_dir, self.validation_bed, self.validation_fa, use_strand=with_strand
    )
def create_location_plots(self, motif_file, fasta_file, params):
    """Create a localization histogram (SVG) for every motif in ``motif_file``.

    The validation regions are first extended on both sides and written to
    ``self.location_fa``; then one ``<motif id>_histogram.svg`` is drawn in
    ``self.imgdir`` per motif.

    Parameters
    ----------
    motif_file : file read with ``pwmfile_to_motifs``.
    fasta_file : FASTA file handed to ``motif_localization``.
    params : mapping providing ``"genome"``, ``"lwidth"``, ``"width"`` and
        ``"use_strand"``.

    NOTE(review): the extended sequences go to ``self.location_fa`` while the
    motifs are scanned in ``fasta_file`` -- confirm callers pass the same file.
    """
    self.logger.info("Creating localization plots")

    index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
    lwidth = int(params["lwidth"])
    width = int(params["width"])

    # Floor division keeps the flank size an integer on every interpreter.
    extend = (lwidth - width) // 2

    genome_index.track2fasta(
        index_dir,
        self.validation_bed,
        self.location_fa,
        extend_up=extend,
        extend_down=extend,
        use_strand=params["use_strand"],
    )

    for motif in pwmfile_to_motifs(motif_file):
        outfile = os.path.join(self.imgdir, "%s_histogram.svg" % motif.id)
        motif_localization(fasta_file, motif, lwidth, outfile)
def __init__(self, bedfile, genefile, index="/usr/share/gimmemotifs/genome_index/hg18", length=None, multiply=10, match_chromosome=True):
    """Initialise the FASTA object from a BED file built by ``_create_bedfile``.

    Parameters
    ----------
    bedfile, genefile, length, multiply : forwarded unchanged to
        ``self._create_bedfile``.
    index : genome index directory used by ``track2fasta``.
    match_chromosome : stored as ``self.match_chromosome`` before the BED
        file is created.

    The intermediate BED and FASTA files are temporary and are removed even
    when one of the steps fails.
    """
    self.match_chromosome = match_chromosome

    # Only the generated names are used; the steps below create the files.
    tmpbed = NamedTemporaryFile().name
    tmpfasta = NamedTemporaryFile().name

    try:
        # Create bed-file with coordinates of random sequences
        self._create_bedfile(tmpbed, bedfile, genefile, length, multiply)

        # Convert track to fasta
        track2fasta(index, tmpbed, tmpfasta)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)
    finally:
        # Clean up whatever was created, also when an exception is on its way.
        for tmp_path in (tmpbed, tmpfasta):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
def __init__(self, index="/usr/share/gimmemotifs/genome_index/hg18", length=None, n=None):
    """Initialise the FASTA object with random genomic sequences.

    Parameters
    ----------
    index : genome index directory used to pick regions and extract sequences.
    length : sequence length; converted with ``int`` so it must be supplied
        (``int(None)`` raises ``TypeError``).
    n : forwarded to ``create_random_genomic_bedfile``.

    The intermediate BED and FASTA files live in ``mytmpdir()`` and are
    removed even when one of the steps fails.
    """
    length = int(length)

    # Only the generated names are used; the steps below create the files.
    tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
    tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name

    try:
        # Create bed-file with coordinates of random sequences
        create_random_genomic_bedfile(tmpbed, index, length, n)

        # Convert track to fasta
        track2fasta(index, tmpbed, tmpfasta, use_strand=True)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)
    finally:
        # Clean up whatever was created, also when an exception is on its way.
        for tmp_path in (tmpbed, tmpfasta):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
def __init__(self, genefile, index="/usr/share/gimmemotifs/genome_index/hg18", length=None, n=None):
    """Initialise the FASTA object from a promoter BED file.

    Parameters
    ----------
    genefile : forwarded to ``self._create_promoter_bedfile``.
    index : genome index directory used by ``track2fasta``.
    length : sequence length; converted with ``int`` so it must be supplied
        (``int(None)`` raises ``TypeError``).
    n : forwarded to ``self._create_promoter_bedfile``.

    The intermediate BED and FASTA files are temporary and are removed even
    when one of the steps fails.
    """
    length = int(length)

    # Only the generated names are used; the steps below create the files.
    tmpbed = NamedTemporaryFile().name
    tmpfasta = NamedTemporaryFile().name

    try:
        # Create bed-file with coordinates of random sequences
        self._create_promoter_bedfile(tmpbed, genefile, length, n)

        # Convert track to fasta (strand-aware)
        track2fasta(index, tmpbed, tmpfasta, use_strand=True)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)
    finally:
        # Clean up whatever was created, also when an exception is on its way.
        for tmp_path in (tmpbed, tmpfasta):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
def __init__(self, matchfile, genome="hg19", number=None):
    """Initialise the FASTA object from a BED file made by ``matched_gc_bedfile``.

    Parameters
    ----------
    matchfile : forwarded to ``matched_gc_bedfile``.
    genome : genome name; the index directory is the sub-directory with this
        name below ``MotifConfig().get_index_dir()``.
    number : forwarded to ``matched_gc_bedfile``.

    The intermediate BED and FASTA files live in ``mytmpdir()`` and are
    removed even when one of the steps fails.
    """
    config = MotifConfig()
    index = os.path.join(config.get_index_dir(), genome)

    # Only the generated names are used; the steps below create the files.
    tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
    tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name

    try:
        # Create bed-file with coordinates of random sequences
        matched_gc_bedfile(tmpbed, matchfile, genome, number)

        # Convert track to fasta
        track2fasta(index, tmpbed, tmpfasta)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)
    finally:
        # Clean up whatever was created, also when an exception is on its way.
        for tmp_path in (tmpbed, tmpfasta):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
def __init__(self, bedfile, genefile, index="/usr/share/gimmemotifs/genome_index/hg18", length=None, multiply=10, match_chromosome=True):
    """Initialise the FASTA object from a BED file built by ``_create_bedfile``.

    Parameters
    ----------
    bedfile, genefile, multiply : forwarded unchanged to
        ``self._create_bedfile``.
    index : genome index directory used by ``track2fasta``.
    length : sequence length; converted with ``int`` so it must be supplied
        (``int(None)`` raises ``TypeError``).
    match_chromosome : stored as ``self.match_chromosome`` before the BED
        file is created.

    The intermediate BED and FASTA files are temporary and are removed even
    when one of the steps fails.
    """
    self.match_chromosome = match_chromosome
    length = int(length)

    # Only the generated names are used; the steps below create the files.
    tmpbed = NamedTemporaryFile().name
    tmpfasta = NamedTemporaryFile().name

    try:
        # Create bed-file with coordinates of random sequences
        self._create_bedfile(tmpbed, bedfile, genefile, length, multiply)

        # Convert track to fasta
        track2fasta(index, tmpbed, tmpfasta)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)
    finally:
        # Clean up whatever was created, also when an exception is on its way.
        for tmp_path in (tmpbed, tmpfasta):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
def create_location_plots(self, motif_file, params):
    """Create a localization histogram (SVG) for every motif in ``motif_file``.

    For BED input the validation regions are extended on both sides and
    written to ``self.location_fa``.  For any other input type
    ``self.validation_fa`` is used directly and the plot width is taken from
    the length of its first sequence.

    Parameters
    ----------
    motif_file : file read with ``pwmfile_to_motifs``.
    params : mapping providing ``"genome"``, ``"lwidth"``, ``"width"`` and
        ``"use_strand"`` (only read for BED input).
    """
    self.logger.info("Creating localization plots")

    if self.input_type == "BED":
        index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
        lwidth = int(params["lwidth"])
        width = int(params["width"])
        # Floor division keeps the flank size an integer on every interpreter.
        extend = (lwidth - width) // 2
        genome_index.track2fasta(
            index_dir,
            self.validation_bed,
            self.location_fa,
            extend_up=extend,
            extend_down=extend,
            use_strand=params["use_strand"],
        )
    else:
        self.location_fa = self.validation_fa
        seqs = Fasta(self.location_fa).seqs
        # The first sequence defines the plot width.
        lwidth = len(seqs[0])
        if not all(len(seq) == lwidth for seq in seqs):
            self.logger.warning("PLEASE NOTE: FASTA file contains sequences of different lengths. Positional preference plots will be incorrect!")

    for motif in pwmfile_to_motifs(motif_file):
        outfile = os.path.join(self.imgdir, "%s_histogram.svg" % motif.id)
        motif_localization(self.location_fa, motif, lwidth, outfile)
def run_full_analysis(self, inputfile, user_params=None):
    """Run the complete motif analysis pipeline on ``inputfile``.

    The input is treated as FASTA when ``Fasta(inputfile)`` can load it and
    as BED otherwise.  The pipeline prepares the prediction and validation
    sequences, creates the backgrounds, predicts motifs, writes the stats
    and ranks tables, selects and clusters the significant motifs, draws ROC
    and localization plots and finally writes the reports.

    Parameters
    ----------
    inputfile : path of the BED or FASTA file to analyse.
    user_params : optional dict with overrides for the defaults returned by
        ``self.config.get_default_params()``.

    Returns
    -------
    ``self.motif_report`` after a complete run, or ``None`` when no motif
    passes the significance thresholds.

    Notes
    -----
    ``sys.exit`` is called when the genome index is missing, when no valid
    background is left, when the input type is unknown and when no motifs
    are predicted.
    """
    self.logger.info("starting full motif analysis")
    self.logger.debug("Using temporary directory {0}".format(mytmpdir()))

    params = self.config.get_default_params()
    if user_params is not None:
        params.update(user_params)

    if params["torque"]:
        from gimmemotifs.prediction_torque import pp_predict_motifs, PredictionResult
        self.logger.debug("Using torque")
    else:
        from gimmemotifs.prediction import pp_predict_motifs, PredictionResult
        self.logger.debug("Using multiprocessing")

    self.params = params
    background = [x.strip() for x in params["background"].split(",")]

    self.logger.debug("Parameters:")
    for param, value in params.items():
        self.logger.debug(" %s: %s", param, value)

    # Input type detection: anything that loads as FASTA is FASTA, everything
    # else is assumed to be BED (deliberately best-effort).
    self.input_type = "BED"
    try:
        Fasta(inputfile)
    except Exception:
        pass
    else:
        self.logger.debug("Inputfile is a FASTA file")
        self.input_type = "FASTA"

    index_msg = ("No index found for genome {}! "
                 "Has GimmeMotifs been configured correctly and is the "
                 "genome indexed?").format(params["genome"])
    index_dir = os.path.join(self.config.get_index_dir(), params["genome"])

    if self.input_type == "FASTA":
        for bg in background:
            if bg not in FA_VALID_BGS:
                self.logger.info(
                    "Input type is FASTA, can't use background type '%s'", bg)
            # A genomic background needs the genome index, also for FASTA.
            if bg == "genomic" and not os.path.exists(index_dir):
                self.logger.error(index_msg)
                sys.exit(1)
        background = [bg for bg in background if bg in FA_VALID_BGS]
    elif self.input_type == "BED":
        # BED input always needs the genome index.
        if not os.path.exists(index_dir):
            self.logger.error(index_msg)
            sys.exit(1)

        # Is it a valid bed-file etc.
        self._check_input(inputfile)

        for bg in background:
            if bg not in BED_VALID_BGS:
                self.logger.info(
                    "Input type is BED, can't use background type '%s'", bg)
        background = [bg for bg in background if bg in BED_VALID_BGS]

    if len(background) == 0:
        self.logger.error("No valid backgrounds specified!")
        sys.exit(1)

    # Maximum time for motif prediction (given in hours, stored in seconds).
    self.max_time = None
    if params["max_time"]:
        try:
            hours = float(params["max_time"])
        except (TypeError, ValueError, OverflowError):
            # Do not fall through to the comparison below with an unparsed
            # value; None cannot be ordered against a number.
            self.logger.debug(
                "Could not parse max_time value, setting to no limit")
        else:
            if hours > 0:
                self.logger.debug(
                    "Time limit for motif prediction: %0.2f hours" % hours)
                self.max_time = 3600 * hours
                self.logger.debug("Max_time in seconds %0.0f" % self.max_time)
            else:
                self.logger.debug(
                    "Invalid time limit for motif prediction, setting to no limit")
    else:
        self.logger.debug("No time limit for motif prediction")

    if "random" in background:
        self.markov_model = params["markov_model"]

    # Create the necessary files for motif prediction and validation
    if self.input_type == "BED":
        self.prepare_input_bed(inputfile, params["genome"], params["width"],
                               params["fraction"], params["abs_max"],
                               params["use_strand"])

        # Create file for location plots: validation regions extended to lwidth
        index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
        lwidth = int(params["lwidth"])
        width = int(params["width"])
        # Floor division keeps the flank size an integer on every interpreter.
        extend = (lwidth - width) // 2
        genome_index.track2fasta(index_dir, self.validation_bed,
                                 self.location_fa, extend_up=extend,
                                 extend_down=extend,
                                 use_strand=params["use_strand"],
                                 ignore_missing=True)
    elif self.input_type == "FASTA":
        self.prepare_input_fa(inputfile, params["width"], params["fraction"],
                              params["abs_max"])

        # File for location plots
        self.location_fa = self.validation_fa
        seqs = Fasta(self.location_fa).seqs
        lwidth = len(seqs[0])
        if not all(len(seq) == lwidth for seq in seqs):
            self.logger.warning(
                "PLEASE NOTE: FASTA file contains sequences of different lengths. Positional preference plots will be incorrect!"
            )
    else:
        self.logger.error("Unknown input type, shouldn't happen")
        sys.exit(1)

    # Map every available tool to whether it was selected.  Both lists are
    # stripped so that "A, B" style values match as well.
    selected_tools = [t.strip() for t in params["tools"].split(",")]
    available_tools = [t.strip() for t in params["available_tools"].split(",")]
    tools = dict((t, t in selected_tools) for t in available_tools)

    self.create_background(background, params["genome"], params["width"])

    # Predict the motifs, input is a FASTA-file
    analysis = params["analysis"]
    self.logger.info("starting motif prediction (%s)", analysis)
    self.logger.info("tools: %s",
                     ", ".join([x for x in tools.keys() if tools[x]]))

    # The background with the lowest BG_RANK is used for significance.
    best_bg = min(background, key=lambda bg: BG_RANK[bg])
    bg_file = self.bg_file["fa"][best_bg]
    self.logger.debug("Using bg_file %s for significance" % bg_file)

    result = pp_predict_motifs(self.prediction_fa, self.predicted_pfm,
                               analysis, params["genome"],
                               params["use_strand"], self.prediction_bg,
                               tools, self.job_server(), logger=self.logger,
                               max_time=self.max_time,
                               fg_file=self.validation_fa, bg_file=bg_file)

    motifs = result.motifs
    self.logger.info("predicted %s motifs", len(motifs))
    self.logger.debug("written to %s", self.predicted_pfm)

    if len(motifs) == 0:
        self.logger.info("no motifs found")
        sys.exit()

    # Write stats output to file
    motifs_kept = []
    with open(self.stats_file, "w") as f:
        stat_keys = list(list(result.stats.values())[0].keys())
        f.write("%s\t%s\n" % ("Motif", "\t".join(stat_keys)))
        self.logger.debug(result.stats)
        for motif in motifs:
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if stats:
                f.write("%s\t%s\n" % (
                    motif.id, "\t".join([str(stats[k]) for k in stat_keys])))
                motifs_kept.append(motif)
            else:
                self.logger.error(
                    "No stats for motif {0}, skipping this motif!".format(
                        motif.id))
    # Drop the motifs without stats in one go; removing items while iterating
    # would skip the element following every removed motif.
    motifs[:] = motifs_kept
    self.motifs_with_stats = motifs

    # Best value and rank per tool for a few metrics
    tool_names = list(dict((m.id.split("_")[0], 1) for m in motifs))
    with open(self.ranks_file, "w") as f:
        f.write("Metric\tType\t%s\n" % ("\t".join(tool_names)))
        for stat in ["mncp", "roc_auc", "maxenr"]:
            best_motif = {}
            for motif in self.motifs_with_stats:
                val = result.stats["%s_%s" % (motif.id,
                                              motif.to_consensus())][stat]
                name = motif.id.split("_")[0]
                if val > best_motif.setdefault(name, 0):
                    best_motif[name] = val
            names = list(best_motif.keys())
            vals = [best_motif[name] for name in names]
            rank = rankdata(vals)
            ind = [names.index(x) for x in tool_names]

            f.write("%s\t%s\t%s\n" % (stat, "value", "\t".join(
                [str(vals[i]) for i in ind])))
            f.write("%s\t%s\t%s\n" % (stat, "rank", "\t".join(
                [str(rank[i]) for i in ind])))

    # Determine significant motifs
    nsig = 0
    with open(self.significant_pfm, "w") as f:
        for motif in motifs:
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if (stats["maxenr"] >= 3 and stats["roc_auc"] >= 0.55
                    and stats['enr_fdr'] >= 2):
                f.write("%s\n" % motif.to_pfm())
                nsig += 1
    self.logger.info("%s motifs are significant", nsig)
    self.logger.debug("written to %s", self.significant_pfm)

    if nsig == 0:
        self.logger.info("no significant motifs found")
        return

    # ROC metrics of significant motifs
    for bg in background:
        self._roc_metrics(self.significant_pfm, self.validation_fa,
                          self.bg_file["fa"][bg], self.bg_file["roc"][bg])

    # Cluster significant motifs
    clusters = self._cluster_motifs(self.significant_pfm, self.cluster_pwm,
                                    self.outdir, params["cluster_threshold"])

    # Determine best motif in cluster
    num_cluster, best_id = self._determine_best_motif_in_cluster(
        clusters, self.final_pwm, self.validation_fa, bg_file, self.imgdir)

    # Scan the clustered motifs to obtain their statistics.
    tmp = NamedTemporaryFile(dir=mytmpdir()).name
    p = PredictionResult(tmp, logger=self.logger, job_server=self.server,
                         fg_file=self.validation_fa, bg_file=bg_file,
                         do_counter=False)
    with open(self.final_pwm) as pwm_handle:
        clustered_motifs = list(read_motifs(pwm_handle))
    p.add_motifs(("clustering", (clustered_motifs, "", "")))
    # Wait until stats are available for every motif.
    while len(p.stats.keys()) < len(p.motifs):
        sleep(5)

    for mid, num in num_cluster.items():
        p.stats[mid]["numcluster"] = num

    # Thresholds per statistic that are used for the star rating.
    all_stats = {
        "mncp": [2, 5, 8],
        "roc_auc": [0.6, 0.75, 0.9],
        "maxenr": [10, 20, 30],
        "enr_fdr": [4, 8, 12],
        "fraction": [0.4, 0.6, 0.8],
        "ks_sig": [4, 7, 10],
        "numcluster": [3, 6, 9],
    }

    self.logger.info("creating report")

    # ROC plots
    for bg in background:
        self.create_roc_plots(self.final_pwm, self.validation_fa,
                              self.bg_file["fa"][bg], bg)

    # Location plots
    self.logger.debug("Creating localization plots")
    with open(self.final_pwm) as pwm_handle:
        motifs = list(read_motifs(pwm_handle, fmt="pwm"))
    for motif in motifs:
        m = "%s_%s" % (motif.id, motif.to_consensus())
        s = p.stats[m]
        outfile = os.path.join(self.imgdir, "%s_histogram.svg" % motif.id)
        motif_localization(self.location_fa, motif, lwidth, outfile,
                           cutoff=s["cutoff_fdr"])

        # Mean star score over all statistics, rounded half up.
        s["stars"] = int(
            mean([star(s[x], all_stats[x]) for x in all_stats.keys()]) + 0.5)
        self.logger.debug("Motif %s: %s stars" % (m, s["stars"]))

    # Calculate enrichment of final, clustered motifs
    self.calculate_cluster_enrichment(self.final_pwm, background)

    # Create report
    self.print_params()
    self._calc_report_values(self.final_pwm, background)
    self._create_report(self.final_pwm, background, stats=p.stats,
                        best_id=best_id)
    self._create_text_report(self.final_pwm, background)
    self.logger.info("finished")
    self.logger.info("output dir: %s", os.path.split(self.motif_report)[0])
    self.logger.info("report: %s", os.path.split(self.motif_report)[-1])

    if not params["keep_intermediate"]:
        self.logger.debug(
            "Deleting intermediate files. Please specifify the -k option if you want to keep these files."
        )
        shutil.rmtree(self.tmpdir)

    self.logger.debug("Done")

    return self.motif_report
def run_full_analysis(self, inputfile, user_params=None):
    """Run the complete motif analysis pipeline on ``inputfile``.

    The input is treated as FASTA when ``Fasta(inputfile)`` can load it and
    as BED otherwise.  The pipeline prepares the prediction and validation
    sequences, creates the backgrounds, predicts motifs, writes the stats
    and ranks tables, selects and clusters the significant motifs, draws ROC
    and localization plots and finally writes the reports.

    Parameters
    ----------
    inputfile : path of the BED or FASTA file to analyse.
    user_params : optional dict with overrides for the defaults returned by
        ``self.config.get_default_params()``.

    Notes
    -----
    Nothing is returned.  ``sys.exit`` is called when the genome index is
    missing (BED input), when no valid background is left, when the input
    type is unknown, when no motifs are predicted and when none of the
    predicted motifs is significant.
    """
    self.logger.info("Starting full motif analysis")
    self.logger.info("Using temporary directory {0}".format(mytmpdir()))

    params = self.config.get_default_params()
    if user_params is not None:
        params.update(user_params)

    if params["torque"]:
        from gimmemotifs.prediction_torque import pp_predict_motifs, PredictionResult
        self.logger.info("Using torque")
    else:
        from gimmemotifs.prediction import pp_predict_motifs, PredictionResult
        self.logger.info("Using multiprocessing")

    self.params = params
    background = [x.strip() for x in params["background"].split(",")]

    self.logger.info("Parameters:")
    for param, value in params.items():
        self.logger.info(" %s: %s" % (param, value))

    # Input type detection: anything that loads as FASTA is FASTA, everything
    # else is assumed to be BED (deliberately best-effort).
    self.input_type = "BED"
    try:
        Fasta(inputfile)
    except Exception:
        pass
    else:
        self.logger.info("Inputfile is a FASTA file")
        self.input_type = "FASTA"

    if self.input_type == "FASTA":
        for bg in background:
            if bg not in FA_VALID_BGS:
                self.logger.info("Input type is FASTA, can't use background type '%s'" % bg)
        background = [bg for bg in background if bg in FA_VALID_BGS]
    elif self.input_type == "BED":
        # BED input needs the genome index.
        index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
        if not os.path.exists(index_dir):
            self.logger.error("No index found for genome %s! Has GimmeMotifs been configured correctly and is the genome indexed?" % params["genome"])
            sys.exit(1)

        # Is it a valid bed-file etc.
        self._check_input(inputfile)

        for bg in background:
            if bg not in BED_VALID_BGS:
                self.logger.info("Input type is BED, can't use background type '%s'" % bg)
        background = [bg for bg in background if bg in BED_VALID_BGS]

    if len(background) == 0:
        self.logger.error("No valid backgrounds specified!")
        sys.exit(1)

    # Maximum time for motif prediction (given in hours, stored in seconds).
    self.max_time = None
    if params["max_time"]:
        try:
            hours = float(params["max_time"])
        except (TypeError, ValueError, OverflowError):
            # Do not fall through to the comparison below with an unparsed
            # value; None cannot be ordered against a number.
            self.logger.info("Could not parse max_time value, setting to no limit")
        else:
            if hours > 0:
                self.logger.info("Time limit for motif prediction: %0.2f hours" % hours)
                self.max_time = 3600 * hours
                self.logger.debug("Max_time in seconds %0.0f" % self.max_time)
            else:
                self.logger.info("Invalid time limit for motif prediction, setting to no limit")
    else:
        self.logger.info("No time limit for motif prediction")

    if "random" in background:
        self.markov_model = params["markov_model"]

    # Create the necessary files for motif prediction and validation
    if self.input_type == "BED":
        self.prepare_input_bed(inputfile, params["genome"], params["width"], params["fraction"], params["abs_max"], params["use_strand"])

        # Create file for location plots: validation regions extended to lwidth
        index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
        lwidth = int(params["lwidth"])
        width = int(params["width"])
        # Floor division keeps the flank size an integer on every interpreter.
        extend = (lwidth - width) // 2
        genome_index.track2fasta(index_dir, self.validation_bed, self.location_fa, extend_up=extend, extend_down=extend, use_strand=params["use_strand"], ignore_missing=True)
    elif self.input_type == "FASTA":
        self.prepare_input_fa(inputfile, params["width"], params["fraction"], params["abs_max"])

        # File for location plots
        self.location_fa = self.validation_fa
        seqs = Fasta(self.location_fa).seqs
        lwidth = len(seqs[0])
        if not all(len(seq) == lwidth for seq in seqs):
            self.logger.warning("PLEASE NOTE: FASTA file contains sequences of different lengths. Positional preference plots will be incorrect!")
    else:
        self.logger.error("Unknown input type, shouldn't happen")
        sys.exit(1)

    # Map every available tool to whether it was selected.  Both lists are
    # stripped so that "A, B" style values match as well.
    selected_tools = [t.strip() for t in params["tools"].split(",")]
    available_tools = [t.strip() for t in params["available_tools"].split(",")]
    tools = dict((t, t in selected_tools) for t in available_tools)

    self.create_background(background, params["genome"], params["width"])

    # Predict the motifs, input is a FASTA-file
    analysis = params["analysis"]
    self.logger.info("Starting motif prediction (%s) using %s" % (analysis, ", ".join([x for x in tools.keys() if tools[x]])))

    # The background with the lowest BG_RANK is used for significance.
    best_bg = min(background, key=lambda bg: BG_RANK[bg])
    bg_file = self.bg_file["fa"][best_bg]
    self.logger.info("Using bg_file %s for significance" % bg_file)

    result = pp_predict_motifs(self.prediction_fa, self.predicted_pfm, analysis, params["genome"], params["use_strand"], self.prediction_bg, tools, self.job_server(), logger=self.logger, max_time=self.max_time, fg_file=self.validation_fa, bg_file=bg_file)

    motifs = result.motifs
    self.logger.info("Predicted %s motifs, written to %s" % (len(motifs), self.predicted_pfm))

    if len(motifs) == 0:
        self.logger.info("No motifs found. Done.")
        sys.exit()

    # Write stats output to file
    motifs_kept = []
    with open(self.stats_file, "w") as f:
        stat_keys = list(list(result.stats.values())[0].keys())
        f.write("%s\t%s\n" % ("Motif", "\t".join(stat_keys)))
        # Diagnostic dump goes to the log instead of stdout.
        self.logger.debug(result.stats)
        for motif in motifs:
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if stats:
                f.write("%s\t%s\n" % (motif.id, "\t".join([str(stats[k]) for k in stat_keys])))
                motifs_kept.append(motif)
            else:
                self.logger.error("No stats for motif {0}, skipping this motif!".format(motif.id))
    # Drop the motifs without stats in one go; removing items while iterating
    # would skip the element following every removed motif.
    motifs[:] = motifs_kept
    self.motifs_with_stats = motifs

    # Best value and rank per tool for a few metrics
    tool_names = list(dict((m.id.split("_")[0], 1) for m in motifs))
    with open(self.ranks_file, "w") as f:
        f.write("Metric\tType\t%s\n" % ("\t".join(tool_names)))
        for stat in ["mncp", "roc_auc", "maxenr"]:
            best_motif = {}
            for motif in self.motifs_with_stats:
                val = result.stats["%s_%s" % (motif.id, motif.to_consensus())][stat]
                name = motif.id.split("_")[0]
                if val > best_motif.setdefault(name, 0):
                    best_motif[name] = val
            names = list(best_motif.keys())
            vals = [best_motif[name] for name in names]
            rank = rankdata(vals)
            ind = [names.index(x) for x in tool_names]

            f.write("%s\t%s\t%s\n" % (stat, "value", "\t".join([str(vals[i]) for i in ind])))
            f.write("%s\t%s\t%s\n" % (stat, "rank", "\t".join([str(rank[i]) for i in ind])))

    # Determine significant motifs
    nsig = 0
    with open(self.significant_pfm, "w") as f:
        for motif in motifs:
            stats = result.stats["%s_%s" % (motif.id, motif.to_consensus())]
            if stats["maxenr"] >= 3 and stats["roc_auc"] >= 0.55 and stats['enr_fdr'] >= 2:
                f.write("%s\n" % motif.to_pfm())
                nsig += 1
    self.logger.info("%s motifs are significant, written to %s" % (nsig, self.significant_pfm))

    if nsig == 0:
        self.logger.info("No significant motifs found. Done.")
        sys.exit()

    # ROC metrics of significant motifs
    for bg in background:
        self._roc_metrics(self.significant_pfm, self.validation_fa, self.bg_file["fa"][bg], self.bg_file["roc"][bg])

    # Cluster significant motifs
    clusters = self._cluster_motifs(self.significant_pfm, self.cluster_pwm, self.outdir, params["cluster_threshold"])

    # Determine best motif in cluster
    num_cluster, best_id = self._determine_best_motif_in_cluster(clusters, self.final_pwm, self.validation_fa, bg_file, self.imgdir)

    # Scan the clustered motifs to obtain their statistics.
    tmp = NamedTemporaryFile(dir=mytmpdir()).name
    p = PredictionResult(tmp, logger=self.logger, job_server=self.server, fg_file=self.validation_fa, bg_file=bg_file)
    p.add_motifs(("Clustering", (pwmfile_to_motifs(self.final_pwm), "", "")))
    # Wait until stats are available for every motif.
    while len(p.stats.keys()) < len(p.motifs):
        sleep(5)

    for mid, num in num_cluster.items():
        p.stats[mid]["numcluster"] = num

    # Thresholds per statistic that are used for the star rating.
    all_stats = {
        "mncp": [2, 5, 8],
        "roc_auc": [0.6, 0.75, 0.9],
        "maxenr": [10, 20, 30],
        "enr_fdr": [4, 8, 12],
        "fraction": [0.4, 0.6, 0.8],
        "ks_sig": [4, 7, 10],
        "numcluster": [3, 6, 9],
    }

    # ROC plots
    for bg in background:
        self.create_roc_plots(self.final_pwm, self.validation_fa, self.bg_file["fa"][bg], bg)

    # Location plots
    self.logger.info("Creating localization plots")
    for motif in pwmfile_to_motifs(self.final_pwm):
        m = "%s_%s" % (motif.id, motif.to_consensus())
        s = p.stats[m]
        outfile = os.path.join(self.imgdir, "%s_histogram.svg" % motif.id)
        motif_localization(self.location_fa, motif, lwidth, outfile, cutoff=s["cutoff_fdr"])

        # Mean star score over all statistics, rounded half up.
        s["stars"] = int(mean([star(s[x], all_stats[x]) for x in all_stats.keys()]) + 0.5)
        self.logger.debug("Motif %s: %s stars" % (m, s["stars"]))

    # Calculate enrichment of final, clustered motifs
    self.calculate_cluster_enrichment(self.final_pwm, background)

    # Create report
    self.print_params()
    self._calc_report_values(self.final_pwm, background)
    self._create_report(self.final_pwm, background, stats=p.stats, best_id=best_id)
    self._create_text_report(self.final_pwm, background)
    self.logger.info("Open %s in your browser to see your results." % (self.motif_report))

    if not params["keep_intermediate"]:
        self.logger.info("Deleting intermediate files. Please specifify the -k option if you want to keep these files.")
        shutil.rmtree(self.tmpdir)

    self.logger.info("Done")