def gimme_motifs(
    inputfile,
    outdir,
    params=None,
    filter_significant=True,
    cluster=True,
    create_report=True,
):
    """De novo motif prediction based on an ensemble of different tools.

    Parameters
    ----------
    inputfile : str
        Filename of input. Can be either BED, narrowPeak or FASTA.
    outdir : str
        Name of output directory.
    params : dict, optional
        Optional parameters.
    filter_significant : bool, optional
        Filter motifs for significance using the validation set.
    cluster : bool, optional
        Cluster similar predicted (and significant) motifs.
    create_report : bool, optional
        Create output reports (both .txt and .html).

    Returns
    -------
    motifs : list
        List of predicted motifs.

    Examples
    --------
    >>> from gimmemotifs.denovo import gimme_motifs
    >>> gimme_motifs("input.fa", "motifs.out")
    """
    if outdir is None:
        outdir = "gimmemotifs_{}".format(
            datetime.date.today().strftime("%d_%m_%Y"))

    # Create output directories
    tmpdir = os.path.join(outdir, "intermediate")
    for d in [outdir, tmpdir]:
        if not os.path.exists(d):
            os.mkdir(d)

    # Log to file
    logger = logging.getLogger("gimme")
    logfile = os.path.join(outdir, "gimmemotifs.log")
    fh = logging.FileHandler(logfile, "w")
    fh.setLevel(logging.DEBUG)
    file_formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    fh.setFormatter(file_formatter)
    logger.addHandler(fh)
    logger = logging.getLogger("gimme.denovo")

    # Initialize parameters
    params = parse_denovo_params(params)

    # Check the input files
    input_type, background = check_denovo_input(inputfile, params)

    logger.info("starting full motif analysis")
    logger.debug("Using temporary directory %s", mytmpdir())

    params["size"] = int(params["size"])
    if params["size"] > 0:
        logger.info(
            "using size of {}, set size to 0 to use original region size".format(
                params["size"]))
    else:
        logger.info("using original size")

    # Create the necessary files for motif prediction and validation
    if input_type == "bed":
        logger.info("preparing input from BED")
        prepare_denovo_input_bed(inputfile, params, tmpdir)
    elif input_type == "narrowpeak":
        logger.info("preparing input from narrowPeak")
        prepare_denovo_input_narrowpeak(inputfile, params, tmpdir)
    elif input_type == "fasta":
        logger.info("preparing input from FASTA")
        prepare_denovo_input_fa(inputfile, params, tmpdir)
    else:
        logger.error("unknown input file format!")
        sys.exit(1)

    # Create the background FASTA files
    background = create_backgrounds(
        tmpdir,
        background,
        params.get("genome", None),
        params["size"],
        params.get("custom_background", None),
    )

    # Predict de novo motifs
    result = predict_motifs(
        os.path.join(tmpdir, "prediction.fa"),
        os.path.join(tmpdir, "prediction.bg.fa"),
        os.path.join(tmpdir, "all_motifs.pfm"),
        params=params,
        stats_fg=os.path.join(tmpdir, "validation.fa"),
        stats_bg=background,
    )

    if len(result.motifs) == 0:
        logger.info("finished")
        return []

    # Write statistics
    stats_file = os.path.join(tmpdir, "stats.{}.txt")
    write_stats(result.stats, stats_file)

    bg = sorted(background, key=lambda x: BG_RANK[x])[0]
    if filter_significant:
        motifs = filter_significant_motifs(
            os.path.join(tmpdir, "significant_motifs.pfm"), result, bg)
        if len(motifs) == 0:
            logger.info("no significant motifs")
            return []

        pfmfile = os.path.join(tmpdir, "significant_motifs.pfm")
    else:
        logger.info("not filtering for significance")
        motifs = result.motifs
        pfmfile = os.path.join(tmpdir, "all_motifs.pfm")

    if cluster:
        clusters = cluster_motifs_with_report(
            pfmfile,
            os.path.join(tmpdir, "clustered_motifs.pfm"),
            outdir,
            0.95,
            title=inputfile,
        )

        # Determine best motif in cluster
        best_motifs = best_motif_in_cluster(
            pfmfile,
            os.path.join(tmpdir, "clustered_motifs.pfm"),
            clusters,
            os.path.join(tmpdir, "validation.fa"),
            background,
            params["genome"],
            result.stats,
        )

        final_motifs, stats = rename_motifs(best_motifs, result.stats)
    else:
        logger.info("not clustering")
        rank = rank_motifs(result.stats)
        sorted_motifs = sorted(motifs, key=lambda x: rank[str(x)], reverse=True)
        final_motifs, stats = rename_motifs(sorted_motifs, result.stats)

    with open(os.path.join(outdir, "gimme.denovo.pfm"), "w") as f:
        for m in final_motifs:
            f.write("{}\n".format(m.to_pwm()))

    if create_report:
        bg = dict([(b, os.path.join(tmpdir, "bg.{}.fa".format(b)))
                   for b in background])

        create_denovo_motif_report(
            inputfile,
            os.path.join(outdir, "gimme.denovo.pfm"),
            os.path.join(tmpdir, "validation.fa"),
            bg,
            os.path.join(tmpdir, "localization.fa"),
            outdir,
            params,
            stats,
        )

    with open(os.path.join(outdir, "params.txt"), "w") as f:
        for k, v in params.items():
            f.write("{}\t{}\n".format(k, v))

    if not params.get("keep_intermediate"):
        logger.debug(
            "Deleting intermediate files. "
            "Please specify the -k option if you want to keep these files.")
        shutil.rmtree(tmpdir)

    logger.info("finished")
    logger.info("output dir: %s", outdir)
    if cluster:
        logger.info("de novo report: %s",
                    os.path.join(outdir, "gimme.denovo.html"))

    return final_motifs
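
# Illustrative usage sketch for gimme_motifs(). The input file name, output
# directory and parameter values below are hypothetical examples, not fixed by
# the library; the keys shown ("genome", "size", "keep_intermediate") are the
# ones read from `params` in the function above.
#
#     from gimmemotifs.denovo import gimme_motifs
#
#     motifs = gimme_motifs(
#         "peaks.narrowPeak",   # BED, narrowPeak or FASTA input
#         "motifs.out",         # output directory
#         params={"genome": "hg38", "size": 200, "keep_intermediate": True},
#         filter_significant=True,
#         cluster=True,
#         create_report=True,
#     )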
def best_motif_in_cluster(
    single_pwm,
    clus_pwm,
    clusters,
    fg_fa,
    background,
    genome,
    stats=None,
    metrics=("roc_auc", "recall_at_fdr"),
):
    """Return the best motif per cluster for a clustering result.

    The motif can be either the average motif or one of the clustered motifs.

    Parameters
    ----------
    single_pwm : str
        Filename of motifs.
    clus_pwm : str
        Filename of motifs.
    clusters :
        Motif clustering result.
    fg_fa : str
        Filename of FASTA file.
    background : dict
        Dictionary for background file names.
    genome : str
        Genome name.
    stats : dict, optional
        If statistics are not supplied they will be computed.
    metrics : sequence, optional
        Metrics to use for motif evaluation. Default are "roc_auc" and
        "recall_at_fdr".

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    if stats is None:
        stats = {}

    # combine original and clustered motifs
    motifs = read_motifs(single_pwm) + read_motifs(clus_pwm)
    motifs = dict([(str(m), m) for m in motifs])

    # get the statistics for those motifs that were not yet checked
    clustered_motifs = []
    for clus, singles in clusters:
        for motif in set([clus] + singles):
            if str(motif) not in stats:
                clustered_motifs.append(motifs[str(motif)])

    new_stats = {}
    for bg, bg_fa in background.items():
        for m, s in calc_stats(fg_file=fg_fa, bg_file=bg_fa,
                               motifs=clustered_motifs, genome=genome).items():
            if m not in new_stats:
                new_stats[m] = {}
            new_stats[m][bg] = s
    stats.update(new_stats)

    # rank the motifs
    rank = rank_motifs(stats, metrics)

    best_motifs = []
    for clus, singles in clusters:
        if len(singles) > 1:
            eval_motifs = singles
            if clus not in motifs:
                eval_motifs.append(clus)
            eval_motifs = [motifs[str(e)] for e in eval_motifs]
            best_motif = sorted(eval_motifs, key=lambda x: rank[str(x)])[-1]
            best_motifs.append(best_motif)
        else:
            best_motifs.append(clus)
        for bg in background:
            stats[str(best_motifs[-1])][bg]["num_cluster"] = len(singles)

    best_motifs = sorted(best_motifs, key=lambda x: rank[str(x)], reverse=True)

    return best_motifs
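
# Illustrative sketch of how best_motif_in_cluster() is invoked from
# gimme_motifs() above. The variables (tmpdir, clusters, background, params,
# result) are the intermediate values created there; `background` is the
# {bg_type: fasta_path} dict returned by create_backgrounds().
#
#     best = best_motif_in_cluster(
#         os.path.join(tmpdir, "significant_motifs.pfm"),  # individual motifs
#         os.path.join(tmpdir, "clustered_motifs.pfm"),    # cluster averages
#         clusters,                                        # from cluster_motifs_with_report()
#         os.path.join(tmpdir, "validation.fa"),           # foreground validation set
#         background,
#         params["genome"],
#         stats=result.stats,
#     )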