def scan_to_table(input_table, genome, data_dir, scoring, pwmfile=None):
    """Scan the regions of a table with motifs and return the result as a table.

    Parameters
    ----------
    input_table : str
        Filename of a tab-separated table. The first column is used as the
        index and its values are the regions that are scanned.
    genome : str
        Genome handed to the scanner and to ``check_threshold``.
    data_dir : str
        Handed to ``check_threshold`` together with ``genome`` and ``scoring``.
    scoring : str
        "count" produces a count table (the threshold is used as cutoff); any
        other value produces a table of best scores.
    pwmfile : str, optional
        Motif file. When omitted, the "motif_db" entry of the default
        parameters is looked up inside the configured motif directory.

    Returns
    -------
    pandas.DataFrame
        Rows follow the index of the input table, columns are the motif ids.

    Raises
    ------
    ValueError
        If no pwmfile is given and no default database is configured.
    """
    threshold = check_threshold(data_dir, genome, scoring)

    config = MotifConfig()
    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)
    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    df = pd.read_table(input_table, index_col=0)
    regions = list(df.index)

    s = Scanner()
    s.set_motifs(pwmfile)
    s.set_genome(genome)

    if scoring == "count":
        scores = list(s.count(regions, cutoff=threshold))
    else:
        scores = list(s.best_score(regions))

    # Read the ids inside a context manager so the motif file is closed
    # deterministically instead of leaking the handle.
    with open(pwmfile) as handle:
        motif_names = [m.id for m in read_motifs(handle)]

    return pd.DataFrame(scores, index=df.index, columns=motif_names)
def scan_to_table(input_table, genome, data_dir, scoring, pwmfile=None):
    """Scan the regions of a table with motifs and return the result as a table.

    Parameters
    ----------
    input_table : str
        Filename of a tab-separated table. The first column is used as the
        index and its values are the regions that are scanned.
    genome : str
        Genome handed to the scanner and to ``check_threshold``.
    data_dir : str
        Handed to ``check_threshold`` together with ``genome`` and ``scoring``.
    scoring : str
        "count" produces a count table (the threshold is used as cutoff); any
        other value produces a table of best scores.
    pwmfile : str, optional
        Motif file. When omitted, the "motif_db" entry of the default
        parameters is looked up inside the configured motif directory.

    Returns
    -------
    pandas.DataFrame
        Rows follow the index of the input table, columns are the motif ids.

    Raises
    ------
    ValueError
        If no pwmfile is given and no default database is configured.
    """
    threshold = check_threshold(data_dir, genome, scoring)

    config = MotifConfig()
    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)
    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    df = pd.read_table(input_table, index_col=0)
    regions = list(df.index)

    s = Scanner()
    s.set_motifs(pwmfile)
    s.set_genome(genome)

    if scoring == "count":
        scores = list(s.count(regions, cutoff=threshold))
    else:
        scores = list(s.best_score(regions))

    # Read the ids inside a context manager so the motif file is closed
    # deterministically instead of leaking the handle.
    with open(pwmfile) as handle:
        motif_names = [m.id for m in read_motifs(handle)]

    return pd.DataFrame(scores, index=df.index, columns=motif_names)
def scan_to_table(input_table, genome, data_dir, scoring, pwmfile=None, ncpus=None):
    """Scan the regions of a table with motifs and return the result as a table.

    Parameters
    ----------
    input_table : str
        Filename of the input table. A name ending in "feather" is read as a
        feather file (regions are taken from its first column); anything else
        is read as a tab-separated table whose first column is the index and
        in which lines starting with "#" are comments.
    genome : str
        Genome handed to the scanner (and to the FPR threshold when counting).
    data_dir : str
        Not used by this implementation; kept so existing callers keep working.
    scoring : str
        "count" produces a count table at the module-level ``FPR``; any other
        value produces a table of best scores with the threshold set to 0.0.
    pwmfile : str, optional
        Motif file. When omitted, the "motif_db" entry of the default
        parameters is looked up inside the configured motif directory.
    ncpus : int, optional
        Handed to ``Scanner``.

    Returns
    -------
    pandas.DataFrame
        Rows are the regions of the input table, columns are the motif ids.

    Raises
    ------
    ValueError
        If no pwmfile is given and no default database is configured.
    """
    config = MotifConfig()
    if pwmfile is None:
        pwmfile = config.get_default_params().get("motif_db", None)
        if pwmfile is not None:
            pwmfile = os.path.join(config.get_motif_dir(), pwmfile)
    if pwmfile is None:
        raise ValueError("no pwmfile given and no default database specified")

    logger.info("reading table")
    if input_table.endswith("feather"):
        df = pd.read_feather(input_table)
        idx = df.iloc[:, 0].values
    else:
        df = pd.read_table(input_table, index_col=0, comment="#")
        idx = df.index
    regions = list(idx)

    s = Scanner(ncpus=ncpus)
    s.set_motifs(pwmfile)
    s.set_genome(genome)

    if scoring == "count":
        logger.info("setting threshold")
        s.set_threshold(fpr=FPR, genome=genome)
        logger.info("creating count table")
        scores = list(s.count(regions))
        logger.info("done")
    else:
        s.set_threshold(threshold=0.0)
        logger.info("creating score table")
        scores = list(s.best_score(regions))
        logger.info("done")

    # Read the ids inside a context manager so the motif file is closed
    # deterministically instead of leaking the handle.
    with open(pwmfile) as handle:
        motif_names = [m.id for m in read_motifs(handle)]

    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
def scan_to_table(
    input_table, genome, scoring, pfmfile=None, ncpus=None, zscore=True, gc=True
):
    """Scan regions in input table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of input table. Can be either a text-separated tab file or a
        feather file.
    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.
    scoring : str
        "count" or "score"
    pfmfile : str, optional
        Specify a PFM file for scanning. When omitted, the "motif_db" entry of
        the default parameters is looked up in the configured motif directory.
    ncpus : int, optional
        If defined this specifies the number of cores to use.
    zscore : bool, optional
        Handed to ``best_score`` when a score table is created; the log
        message reports "z-score" when set and "logodds" otherwise.
    gc : bool, optional
        Handed to ``set_background`` and to ``best_score``.

    Returns
    -------
    table : pandas.DataFrame
        DataFrame with motif ids as column names and regions as index. Values
        are either counts or scores depending on the 'scoring' parameter.
    """
    cfg = MotifConfig()
    if pfmfile is None:
        default_db = cfg.get_default_params().get("motif_db", None)
        if default_db is None:
            raise ValueError("no pfmfile given and no default database specified")
        pfmfile = os.path.join(cfg.get_motif_dir(), default_db)

    logger.info("reading table")
    if input_table.endswith("feather"):
        idx = pd.read_feather(input_table).iloc[:, 0].values
    else:
        idx = pd.read_table(input_table, index_col=0, comment="#").index
    regions = list(idx)

    # The background size is the median sequence length of (at most) 1000
    # randomly drawn regions.
    check_regions = regions
    if len(regions) >= 1000:
        check_regions = np.random.choice(regions, size=1000, replace=False)
    lengths = [len(seq) for seq in as_fasta(check_regions, genome=genome).seqs]
    size = int(np.median(lengths))

    scanner = Scanner(ncpus=ncpus)
    scanner.set_motifs(pfmfile)
    scanner.set_genome(genome)
    scanner.set_background(genome=genome, gc=gc, size=size)

    if scoring == "count":
        logger.info("setting threshold")
        scanner.set_threshold(fpr=FPR)
        logger.info("creating count table")
        scores = list(scanner.count(regions))
        logger.info("done")
    else:
        scanner.set_threshold(threshold=0.0)
        if zscore:
            detail = "z-score, GC%" if gc else "z-score"
        else:
            detail = "logodds"
        logger.info("creating score table (" + detail + ")")
        scores = list(scanner.best_score(regions, zscore=zscore, gc=gc))
        logger.info("done")

    motif_names = [motif.id for motif in read_motifs(pfmfile)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
def moap(inputfile, method="classic", scoring="score", outfile=None, motiffile=None, pwmfile=None, genome=None, cutoff=0.95):
    """Run a single motif activity prediction algorithm.

    Parameters
    ----------
    inputfile : str
        File with regions (chr:start-end) in first column and either cluster
        name in second column or a table with values.
    method : str, optional
        Motif activity method to use. Any of 'classic', 'ks', 'mwu', 'rf',
        'lasso', 'lightning', 'mara', 'more'. Default is 'classic'.
    scoring : str, optional
        Either 'score' or 'count'.
    outfile : str, optional
        Name of outputfile to save the fitted activity values.
    motiffile : str, optional
        Table with motif scan results. First column should be exactly the same
        regions as in the inputfile. When omitted the regions are scanned.
    pwmfile : str, optional
        File with motifs in pwm format, only used when motiffile is not
        supplied. When omitted, the "motif_db" entry of the default parameters
        is looked up inside the configured motif directory.
    genome : str, optional
        Genome handed to the scanner. Required when motiffile is not supplied.
    cutoff : float, optional
        Cutoff for motif scanning (used when counting).

    Returns
    -------
    The fitted activity values (``act_`` of the fitted predictor); per the
    original documentation a pandas DataFrame with motif activity.

    Raises
    ------
    ValueError
        For an invalid ``scoring`` or ``method``, an input table that does not
        fit the method, a missing genome, or a missing/non-existent pwmfile.
    """
    if scoring not in ['score', 'count']:
        raise ValueError("valid values are 'score' and 'count'")

    config = MotifConfig()

    # read data
    df = pd.read_table(inputfile, index_col=0)

    if method in CLUSTER_METHODS:
        if df.shape[1] != 1:
            raise ValueError("1 column expected for {}".format(method))
    else:
        if np.dtype('object') in set(df.dtypes):
            raise ValueError(
                "columns should all be numeric for {}".format(method))
        if method not in VALUE_METHODS:
            raise ValueError("method {} not valid".format(method))

    if motiffile is None:
        if genome is None:
            raise ValueError("need a genome")

        # check pwmfile
        if pwmfile is None:
            pwmfile = config.get_default_params().get("motif_db", None)
            if pwmfile is not None:
                pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

        if pwmfile is None:
            raise ValueError("no pwmfile given and no default database specified")

        if not os.path.exists(pwmfile):
            raise ValueError("{} does not exist".format(pwmfile))

        # Parse the motif file once, inside a context manager, so the handle
        # is closed; the ids double as the column names of the scan table.
        try:
            with open(pwmfile) as handle:
                motif_names = [m.id for m in read_motifs(handle)]
        except Exception:
            sys.stderr.write("can't read motifs from {}".format(pwmfile))
            raise

        # initialize scanner
        s = Scanner()
        sys.stderr.write(pwmfile + "\n")
        s.set_motifs(pwmfile)
        s.set_genome(genome)

        # scan for motifs
        sys.stderr.write("scanning for motifs\n")
        regions = list(df.index)
        if method == 'classic' or scoring == "count":
            scores = list(s.count(regions, cutoff=cutoff))
        else:
            scores = list(s.best_score(regions))
        motifs = pd.DataFrame(scores, index=df.index, columns=motif_names)
    else:
        motifs = pd.read_table(motiffile, index_col=0)

    # Put the motif rows in the same order as the regions of the input table.
    motifs = motifs.loc[df.index]

    if method == "ks":
        clf = KSMoap()
    elif method == "mwu":
        clf = MWMoap()
    elif method == "rf":
        clf = RFMoap()
    elif method == "lasso":
        clf = LassoMoap()
    elif method == "lightning":
        clf = LightningMoap()
    elif method == "mara":
        clf = MaraMoap()
    elif method == "more":
        clf = MoreMoap()
    elif method == "classic":
        clf = ClassicMoap()
    else:
        # A method that passed validation but has no predictor used to end in
        # an AttributeError on None; fail with an explicit message instead.
        raise ValueError("method {} not valid".format(method))

    clf.fit(motifs, df)

    if outfile:
        # Header comments and the table are written through a single handle.
        with open(outfile, "w") as f:
            f.write("# maelstrom - GimmeMotifs version {}\n".format(GM_VERSION))
            f.write("# method: {} with motif {}\n".format(method, scoring))
            if genome:
                f.write("# genome: {}\n".format(genome))
            if motiffile:
                f.write("# motif table: {}\n".format(motiffile))
            f.write("# {}\n".format(clf.act_description))
            clf.act_.to_csv(f, sep="\t")

    return clf.act_
def moap(inputfile, method="hypergeom", scoring=None, outfile=None, motiffile=None, pwmfile=None, genome=None, fpr=0.01, ncpus=None, subsample=None):
    """Run a single motif activity prediction algorithm.

    Parameters
    ----------
    inputfile : str
        File with regions (chr:start-end) in first column and either cluster
        name in second column or a table with values. A name ending in
        "feather" is read as a feather file.
    method : str, optional
        Motif activity method to use. Any of 'hypergeom', 'lasso',
        'lightningclassification', 'lightningregressor', 'bayesianridge',
        'rf', 'xgboost'. Default is 'hypergeom'.
    scoring : str, optional
        Either 'score' or 'count'.
    outfile : str, optional
        Name of outputfile to save the fitted activity values. If the file
        already exists and its shape matches the expected result, it is
        returned without refitting.
    motiffile : str, optional
        Table with motif scan results. First column should be exactly the same
        regions as in the inputfile.
    pwmfile : str, optional
        File with motifs in pwm format, resolved through ``pwmfile_location``
        when motiffile is not supplied.
    genome : str, optional
        Genome name, as indexed by gimme. Required when motiffile is not
        supplied.
    fpr : float, optional
        FPR for motif scanning.
    ncpus : int, optional
        Number of threads to use. Default is the number specified in the
        config.
    subsample : float, optional
        Fraction of the regions to fit on; ``int(subsample * n_regions)``
        rows are sampled from the input table.

    Returns
    -------
    pandas DataFrame with motif activity
    """
    if scoring and scoring not in ['score', 'count']:
        raise ValueError("valid values are 'score' and 'count'")

    # NOTE(review): not referenced below; kept in case constructing the
    # configuration object has side effects - confirm before removing.
    config = MotifConfig()

    if inputfile.endswith("feather"):
        df = pd.read_feather(inputfile)
        df = df.set_index(df.columns[0])
    else:
        # read data
        df = pd.read_table(inputfile, index_col=0, comment="#")

    clf = Moap.create(method, ncpus=ncpus)

    if clf.ptype == "classification":
        if df.shape[1] != 1:
            raise ValueError("1 column expected for {}".format(method))
    else:
        if np.dtype('object') in set(df.dtypes):
            raise ValueError(
                "columns should all be numeric for {}".format(method))

    if motiffile is None:
        if genome is None:
            raise ValueError("need a genome")

        pwmfile = pwmfile_location(pwmfile)
        # Parse the motif file once; the ids are the scan table columns.
        try:
            motif_names = [m.id for m in read_motifs(pwmfile)]
        except Exception:
            sys.stderr.write("can't read motifs from {}".format(pwmfile))
            raise

        # initialize scanner
        s = Scanner(ncpus=ncpus)
        sys.stderr.write(pwmfile + "\n")
        s.set_motifs(pwmfile)
        s.set_genome(genome)
        s.set_background(genome=genome)

        # scan for motifs
        sys.stderr.write("scanning for motifs\n")
        regions = list(df.index)
        if method == 'classic' or scoring == "count":
            s.set_threshold(fpr=fpr)
            scores = list(s.count(regions))
        else:
            scores = list(s.best_score(regions, normalize=True))
        motifs = pd.DataFrame(scores, index=df.index, columns=motif_names)
    else:
        motifs = pd.read_table(motiffile, index_col=0, comment="#")

    if outfile and os.path.exists(outfile):
        out = pd.read_table(outfile, index_col=0, comment="#")
        ncols = df.shape[1]
        if ncols == 1:
            # A single column is compared by its number of distinct values.
            ncols = len(df.iloc[:, 0].unique())

        if out.shape[0] == motifs.shape[1] and out.shape[1] == ncols:
            # Logger.warn is a deprecated alias; use warning().
            logger.warning("%s output already exists... skipping", method)
            return out

    if subsample is not None:
        n = int(subsample * df.shape[0])
        logger.debug("Subsampling %d regions", n)
        df = df.sample(n)

    # Put the motif rows in the same order as the (possibly subsampled) regions.
    motifs = motifs.loc[df.index]

    if method == "lightningregressor":
        # Without an outfile there is no output directory; fall back to a
        # relative path (same result as for a bare output filename) instead
        # of failing inside os.path.dirname(None).
        outdir = os.path.dirname(outfile) if outfile else ""
        tmpname = os.path.join(outdir, ".lightning.tmp")
        try:
            clf.fit(motifs, df, tmpdir=tmpname)
        finally:
            # Remove the temporary directory even when fitting fails.
            shutil.rmtree(tmpname, ignore_errors=True)
    else:
        clf.fit(motifs, df)

    if outfile:
        # Header comments and the table are written through a single handle.
        with open(outfile, "w") as f:
            f.write(
                "# maelstrom - GimmeMotifs version {}\n".format(__version__))
            f.write("# method: {} with motif {}\n".format(method, scoring))
            if genome:
                f.write("# genome: {}\n".format(genome))
            if motiffile:
                f.write("# motif table: {}\n".format(motiffile))
            f.write("# {}\n".format(clf.act_description))
            clf.act_.to_csv(f, sep="\t")

    return clf.act_
def scan_to_table(input_table, genome, scoring, pwmfile=None, ncpus=None):
    """Scan regions in input table with motifs.

    Parameters
    ----------
    input_table : str
        Filename of input table. Can be either a text-separated tab file or a
        feather file.
    genome : str
        Genome name. Can be either the name of a FASTA-formatted file or a
        genomepy genome name.
    scoring : str
        "count" or "score"
    pwmfile : str, optional
        Specify a PFM file for scanning. When omitted, the "motif_db" entry of
        the default parameters is looked up in the configured motif directory.
    ncpus : int, optional
        If defined this specifies the number of cores to use.

    Returns
    -------
    table : pandas.DataFrame
        DataFrame with motif ids as column names and regions as index. Values
        are either counts or scores depending on the 'scoring' parameter.
    """
    cfg = MotifConfig()
    if pwmfile is None:
        default_db = cfg.get_default_params().get("motif_db", None)
        if default_db is None:
            raise ValueError("no pwmfile given and no default database specified")
        pwmfile = os.path.join(cfg.get_motif_dir(), default_db)

    logger.info("reading table")
    if input_table.endswith("feather"):
        idx = pd.read_feather(input_table).iloc[:, 0].values
    else:
        idx = pd.read_table(input_table, index_col=0, comment="#").index
    regions = list(idx)

    scanner = Scanner(ncpus=ncpus)
    scanner.set_motifs(pwmfile)
    scanner.set_genome(genome)
    scanner.set_background(genome=genome)

    if scoring == "count":
        logger.info("setting threshold")
        scanner.set_threshold(fpr=FPR)
        logger.info("creating count table")
        scores = list(scanner.count(regions))
        logger.info("done")
    else:
        scanner.set_threshold(threshold=0.0)
        logger.info("creating score table")
        scores = list(scanner.best_score(regions, normalize=True))
        logger.info("done")

    motif_names = [motif.id for motif in read_motifs(pwmfile)]
    logger.info("creating dataframe")
    return pd.DataFrame(scores, index=idx, columns=motif_names)
def command_scan(inputfile, pwmfile, nreport=1, cutoff=0.9, bed=False, scan_rc=True, table=False, score_table=False, moods=False, pvalue=None, bgfile=None, genome=None):
    """Scan the input with the motifs of ``pwmfile`` and yield output lines.

    This is a generator. Depending on the flags it yields:

    * ``table``: a tab-separated header with the motif ids, followed by one
      line of counts per sequence;
    * ``score_table`` (only when ``table`` is false): the same header,
      followed by one line of best scores per sequence;
    * otherwise: the result of ``format_line`` for every reported match.

    ``moods`` switches the scan from the ``Scanner`` object to
    ``scan_it_moods`` (it has no effect on the score table). ``genome``, when
    given, selects the index directory handed to ``as_fasta``. ``nreport``,
    ``cutoff``, ``scan_rc``, ``pvalue`` and ``bgfile`` are passed on to the
    scan calls, ``bed`` to ``format_line``.
    """
    motifs = pwmfile_to_motifs(pwmfile)

    index_dir = (
        os.path.join(MotifConfig().get_index_dir(), genome)
        if genome is not None
        else None
    )

    scanner = Scanner()
    scanner.set_motifs(pwmfile)

    fa = as_fasta(inputfile, index_dir)

    # The match iterator is always created up front; the table branches below
    # replace it with their own iterator.
    if moods:
        result_it = scan_it_moods(
            inputfile, motifs, cutoff, bgfile, nreport, scan_rc, pvalue, table
        )
    else:
        result_it = scanner.scan(fa, nreport, scan_rc, cutoff)

    if table:
        yield "\t{}".format("\t".join(m.id for m in motifs))

        if moods:
            result_it = scan_it_moods(
                inputfile, motifs, cutoff, bgfile, nreport, scan_rc, pvalue, table
            )
            for seq_id, counts in result_it:
                yield "{}\t{}".format(seq_id, "\t".join(map(str, counts)))
        else:
            result_it = scanner.count(fa, nreport, scan_rc, cutoff)
            for i, counts in enumerate(result_it):
                yield "{}\t{}".format(fa.ids[i], "\t".join(map(str, counts)))
        return

    if score_table:
        result_it = scanner.best_score(fa, scan_rc)
        yield "\t{}".format("\t".join(m.id for m in motifs))
        for i, scores in enumerate(result_it):
            yield "{}\t{}".format(fa.ids[i], "\t".join(map(str, scores)))
        return

    if moods:
        # Here each match is unpacked as (pos, score, strand).
        for motif, hits in result_it:
            for seq_id, matches in hits.items():
                for pos, score, strand in matches:
                    yield format_line(fa, seq_id, motif, score, pos, strand, bed=bed)
        return

    # Here each match is unpacked as (score, pos, strand); results follow the
    # order of the sequences in ``fa``.
    for i, result in enumerate(result_it):
        seq_id = fa.ids[i]
        for motif, matches in zip(motifs, result):
            for score, pos, strand in matches:
                yield format_line(fa, seq_id, motif, score, pos, strand, bed=bed)
def moap(inputfile, method="classic", scoring="score", outfile=None, motiffile=None, pwmfile=None, genome=None, cutoff=0.95):
    """Run a single motif activity prediction algorithm.

    Parameters
    ----------
    inputfile : str
        File with regions (chr:start-end) in first column and either cluster
        name in second column or a table with values.
    method : str, optional
        Motif activity method to use. Any of 'classic', 'ks', 'mwu', 'rf',
        'lasso', 'lightning', 'mara', 'more'. Default is 'classic'.
    scoring : str, optional
        Either 'score' or 'count'.
    outfile : str, optional
        Name of outputfile to save the fitted activity values.
    motiffile : str, optional
        Table with motif scan results. First column should be exactly the same
        regions as in the inputfile. When omitted the regions are scanned.
    pwmfile : str, optional
        File with motifs in pwm format, only used when motiffile is not
        supplied. When omitted, the "motif_db" entry of the default parameters
        is looked up inside the configured motif directory.
    genome : str, optional
        Genome handed to the scanner. Required when motiffile is not supplied.
    cutoff : float, optional
        Cutoff for motif scanning (used when counting).

    Returns
    -------
    The fitted activity values (``act_`` of the fitted predictor); per the
    original documentation a pandas DataFrame with motif activity.

    Raises
    ------
    ValueError
        For an invalid ``scoring`` or ``method``, an input table that does not
        fit the method, a missing genome, or a missing/non-existent pwmfile.
    """
    if scoring not in ['score', 'count']:
        raise ValueError("valid values are 'score' and 'count'")

    config = MotifConfig()

    # read data
    df = pd.read_table(inputfile, index_col=0)

    if method in CLUSTER_METHODS:
        if df.shape[1] != 1:
            raise ValueError("1 column expected for {}".format(method))
    else:
        if np.dtype('object') in set(df.dtypes):
            raise ValueError(
                "columns should all be numeric for {}".format(method))
        if method not in VALUE_METHODS:
            raise ValueError("method {} not valid".format(method))

    if motiffile is None:
        if genome is None:
            raise ValueError("need a genome")

        # check pwmfile
        if pwmfile is None:
            pwmfile = config.get_default_params().get("motif_db", None)
            if pwmfile is not None:
                pwmfile = os.path.join(config.get_motif_dir(), pwmfile)

        if pwmfile is None:
            raise ValueError(
                "no pwmfile given and no default database specified")

        if not os.path.exists(pwmfile):
            raise ValueError("{} does not exist".format(pwmfile))

        # Parse the motif file once, inside a context manager, so the handle
        # is closed; the ids double as the column names of the scan table.
        try:
            with open(pwmfile) as handle:
                motif_names = [m.id for m in read_motifs(handle)]
        except Exception:
            sys.stderr.write("can't read motifs from {}".format(pwmfile))
            raise

        # initialize scanner
        s = Scanner()
        sys.stderr.write(pwmfile + "\n")
        s.set_motifs(pwmfile)
        s.set_genome(genome)

        # scan for motifs
        sys.stderr.write("scanning for motifs\n")
        regions = list(df.index)
        if method == 'classic' or scoring == "count":
            scores = list(s.count(regions, cutoff=cutoff))
        else:
            scores = list(s.best_score(regions))
        motifs = pd.DataFrame(scores, index=df.index, columns=motif_names)
    else:
        motifs = pd.read_table(motiffile, index_col=0)

    if method == "ks":
        clf = KSMoap()
    elif method == "mwu":
        clf = MWMoap()
    elif method == "rf":
        clf = RFMoap()
    elif method == "lasso":
        clf = LassoMoap()
    elif method == "lightning":
        clf = LightningMoap()
    elif method == "mara":
        clf = MaraMoap()
    elif method == "more":
        clf = MoreMoap()
    elif method == "classic":
        clf = ClassicMoap()
    else:
        # A method that passed validation but has no predictor used to end in
        # an AttributeError on None; fail with an explicit message instead.
        raise ValueError("method {} not valid".format(method))

    clf.fit(motifs, df)

    if outfile:
        # Header comments and the table are written through a single handle.
        with open(outfile, "w") as f:
            f.write(
                "# maelstrom - GimmeMotifs version {}\n".format(GM_VERSION))
            f.write("# method: {} with motif {}\n".format(method, scoring))
            if genome:
                f.write("# genome: {}\n".format(genome))
            if motiffile:
                f.write("# motif table: {}\n".format(motiffile))
            f.write("# {}\n".format(clf.act_description))
            clf.act_.to_csv(f, sep="\t")

    return clf.act_