def parse():
    """Strip the ``-genome``, ``-letter`` and ``-f`` options out of ``sys.argv``.

    Every option that is present is removed from ``sys.argv`` together with
    the token that follows it, and that token is stored in a module global:

    * ``-genome <file>``: stored in ``probefile``; ``PROBESET`` is then set to
      ``MotifMetrics.ProbeSet(<file>)`` and its ``factor`` attribute to 0.7.
    * ``-letter <value>``: stored in ``letter``.
    * ``-f <file>``: stored in ``fsafile``.

    Options that are absent leave their globals untouched.  An option that
    is the very last argument (no value after it) is removed from
    ``sys.argv`` and otherwise ignored.

    Errors raised while building the ProbeSet are NOT swallowed: a genome
    file that was explicitly requested but cannot be loaded should not leave
    ``probefile`` set while ``PROBESET`` is missing or stale.
    """
    global probefile, PROBESET, letter, fsafile

    def take_option(flag):
        """Remove ``flag`` and the token after it from sys.argv.

        Returns the removed value, or None when the flag is absent or has
        no value after it (in the latter case only the flag is removed).
        """
        argv = sys.argv
        try:
            idx = argv.index(flag)
        except ValueError:
            return None
        del argv[idx]
        if idx >= len(argv):
            return None
        return argv.pop(idx)

    genome_file = take_option('-genome')
    if genome_file is not None:
        probefile = genome_file
        # Deliberately outside any try/except so load failures surface.
        PROBESET = MotifMetrics.ProbeSet(probefile)
        PROBESET.factor = 0.7

    letter_value = take_option('-letter')
    if letter_value is not None:
        letter = letter_value

    fsa_value = take_option('-f')
    if fsa_value is not None:
        fsafile = fsa_value
def parse():
    """Strip the ``-genome <file>`` option out of ``sys.argv``.

    When the option is present, the flag and the token after it are removed
    from ``sys.argv``; the token is stored in the global ``probefile`` and
    the global ``PROBESET`` is set to ``MotifMetrics.ProbeSet(<file>)`` with
    its ``factor`` attribute set to 0.65.

    When the option is absent nothing changes.  When ``-genome`` is the very
    last argument (no value after it) only the flag is removed.

    Errors raised while building the ProbeSet are NOT swallowed: a genome
    file that was explicitly requested but cannot be loaded should not leave
    ``probefile`` set while ``PROBESET`` is missing or stale.
    """
    global probefile, PROBESET

    argv = sys.argv
    try:
        idx = argv.index('-genome')
    except ValueError:
        # Option not given: keep the current globals.
        return

    del argv[idx]
    if idx >= len(argv):
        # Flag was the last token, so there is no value to consume.
        return

    probefile = argv.pop(idx)
    # Deliberately outside any try/except so load failures surface.
    PROBESET = MotifMetrics.ProbeSet(probefile)
    PROBESET.factor = 0.65
def tamo2tamo(file, outname):
    """Load motifs from ``file``, fill in missing statistics, save to ``outname``.

    The probe ids used for the statistics are the keys of the FASTA file
    named by the global ``fsafile`` or, when that is unset, by
    ``find_fsa(file)``.  If the global ``probefile`` is unset, ``PROBESET``
    is (re)initialised to ``MotifMetrics.ProbeSet('YEAST')``.

    For each motif, only fields still holding their "not computed" value are
    filled in: ``pvalue``/``church`` when equal to 1, ``ROC_auc``/``frac``
    when None, and the ``numbound``/``nummotif``/``numboundmotif`` counts
    when ``numbound`` is 0.

    Parameters:
        file:    path handed to ``MotifTools.load``.
        outname: path handed to ``MotifTools.save_motifs``.
    """
    global probefile, PROBESET, fsafile

    motifs = MotifTools.load(file)
    fsaname = fsafile or find_fsa(file)
    print('# FSA  %s' % (fsaname,))

    fsaD = MotifMetrics.fasta2seqs(fsaname, 'want_dict')
    probes = fsaD.keys()
    # Set copy for O(1) membership tests; `probes` itself is still what gets
    # passed to the PROBESET methods.
    bound_ids = set(probes)

    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')

    # The conservation-based ROC AUC step was switched off in this function
    # (`if 0 and ...`); the flag keeps it available without running it.
    compute_cra = False

    print("# %d motifs" % len(motifs))
    for motif in motifs:
        if motif.pvalue == 1:
            motif.pvalue = PROBESET.p_value(motif, probes, 'v')
        if motif.church == 1:
            motif.church = PROBESET.church(motif, probes, 'v')
        if motif.ROC_auc is None:
            motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v')
        if motif.frac is None:
            motif.frac = PROBESET.frac(motif, probes, 'v', 0.7)
        if motif.numbound == 0:
            matching = PROBESET.matching_ids(motif, [], factor=0.7)
            matchbound = [x for x in matching if x in bound_ids]
            motif.numbound = len(probes)
            motif.nummotif = len(matching)
            motif.numboundmotif = len(matchbound)
        if compute_cra and motif.CRA is None:
            try:
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif, probes, 'v', tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except Exception:
                # Best effort: leave CRA/Cfrac unset when it cannot be computed.
                pass

    MotifTools.save_motifs(motifs, outname)
def main():
    """Command-line driver: run metaMDscan on a FASTA file and print motifs.

    Recognised options (parsed with ``getopt``):
        -f <file>        FASTA file to analyse
        --genome <name>  argument for ``MotifMetrics.ProbeSet`` (default 'YEAST')
        --top <n>        top_count passed to ``metaMDscan`` (default 10)
        --pcnt <p>       percentage of the filtered probe ids; top_count is
                         raised to that many ids when it is larger
        --range <a,b>    w_start,w_stop passed to ``metaMDscan`` (default 8,15)
        --bgfile <file>  accepted by the parser but not used by this function

    Each resulting motif gets ``pvalue`` and ``church`` computed against the
    probe set; motifs are then sorted by ascending ``pvalue`` and handed to
    ``print_motifs``.
    """
    short_opts = 'f:'
    long_opts = ['genome=', 'range=', 'top=', 'pcnt=', 'bgfile=']
    try:
        opts, _args = getopt.getopt(sys.argv[1:], short_opts, long_opts)
    except getopt.GetoptError as err:
        # Report the actual problem (e.g. "option --foo not recognized").
        print(err)
        usage()
        # Only reached if usage() returns; `opts` is unbound at this point,
        # so continuing would fail with an unrelated error.
        sys.exit(1)
    if not opts:
        usage()

    fastafile = ''
    top_count = 10
    top_pcnt = None
    genome = 'YEAST'
    w_start = 8
    w_stop = 15
    # NOTE(review): this default (and the --bgfile option) is never passed
    # on to metaMDscan in this function -- confirm whether that is intended.
    bgfile = MDSCAN_DIR + 'yeast_int.bg'

    for opt, value in opts:
        if opt == '-f':
            fastafile = value
        elif opt == '--genome':
            genome = value
        elif opt == '--top':
            top_count = int(value)
        elif opt == '--pcnt':
            top_pcnt = float(value)
        elif opt == '--range':
            w_start, w_stop = [int(x) for x in value.split(',')]

    print("#" + ' '.join(sys.argv))

    probeids = Fasta.keys(fastafile)
    Genome = MotifMetrics.ProbeSet(genome)
    probeids = Genome.filter(probeids)
    if top_pcnt:
        top_count = max(top_count, int(top_pcnt / 100.0 * len(probeids)))

    theMeta = metaMDscan(fastafile, w_start, w_stop, top_count)
    for m in theMeta.motifs:
        m.pvalue = Genome.p_value(m, probeids, 'v')
        m.church = Genome.church(m, probeids, 'v')
        sys.stdout.flush()

    # Ascending p-value; key sort keeps the same stable ordering as the
    # former cmp-based sort.
    theMeta.motifs.sort(key=lambda m: m.pvalue)
    print_motifs(theMeta.motifs)
def ace2tamo(filename, tamoname):
    """Convert an AlignACE (``.ace``) or MEME (``.meme``) result to a TAMO file.

    The motifs parsed from ``filename`` get their missing statistics filled
    in against the global ``PROBESET`` and are written with
    ``MotifTools.save_motifs`` to ``tamoname``.  If the global ``probefile``
    is unset, ``PROBESET`` is (re)initialised to
    ``MotifMetrics.ProbeSet('HUMAN_250')``.  The sequences of the FASTA file
    located via ``find_fsa`` are added to ``PROBESET.probes``.

    MEME motifs additionally get ``MAP = -log10(evalue)`` and are sorted by
    ``pvalue``; AlignACE motifs are sorted by ``church``.

    Parameters:
        filename: input path; must end in ``.ace`` or ``.meme``.
        tamoname: output path.

    Raises:
        ValueError: if ``filename`` has neither extension (previously this
            failed later with an unbound-variable error).
    """
    global probefile, PROBESET

    is_ace = re.search(r'\.ace$', filename)
    is_meme = re.search(r'\.meme$', filename)
    if is_ace:
        mdobject = AlignAce.AlignAce(filename)
    elif is_meme:
        mdobject = Meme.Meme(filename)
    else:
        raise ValueError(
            "ace2tamo: expected a '.ace' or '.meme' file, got %r" % (filename,))

    fsaname = find_fsa(mdobject.fastafile)
    fsaD = MotifMetrics.fasta2seqs(fsaname, 'want_dict')
    probes = fsaD.keys()

    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('HUMAN_250')
    for key, seq in fsaD.items():
        PROBESET.probes[key] = seq

    # Fill in only the statistics that still hold their "not computed" value.
    for motif in mdobject.motifs:
        if motif.pvalue == 1:
            motif.pvalue = PROBESET.p_value(motif, probes, 'v')
        if motif.church == 1:
            motif.church = PROBESET.church(motif, probes, 'v')
        if motif.E_site is None:
            motif.E_site = PROBESET.E_sitef(motif, probes, 3, 'v')
        if motif.E_seq is None:
            motif.E_seq = PROBESET.E_seq(motif, probes, 'v')
        if motif.ROC_auc is None:
            motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v')
        if motif.MNCP is None:
            motif.MNCP = PROBESET.MNCP(motif, probes, 'v')
        if is_meme:
            motif.MAP = -math.log(motif.evalue) / math.log(10)
        sys.stdout.flush()

    # Number the motifs in their parsed order and print the result of
    # Arith.avestd over the scan scores of 100 bogus k-mers for each one.
    for seednum, motif in enumerate(mdobject.motifs):
        motif.seednum = seednum
        kmers = motif.bogus_kmers(100)
        motif.maxscore = -100
        scores = [motif.scan(kmer)[2][0] for kmer in kmers]
        print(Arith.avestd(scores))

    # Ascending sort; key sort keeps the same stable ordering as the former
    # cmp-based sort.
    if is_meme:
        mdobject.motifs.sort(key=lambda m: m.pvalue)
    else:
        mdobject.motifs.sort(key=lambda m: m.church)

    MotifTools.save_motifs(mdobject.motifs, tamoname)
def main():
    """Command-line driver: run MEME on a FASTA file and print scored motifs.

    ``sys.argv[1]`` is the FASTA file.  Recognised options anywhere on the
    command line:
        -w <n>           motif width (int, default 0)
        -human           genome 'HUMAN'
        -H250            genome 'HUMAN_250'
        -Ch22            genome 'Ch22'
        -genome <name>   explicit genome (default 'YEAST')
        -bigdata         passes '-maxsize 2000000' as the extra argument
        -bfile <file>    background model file

    With no arguments a usage message is printed and the process exits with
    status 1.  An option that needs a value but is the last token now exits
    with an explanatory message instead of raising IndexError.

    After running ``Meme``, every motif gets a ``pvalue`` from the probe
    set, the motifs are printed with ``print_motifs``, followed by a line of
    80 '#' characters and the contents of ``theMeme.lines``.
    """
    if len(sys.argv) < 2:
        print("Usage: %s <fasta_file>" % (re.sub('^.*/', '', sys.argv[0])))
        print(" [-genome genomefile.fsa] Genome file (for computing Enrichment, etc...")
        print(" [-bfile file ] File for Markov Background Model")
        print(' [-bigdata ] Adds "-maxsize 2000000" for large datasets')
        sys.exit(1)

    def option_value(position, flag):
        """Return the token following ``flag`` at ``position``, or exit."""
        if position + 1 >= len(sys.argv):
            sys.exit('Option %s requires a value' % flag)
        return sys.argv[position + 1]

    fastafile = sys.argv[1]
    width = 0
    valid_tfs = []  # always empty here, so the validation loop below is a no-op
    genome = 'YEAST'
    xtra = ''
    bfile = None

    for i, tok in enumerate(sys.argv):
        if tok == '-w':
            width = int(option_value(i, tok))
        elif tok == '-human':
            genome = 'HUMAN'
        elif tok == '-H250':
            genome = 'HUMAN_250'
        elif tok == '-Ch22':
            genome = 'Ch22'
        elif tok == '-genome':
            genome = option_value(i, tok)
        elif tok == '-bigdata':
            xtra = '-maxsize 2000000'
        elif tok == '-bfile':
            bfile = option_value(i, tok)

    theMeme = Meme(fastafile, width, xtra, genome, bfile)
    Genome = MotifMetrics.ProbeSet(genome)
    ids = theMeme.probes
    motifs = theMeme.motifs

    for motif in motifs:
        motif.pvalue = Genome.p_value(motif, ids, 'v')
        for valid_tf in valid_tfs:
            motif.valid = Validate.validate(motif, valid_tf, '', 'Want Tuple')

    print_motifs(motifs)
    print('#' * 80)
    # Lines are written verbatim -- presumably each already carries its own
    # trailing newline (the original used `print line,`); TODO confirm.
    for line in theMeme.lines:
        sys.stdout.write(line)
def motifs2tamo(motifs, outname):
    """Fill in missing statistics on ``motifs`` and save them to ``outname``.

    The probe ids are the keys of the FASTA file located with
    ``find_fsa(outname)``.  If the global ``probefile`` is unset,
    ``PROBESET`` is (re)initialised to ``MotifMetrics.ProbeSet('YEAST')``.

    A statistic is computed only while its attribute still holds the
    placeholder value: 1 for ``pvalue`` and ``church``, None for ``E_site``,
    ``E_seq``, ``ROC_auc`` and ``MNCP``.  ``E_chi2`` is not computed here.
    """
    global probefile, PROBESET

    fasta_path = find_fsa(outname)
    seqs_by_id = MotifMetrics.fasta2seqs(fasta_path, 'want_dict')
    probes = seqs_by_id.keys()

    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')

    print("# %d motifs" % len(motifs))

    # (attribute, placeholder meaning "not computed yet", how to compute it),
    # listed in the order in which the statistics are evaluated.
    pending = (
        ('pvalue', 1, lambda m: PROBESET.p_value(m, probes, 'v')),
        ('church', 1, lambda m: PROBESET.church(m, probes, 'v')),
        ('E_site', None, lambda m: PROBESET.E_sitef(m, probes, 3, 'v')),
        ('E_seq', None, lambda m: PROBESET.E_seq(m, probes, 'v')),
        ('ROC_auc', None, lambda m: PROBESET.ROC_AUC(m, probes, 'v')),
        ('MNCP', None, lambda m: PROBESET.MNCP(m, probes, 'v')),
    )
    for motif in motifs:
        for attr, placeholder, compute in pending:
            if getattr(motif, attr) == placeholder:
                setattr(motif, attr, compute(motif))

    MotifTools.save_motifs(motifs, outname)
def random_seqs(numseq=50,genome='YEAST',want_dict=None):
    """Return the sequences of ``numseq`` distinct, randomly chosen probes.

    The ProbeSet for ``genome`` is cached in the global ``PROBESETS``.  On
    the first call (while the global ``BADPROBES`` is empty) the ids listed
    in ``BADPROBEFILES`` are loaded into ``BADPROBES`` and the candidate id
    list -- probe ids not in that bad list, passed through
    ``GenerateFastas.SimilarFilter(50).filter`` -- is cached in ``ALL_IDS``.

    Parameters:
        numseq:    number of distinct ids to draw.  As before, at least one
                   id is drawn even when ``numseq`` is 0 or negative.
        genome:    argument for ``MotifMetrics.ProbeSet``.
        want_dict: if true, return ``{id: sequence}``; otherwise a list of
                   sequences in the order drawn.

    Raises:
        ValueError: if more distinct ids are requested than are available
            (previously this looped forever, or raised IndexError when no
            ids were available at all).
    """
    global PROBESETS, BADPROBES, BADPROBEFILES, ALL_IDS

    if genome in PROBESETS:
        probeset = PROBESETS[genome]
    else:
        probeset = MotifMetrics.ProbeSet(genome)
        PROBESETS[genome] = probeset

    if not BADPROBES:
        bad = {}
        for path in BADPROBEFILES:
            with open(path) as handle:
                for line in handle:
                    bad[line.strip()] = 1
        BADPROBES = list(bad.keys())
        simfilter = GenerateFastas.SimilarFilter(50)
        # Membership is tested against the dict, not the BADPROBES list.
        candidates = [x for x in probeset.probes.keys() if x not in bad]
        ALL_IDS = simfilter.filter(candidates)
    # NOTE(review): ALL_IDS is cached from whichever genome was used on the
    # first call and is reused for every later genome -- confirm intended.

    ids = ALL_IDS
    numids = len(ids)
    wanted = max(numseq, 1)
    if wanted > len(set(ids)):
        raise ValueError(
            "random_seqs: %s sequences requested but only %d distinct ids available"
            % (numseq, len(set(ids))))

    # Rejection sampling: draw uniformly, keep only ids not drawn before.
    randomids = []
    seen = set()
    while len(randomids) < wanted:
        randomid = ids[int(random.random() * numids)]
        if randomid not in seen:
            seen.add(randomid)
            randomids.append(randomid)

    if not want_dict:
        seqs = [probeset.probes[randomid] for randomid in randomids]
    else:
        seqs = {}
        for randomid in randomids:
            seqs[randomid] = probeset.probes[randomid]
    return(seqs)
def memefiles2tamo(files, tamoname):
    """Merge motifs from several AlignACE/MEME result files into one TAMO file.

    Each file in ``files`` is parsed (``.ace`` with ``AlignAce.AlignAce``,
    anything matching ``.meme*`` with ``Meme.Meme``) and its motifs are
    appended to one combined list.  The FASTA file of the LAST input file
    supplies the probe ids used for all statistics, and its sequences are
    added to ``PROBESET.probes``.  If the global ``probefile`` is unset,
    ``PROBESET`` is (re)initialised to ``MotifMetrics.ProbeSet('YEAST')``.
    The combined list is written with ``MotifTools.save_motifs``.

    Parameters:
        files:    non-empty sequence of input paths.
        tamoname: output path.

    Raises:
        ValueError: if ``files`` is empty or contains a path that is neither
            a ``.ace`` nor a ``.meme*`` file.  Previously an unrecognised
            file silently re-added the previous file's motifs, or failed
            with an unbound-variable error.
    """
    global probefile, PROBESET

    if not files:
        raise ValueError("memefiles2tamo: no input files given")

    motifs = []
    for filename in files:
        print(">>>SDFSD>F  %s" % (filename,))
        if re.search(r'\.ace$', filename):
            mdobject = AlignAce.AlignAce(filename)
            if not mdobject.fastafile:
                mdobject.fastafile = filename.replace('.ace', '.fsa')
        elif re.search(r'\.meme.*$', filename):
            mdobject = Meme.Meme(filename)
            if not mdobject.fastafile:
                mdobject.fastafile = re.sub(r'\..\.meme', '.meme', filename).replace('.meme', '.fsa')
        else:
            raise ValueError(
                "memefiles2tamo: expected a '.ace' or '.meme' file, got %r"
                % (filename,))
        motifs.extend(mdobject.motifs)

    # From here on `filename` / `mdobject` refer to the last input file.
    print(mdobject.fastafile)
    fsaname = Fasta.find(mdobject.fastafile)
    fsaD = Fasta.load(fsaname)
    probes = fsaD.keys()

    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
    for key, seq in fsaD.items():
        PROBESET.probes[key] = seq

    # NOTE(review): the MEME-specific steps below are decided by the LAST
    # file's extension only, for every motif -- confirm this is intended
    # when .ace and .meme inputs are mixed.
    last_is_meme = re.search(r'\.meme$', filename)

    # Fill in only the statistics that still hold their "not computed" value.
    for motif in motifs:
        if motif.pvalue == 1:
            motif.pvalue = PROBESET.p_value(motif, probes, 'v')
        if motif.church == 1:
            motif.church = PROBESET.church(motif, probes, 'v')
        if motif.E_site is None:
            motif.E_site = PROBESET.E_sitef(motif, probes, 3, 'v')
        if motif.ROC_auc is None:
            motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v')
        if motif.MNCP is None:
            motif.MNCP = PROBESET.MNCP(motif, probes, 'v')
        if motif.frac is None:
            motif.frac = PROBESET.frac(motif, probes, 'v', 0.7)
        if last_is_meme:
            motif.MAP = -math.log(motif.evalue) / math.log(10)
        if motif.CRA is None:
            try:
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif, probes, 'v', tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except Exception:
                # Best effort: leave CRA/Cfrac unset when it cannot be computed.
                pass

    # NOTE(review): this sorts the last file's own motif list, not the
    # combined `motifs` list that is saved below, so it does not change the
    # order of the output.  Kept as-is; confirm whether `motifs` was meant.
    if last_is_meme:
        mdobject.motifs.sort(key=lambda m: m.pvalue)
    else:
        mdobject.motifs.sort(key=lambda m: m.church)

    MotifTools.save_motifs(motifs, tamoname)
def main():
    """Command-line driver: write one ``<experiment>.fsa`` file per experiment.

    After echoing the command line and calling ``parse_opts()``, the probe
    set named by the 'genome' argument is loaded into ``GLOBALS['GENOME']``.
    For every experiment (the 'expts' argument, or ``d.experiments`` when
    that is empty) a list of probe ids is selected, filtered, ordered by
    score, turned into a string with ``p.fsa_string_from_ids`` and written
    to ``<expt>.fsa`` (spaces in the name replaced by underscores).
    Progress and diagnostic lines starting with '#' are printed to stdout.
    """
    # Echo the command line, escaping spaces inside individual arguments.
    print "#" + ' '.join([x.replace(' ', '\ ') for x in sys.argv])
    parse_opts()
    ARGS = getarg('args')  # NOTE(review): not used anywhere below
    GLOBALS['GENOME'] = MotifMetrics.ProbeSet(getarg('genome'))
    print '# Loaded %s' % getarg('genome')

    # Collect the stripped lines of every file listed in BADPROBES.
    # NOTE(review): the file objects are never closed explicitly.
    badprobes = []
    for f in BADPROBES:
        b = [x.strip() for x in open(f).readlines()]
        badprobes.extend(b)

    d = getarg('DATA')
    # Presumably the ProbeSet stored in GLOBALS['GENOME'] above -- verify
    # against getarg().
    p = getarg('GENOME')
    S = SimilarFilter(50)
    experiments = getarg('expts')
    top = getarg('top')
    THRESH = getarg('pvalue')
    NO_FILTER = getarg('nofilter')
    ratioabove = getarg('ratioabove')
    if not experiments:
        experiments = d.experiments

    for expt in experiments:
        e = expt
        # Initial id selection, in order of precedence: the `top` entries
        # with the smallest scores, the ratio cut-off, or the THRESH cut-off.
        if top:
            _tups = d.scores(e)
            _tups.sort(lambda x, y: cmp(x[0], y[0]))
            unfiltered = [x[1] for x in _tups[0:top]]
        elif ratioabove:
            unfiltered = d.ratioabove(e, ratioabove)
        else:
            unfiltered = d.bound(e, THRESH)

        badfiltered = [x for x in unfiltered if not (x in badprobes)]
        #badfiltered = unfiltered # Turn back on for real data
        # Bad probes are only dropped when more than two of them were found.
        if len(unfiltered) - len(badfiltered) > 2:
            unfiltered = badfiltered
        #else: continue # Necessary when only wanting to regenerate problemed data

        bound_ids = p.filter(unfiltered)
        filtered_ids = bound_ids
        print '### Removed ', (len(bound_ids) - len(S.filter(bound_ids))), 'from ', expt
        # Unless 'nofilter' is set, apply the SimilarFilter and re-filter the
        # result through the probe set.
        if not NO_FILTER:
            filtered_ids = p.filter(S.filter(bound_ids))
        #filtered_ids = bound_ids # Turn back on for real data
        if NO_FILTER:
            print '#%-15s %3d ' % (expt, len(bound_ids))
        else:
            print '#%-15s Before %3d After %3d ' % (expt, len(bound_ids), len(filtered_ids))

        # Report the ids that p.filter() dropped, but only when more than two
        # were lost.
        if len(unfiltered) - len(bound_ids) > 2:
            diff = [x for x in unfiltered if (not x in bound_ids)] #l_andnot(unfiltered,bound_ids)
            print '%-15s %3d probes (out of %3d) without predicted sequences ' % (
                expt, len(diff), len(unfiltered))
            for _p in diff:
                print '# Absent in (%s) %s' % (expt, _p)
        #continue #Comment this

        #sort
        # Walk all (score, id) pairs in ascending score order and keep the
        # ids that survived filtering, together with their formatted scores.
        final_ids, final_scores = [], []
        _tups = d.scores(e) #Sometimes redundant, but who cares?
        _tups.sort(lambda x, y: cmp(x[0], y[0]))
        for score, id in _tups:
            #if (score <= THRESH) and (id in filtered_ids):
            if (id in filtered_ids): #Does this break everything?
                final_ids.append(id)
                final_scores.append('%8.4e' % score)

        # Print the last (i.e. largest) kept score, or None if nothing was kept.
        if final_scores:
            print "#%% %-15s %s" % (expt, final_scores[-1])
        else:
            print "#%% %-15s None" % (expt)

        s = p.fsa_string_from_ids(final_ids, final_scores)
        # Nothing to write for this experiment.
        if len(s) == 0:
            continue
        f = expt + '.fsa'
        f = re.sub(' ', '_', f)
        FID = open(f, 'w')
        FID.write(s)
        FID.close()
        sys.stdout.flush()