示例#1
0
def match(args):
    """Report the closest database motif for every motif in a PWM file.

    Prints a tab-separated table (motif id, best match id, score, p-value)
    and, when ``args.img`` is set, plots the aligned motif pairs.

    Parameters
    ----------
    args : object
        Must provide ``pwmfile`` (query motifs), ``dbpwmfile`` (database
        motifs) and ``img`` (image destination; falsy to skip plotting).
    """
    sample = dict([(m.id, m) for m in pwmfile_to_motifs(args.pwmfile)])
    db = dict([(m.id, m) for m in pwmfile_to_motifs(args.dbpwmfile)])

    mc = MotifComparer()
    result = mc.get_closest_match(
        list(sample.values()), list(db.values()), "partial", "wic", "mean")

    print("Motif\tMatch\tScore\tP-value")
    # ``hit`` instead of ``match`` so the loop does not shadow this function.
    for query, hit in result.items():
        pval, pos, orient = mc.compare_motifs(
            sample[query], db[hit[0]], "partial", "wic", "mean", pval=True)
        print("%s\t%s\t%0.2f\t%0.3e" % (query, hit[0], hit[1][0], pval))

    if args.img:
        # Uniform column used to pad the shorter side of an alignment.
        padding = [0.25, 0.25, 0.25, 0.25]
        plotdata = []
        for query, hit in result.items():
            motif = sample[query]
            dbmotif = db[hit[0]]
            pval, pos, orient = mc.compare_motifs(
                motif, dbmotif, "partial", "wic", "mean", pval=True)

            if orient == -1:
                # Plot the reverse complement, but keep the original id.
                tmp = dbmotif.id
                dbmotif = dbmotif.rc()
                dbmotif.id = tmp

            if pos < 0:
                tmp = motif.id
                motif = Motif([padding] * -pos + motif.pwm)
                motif.id = tmp
            elif pos > 0:
                tmp = dbmotif.id
                dbmotif = Motif([padding] * pos + dbmotif.pwm)
                dbmotif.id = tmp

            plotdata.append((motif, dbmotif, pval))

        # Draw once, after every pair is collected. The original call sat
        # inside the loop and redrew the growing figure on each iteration.
        # The guard keeps the old behaviour of not plotting when there are
        # no matches at all.
        if plotdata:
            match_plot(plotdata, args.img)
示例#2
0
def match(args):
    """Report the closest database motif for every motif in a PWM file.

    Prints a tab-separated table (motif id, best match id, score, p-value)
    and, when ``args.img`` is set, plots the aligned motif pairs.

    Parameters
    ----------
    args : object
        Must provide ``pwmfile`` (query motifs), ``dbpwmfile`` (database
        motifs) and ``img`` (image destination; falsy to skip plotting).
    """
    sample = dict([(m.id, m) for m in pwmfile_to_motifs(args.pwmfile)])
    db = dict([(m.id, m) for m in pwmfile_to_motifs(args.dbpwmfile)])

    mc = MotifComparer()
    result = mc.get_closest_match(sample.values(), db.values(), "partial", "wic", "mean")

    # Single-argument parenthesised prints behave identically as a Python 2
    # print statement and as a Python 3 function call.
    print("Motif\tMatch\tScore\tP-value")
    for query, hit in result.items():
        pval, pos, orient = mc.compare_motifs(sample[query], db[hit[0]], "partial", "wic", "mean", pval=True)
        print("%s\t%s\t%0.2f\t%0.3e" % (query, hit[0], hit[1][0], pval))

    if not args.img:
        return

    plotdata = []
    for query, hit in result.items():
        motif = sample[query]
        dbmotif = db[hit[0]]
        pval, pos, orient = mc.compare_motifs(motif, dbmotif, "partial", "wic", "mean", pval=True)

        if orient == -1:
            # Show the database motif reverse-complemented under its own id.
            tmp = dbmotif.id
            dbmotif = dbmotif.rc()
            dbmotif.id = tmp

        # Left-pad whichever motif starts later with uniform columns.
        if pos < 0:
            tmp = motif.id
            motif = Motif([[0.25,0.25,0.25,0.25]] * -pos + motif.pwm)
            motif.id = tmp
        elif pos > 0:
            tmp = dbmotif.id
            dbmotif = Motif([[0.25,0.25,0.25,0.25]] * pos + dbmotif.pwm)
            dbmotif.id = tmp

        plotdata.append((motif, dbmotif, pval))

    # Plot once with the complete data set; the original redrew the image
    # inside the loop for every motif. Nothing is drawn without matches,
    # exactly as before.
    if plotdata:
        match_plot(plotdata, args.img)
示例#3
0
	def determine_significant_motifs(self, background=["random"], organism="hg18", width=200, pvalue_cutoff=0.001, enrichment_cutoff=1.5):
		"""Filter the predicted motifs and write the significant ones to file.

		Enrichment is calculated for ``self.predicted_pwm`` against every
		requested background, the motif ids are passed through
		``self.filter_motifs`` once per background, and the motifs from
		``self.predicted_pfm`` whose id survived are written to
		``self.significant_pwm`` and ``self.significant_pfm``.

		background -- background types; keys into ``self.bg_file`` (only read)
		organism, width -- accepted but not used in this method
		pvalue_cutoff, enrichment_cutoff -- thresholds handed to ``filter_motifs``

		Returns the number of ids left after filtering.
		"""
		fg = [self.validation_fa, self.validation_gff]
		bg = [
			[self.bg_file["fa"][bg_type], self.bg_file["gff"][bg_type], self.bg_file["enrichment"][bg_type]]
			for bg_type in background
		]
		self.calculate_enrichment(self.predicted_pwm, fg, bg)

		self.logger.info("Determining significant motifs")
		self.logger.info("Thresholds: enrichment >= %s; p-value <= %s"% (enrichment_cutoff, pvalue_cutoff))
		all_motifs = pwmfile_to_motifs(self.predicted_pwm)
		filt_ids = [x.id for x in all_motifs]

		# Each background narrows the list produced by the previous one.
		for bg_id in background:
			filt_ids = self.filter_motifs(filt_ids, self.bg_file["enrichment"][bg_id], enrichment_cutoff, pvalue_cutoff)

		# Set for O(1) membership tests; the list is kept for the count.
		significant = set(filt_ids)

		# The with-block closes both files even when a write fails.
		with open(self.significant_pwm, "w") as f, open(self.significant_pfm, "w") as fp:
			for motif in pwmfile_to_motifs(self.predicted_pfm):
				if motif.id in significant:
					f.write("%s\n" % motif.to_pwm())
					fp.write("%s\n" % motif.to_pfm())

		self.logger.info("%s motifs are significant, written to %s" % (len(filt_ids), self.significant_pwm))
		return len(filt_ids)
示例#4
0
	def _calc_report_values(self, pwm, background):
		"""Collect the per-motif statistics that the report needs.

		Fills ``self.p`` and ``self.e`` (from the cluster enrichment files),
		``self.auc`` and ``self.mncp`` (from ``self._roc_metrics``) and
		``self.closest_match``. The first four are dicts keyed by background.

		pwm -- motif file, passed to ``_roc_metrics`` and ``pwmfile_to_motifs``
		background -- background names; supported here are "random",
		              "genomic" and "genomic_matched"
		"""
		self.p = dict([(b,{}) for b in background])
		self.e = dict([(b,{}) for b in background])

		e_files = {
			"random": self.bg_random_cluster_enrichment,
			"genomic": self.bg_genomic_cluster_enrichment,
			"genomic_matched": self.bg_genomic_matched_cluster_enrichment
		}

		for bg in self.p.keys():
			# Context manager: the original left every enrichment file open.
			with open(e_files[bg]) as handle:
				for line in handle:
					# Skip comment lines and the column header.
					if line.startswith("#") or line.startswith("Motif\tSig"):
						continue
					vals = line.strip().split("\t")
					# Column 0 is the motif id; columns 2 and 5 feed p and e.
					self.p[bg][vals[0]] = float(vals[2])
					self.e[bg][vals[0]] = float(vals[5])

		self.auc = dict([(b,{}) for b in background])
		self.mncp = dict([(b,{}) for b in background])
		rocs = {
			"random": [self.bg_random_fa, self.cluster_bg_random_roc_metrics],
			"genomic": [self.bg_genomic_fa, self.cluster_bg_genomic_roc_metrics],
			"genomic_matched": [self.bg_genomic_matched_fa, self.cluster_bg_genomic_matched_roc_metrics],
		}

		for bg in self.auc.keys():
			bg_fasta_file, roc_file = rocs[bg]
			self.auc[bg], self.mncp[bg] = self._roc_metrics(pwm, self.validation_fa, bg_fasta_file, roc_file)

		motifs = pwmfile_to_motifs(pwm)
		self.closest_match = self.determine_closest_match(motifs)
示例#5
0
def scan_fasta_file_with_motifs(fastafile, motiffile, threshold, gfffile):
	"""Scan a FASTA file with every motif in a motif file, appending to a GFF file.

	For each motif, ``pwm_scan_to_gff`` is called with ``nreport=1`` and
	``append=True``; ``threshold`` is converted to float for the cutoff.
	"""
	from gimmemotifs.fasta import Fasta
	from gimmemotifs.motif import pwmfile_to_motifs

	scan_motifs = pwmfile_to_motifs(motiffile)
	sequences = Fasta(fastafile)
	for current in scan_motifs:
		current.pwm_scan_to_gff(
			sequences,
			gfffile,
			nreport=1,
			cutoff=float(threshold),
			append=True,
		)
示例#6
0
def threshold(args):
    """Print, per motif, the score and relative cutoff for a given FDR.

    The best scores for ``args.inputfile`` are collected per motif. The score
    at percentile ``100 - 100 * args.fdr`` is reported, together with that
    score rescaled to the motif's minimum-to-maximum score range.

    Parameters
    ----------
    args : object
        Must provide ``fdr`` (between 0 and 1), ``pwmfile`` and ``inputfile``.
    """
    if args.fdr < 0 or args.fdr > 1:
        # Errors go to stderr, like the warning further down.
        sys.stderr.write("Please specify a FDR between 0 and 1\n")
        sys.exit(1)

    motifs = pwmfile_to_motifs(args.pwmfile)

    s = Scanner()
    s.set_motifs(args.pwmfile)

    # One entry per scanned sequence; transposing yields one row per motif.
    score_table = list(s.best_score(args.inputfile))

    print("Motif\tScore\tCutoff")
    for i, scores in enumerate(np.array(score_table).transpose()):
        motif = motifs[i]
        if len(scores) == 0:
            sys.stderr.write("Warning: no matches for {0}\n".format(motif.id))
            continue

        min_score = motif.pwm_min_score()
        opt_score = scoreatpercentile(scores, 100 - (100 * args.fdr))
        cutoff = (opt_score - min_score) / (
            motif.pwm_max_score() - min_score)
        print("{0}\t{1}\t{2}".format(motif.id, opt_score, cutoff))
示例#7
0
def threshold(args):
    """Report the score threshold per motif that matches the requested FDR.

    Output is a tab-separated table on stdout: motif id, score at percentile
    ``100 - 100 * args.fdr`` of the best scores, and that score expressed as
    a fraction of the motif's score range. Motifs without scores produce a
    warning on stderr instead of a row.

    Parameters
    ----------
    args : object
        Must provide ``fdr`` (between 0 and 1), ``pwmfile`` and ``inputfile``.
    """
    if not 0 <= args.fdr <= 1:
        # Keep diagnostics off stdout so the table stays machine-readable.
        sys.stderr.write("Please specify a FDR between 0 and 1\n")
        sys.exit(1)

    motifs = pwmfile_to_motifs(args.pwmfile)

    s = Scanner()
    s.set_motifs(args.pwmfile)

    score_table = list(s.best_score(args.inputfile))
    # Rows of the transposed table line up with the order of ``motifs``.
    per_motif_scores = np.array(score_table).transpose()

    print("Motif\tScore\tCutoff")
    for motif, scores in zip(motifs, per_motif_scores):
        if len(scores) > 0:
            min_score = motif.pwm_min_score()
            opt_score = scoreatpercentile(scores, 100 - (100 * args.fdr))
            cutoff = (opt_score - min_score) / (motif.pwm_max_score() -
                                                min_score)
            print("{0}\t{1}\t{2}".format(motif.id, opt_score, cutoff))
        else:
            sys.stderr.write("Warning: no matches for {0}\n".format(motif.id))
示例#8
0
	def _calc_report_values(self, pwm, background):
		"""Collect the per-motif statistics that the report needs.

		For every background in ``background`` this fills ``self.p`` and
		``self.e`` from ``self.bg_file["cluster_enrichment"]`` and
		``self.auc`` / ``self.mncp`` from ``self._roc_metrics``. It then
		stores the closest database matches in ``self.closest_match``.

		pwm -- motif file, passed to ``_roc_metrics`` and ``pwmfile_to_motifs``
		background -- iterable of background names (keys into ``self.bg_file``)
		"""
		self.logger.info("Calculating final statistics for report")
		self.p = dict([(b,{}) for b in background])
		self.e = dict([(b,{}) for b in background])

		e_files = dict([(bg, self.bg_file["cluster_enrichment"][bg]) for bg in background])

		for bg in self.p.keys():
			# The with-block closes the file; the original never did.
			with open(e_files[bg]) as handle:
				for line in handle:
					# Skip comment lines and the column header.
					if line.startswith("#") or line.startswith("Motif\tSig"):
						continue
					vals = line.strip().split("\t")
					# Column 0 is the motif id; columns 2 and 5 feed p and e.
					self.p[bg][vals[0]] = float(vals[2])
					self.e[bg][vals[0]] = float(vals[5])

		self.auc = dict([(b,{}) for b in background])
		self.mncp = dict([(b,{}) for b in background])

		rocs = dict([(bg, [self.bg_file["fa"][bg], self.bg_file["roc"][bg]]) for bg in background])

		for bg in self.auc.keys():
			bg_fasta_file, roc_file = rocs[bg]
			self.auc[bg], self.mncp[bg] = self._roc_metrics(pwm, self.validation_fa, bg_fasta_file, roc_file)

		motifs = pwmfile_to_motifs(pwm)
		self.closest_match = self.determine_closest_match(motifs)
示例#9
0
def location(args):
    """
    Create a histogram of motif locations for the selected motifs.

    One ``motif_localization`` job is submitted to the module-level ``pool``
    per selected motif; the function returns once all jobs have finished.

    Parameters
    ----------
    args : argparse object
        Command line arguments: ``fastafile``, ``pwmfile``, ``width``,
        ``ids`` (comma-separated, optional) and ``cutoff``.
    """
    fasta_path = args.fastafile

    region_width = args.width
    if not region_width:
        # No width given: use the length of the first sequence in the file.
        region_width = len(Fasta(fasta_path).items()[0][1])

    all_motifs = pwmfile_to_motifs(args.pwmfile)
    if args.ids:
        wanted = args.ids.split(",")
    else:
        wanted = [m.id for m in all_motifs]

    pending = [
        pool.apply_async(
            motif_localization,
            (fasta_path, m, region_width,
             os.path.join("%s_histogram" % m.id), args.cutoff))
        for m in all_motifs
        if m.id in wanted
    ]

    # Block until every job is done; get() re-raises worker exceptions.
    for task in pending:
        task.get()
示例#10
0
	def create_roc_plots(self, pwm_file, fg_fasta, bg_fasta, name):
		"""Write one ROC curve image per motif in ``pwm_file``.

		``get_roc_values`` is submitted to ``self.job_server()`` for every
		motif. Each result is plotted and saved in ``self.imgdir`` as
		``<motif id>_<name>_roc.png``. The process exits with status 1 as
		soon as a job reports an error.

		pwm_file -- motif file to read
		fg_fasta, bg_fasta -- FASTA files handed to ``get_roc_values``
		name -- label that becomes part of the image file name
		"""
		motifs = dict([(m.id, m) for m in pwmfile_to_motifs(pwm_file)])

		# Submit everything before collecting any result.
		jobs = {}
		for motif_id, motif in motifs.items():
			jobs[motif_id] = self.job_server().submit(get_roc_values, (motif,fg_fasta,bg_fasta,))

		roc_img_file = os.path.join(self.imgdir, "%s_%s_roc.png")
		# Same colour the original picked on every iteration
		# (entry 1 of its 11-step Paired palette).
		line_color = cm.Paired(256 // 11)

		for motif_id in motifs.keys():
			error, x, y = jobs[motif_id]()
			if error:
				self.logger.error("Error in thread: %s" % error)
				sys.exit(1)

			fig = plt.figure()
			try:
				plt.plot(x, y, color=line_color)
				# The original set xlim/ylim first, but this call replaced them.
				plt.axis([0,1,0,1])
				plt.xlabel("1 - Specificity")
				plt.ylabel("Sensitivity")
				plt.savefig(roc_img_file % (motif_id,name), format="png")
			finally:
				# Release the figure; otherwise one stays alive per motif.
				plt.close(fig)
示例#11
0
def location(args):
    """Run motif_localization for each selected motif and wait for the jobs.

    ``args`` supplies ``fastafile``, ``pwmfile``, ``width``, ``ids``
    (optional, comma-separated) and ``cutoff``. Jobs run on the module-level
    ``pool``; output names are ``<motif id>_histogram``.
    """
    source_fasta = args.fastafile
    motif_path = args.pwmfile

    span = args.width
    if not span:
        # Default to the length of the first sequence in the FASTA file.
        first_entry = Fasta(source_fasta).items()[0]
        span = len(first_entry[1])

    candidates = pwmfile_to_motifs(motif_path)
    selected_ids = args.ids.split(",") if args.ids else [c.id for c in candidates]

    submitted = []
    for candidate in candidates:
        if candidate.id not in selected_ids:
            continue
        target = os.path.join("%s_histogram" % candidate.id)
        job_args = (source_fasta, candidate, span, target, args.cutoff)
        submitted.append(pool.apply_async(motif_localization, job_args))

    for handle in submitted:
        handle.get()
示例#12
0
	def create_roc_plots(self, pwm_file, fg_fasta, bg_fasta, name):
		"""Run ``motif_roc.py`` once per motif to create its ROC image.

		The output prefix passed with ``-o`` is
		``<self.imgdir>/<motif id>_<name>_roc``. Each call is waited for
		before the next one starts.

		pwm_file -- motif file (also passed to the script with ``-p``)
		fg_fasta, bg_fasta -- sample and background files (``-s`` / ``-b``)
		name -- label that becomes part of the output name
		"""
		roc_img_file = os.path.join(self.imgdir, "%s_%s_roc")

		for motif in pwmfile_to_motifs(pwm_file):
			# An argument list instead of an interpolated shell string: ids
			# or paths with spaces or shell metacharacters reach the script
			# unchanged rather than being re-parsed by the shell.
			cmd = [
				"motif_roc.py",
				"-p", pwm_file,
				"-s", fg_fasta,
				"-b", bg_fasta,
				"-o", roc_img_file % (motif.id, name),
				"-i", motif.id,
				"-l",
			]
			p = Popen(cmd)
			p.communicate()
示例#13
0
 def _run_program(self, bin, fastafile, savedir, params={}):
     """Return the bundled JASPAR 2010 vertebrate motifs.

     No program is run: ``bin``, ``fastafile``, ``savedir`` and ``params``
     are ignored. Every motif id is prefixed with ``JASPAR_``. Returns a
     ``(motifs, stdout, stderr)`` tuple whose two text fields are empty.
     """
     import os
     from gimmemotifs.motif import pwmfile_to_motifs

     motif_dir = self.config.get_motif_dir()
     jaspar_motifs = pwmfile_to_motifs(
         os.path.join(motif_dir, "JASPAR2010_vertebrate.pwm"))
     for entry in jaspar_motifs:
         entry.id = "JASPAR_%s" % entry.id
     return jaspar_motifs, "", ""
示例#14
0
def scan_fasta_file_with_motifs(fastafile, motiffile, threshold, gfffile, scan_rc=True, nreport=1):
    """Scan a FASTA file with all motifs in ``motiffile``, appending to a GFF file.

    Exceptions are caught rather than raised. The original stored the
    exception but never returned it, so failures vanished silently.

    Returns
    -------
    None on success, otherwise the exception that interrupted the scan.
    """
    error = None
    try:
        motifs = pwmfile_to_motifs(motiffile)
        fa = Fasta(fastafile)
        for motif in motifs:
            motif.pwm_scan_to_gff(fa, gfffile, nreport=nreport, cutoff=float(threshold), scan_rc=scan_rc, append=True)
    except Exception as e:
        error = e
    return error
示例#15
0
def maxenr(args):
    """Print the maximum enrichment of each motif in a sample versus a background.

    For every selected motif, scores are collected for the sample and the
    background files (via ``get_scores`` on the module-level ``pool``).
    The score threshold with the highest enrichment is reported, along with
    the number of sample matches and the relative cutoff.

    Parameters
    ----------
    args : object
        Must provide ``sample``, ``background``, ``pwmfile`` and ``ids``
        (comma-separated motif ids; falsy selects all motifs).
    """
    for path in (args.sample, args.background):
        if not os.path.exists(path):
            print("File %s does not exist!" % path)
            sys.exit(1)

    pwmfile = args.pwmfile
    fg_file = args.sample
    bg_file = args.background

    motifs = dict([(x.id, x) for x in pwmfile_to_motifs(pwmfile)])

    if args.ids:
        ids = args.ids.split(",")
    else:
        ids = list(motifs.keys())

    # Validate every id before any work is submitted, and signal failure
    # with a non-zero status (the original exited with status 0).
    for motif_id in ids:
        if motif_id not in motifs:
            print("Wrong id: %s" % motif_id)
            sys.exit(1)

    fg_jobs = {}
    bg_jobs = {}
    for motif_id in ids:
        bg_jobs[motif_id] = pool.apply_async(get_scores, (motifs[motif_id], bg_file))
        fg_jobs[motif_id] = pool.apply_async(get_scores, (motifs[motif_id], fg_file))

    print("Motif\t# matches\tMax. enrichment\tScore\tCutoff")

    for motif_id in ids:
        motif = motifs[motif_id]
        pos = array(fg_jobs[motif_id].get())
        neg = array(bg_jobs[motif_id].get())
        # Corrects for the different sizes of sample and background.
        factor = len(neg) / float(len(pos))

        # Only thresholds that keep more than one background match.
        scores = array([s for s in hstack((pos, neg)) if sum(neg >= s) > 1])
        enr = array([(sum(pos >= x) / float(sum(neg >= x))) * factor for x in scores])

        best = enr.argmax()
        max_score = scores[best]
        min_score = motif.pwm_min_score()
        cutoff = (max_score - min_score) / (motif.pwm_max_score() - min_score)

        print("%s\t%s\t%0.2f\t%0.2f\t%0.3f" % (
            motif_id,
            sum(pos >= max_score),
            enr[best],
            max_score,
            cutoff,
        ))
示例#16
0
	def _cluster_motifs(self, pfm_file, cluster_pwm, dir, threshold):
		"""Cluster the motifs in ``pfm_file``; write images, report and PWM file.

		A single input motif forms its own cluster; otherwise
		``cluster_motifs`` builds the clusters. For each cluster an image of
		the trimmed cluster motif is written. For clusters with more than
		one member, an image is written per member, shifted (``add_left``)
		to line up with the cluster motif. The report is rendered from the
		kid template to ``self.cluster_report`` and the clustered motifs are
		written to ``cluster_pwm``.

		pfm_file -- input motif file
		cluster_pwm -- output file for the clustered motifs
		dir -- accepted but not used in this method
		threshold -- clustering threshold (converted to float)

		Returns the list of ``[cluster motif, members]`` pairs.
		"""
		self.logger.info("Clustering significant motifs.")

		trim_ic = 0.2
		tree = None
		motifs = pwmfile_to_motifs(pfm_file)
		if len(motifs) == 1:
			clusters = [[motifs[0], motifs]]
		else:
			tree = cluster_motifs(pfm_file, "total", "wic", "mean", True, threshold=float(threshold), include_bg=True)
			clusters = tree.getResult()

		ids = []
		mc = MotifComparer()

		for cluster,members in clusters:
			cluster.trim(trim_ic)
			cluster.to_img(os.path.join(self.imgdir,"%s.png" % cluster.id), format="PNG")
			ids.append([cluster.id, {"src":"images/%s.png" % cluster.id},[]])
			if len(members) > 1:
				scores = {}
				for motif in members:
					scores[motif] = mc.compare_motifs(cluster, motif, "total", "wic", "mean", pval=True)
				# Smallest position among the members; all offsets are
				# expressed relative to it so none is negative.
				add_pos = min(comparison[1] for comparison in scores.values())
				for motif in members:
					score, pos, strand = scores[motif]
					add = pos - add_pos

					if strand not in [1,"+"]:
						# Draw the reverse complement under the member's own id.
						rc = motif.rc()
						rc.id = motif.id
						motif = rc
					motif.to_img(os.path.join(self.imgdir, "%s.png" % motif.id.replace(" ", "_")), format="PNG", add_left=add)
			ids[-1][2] = [dict([("src", "images/%s.png" % motif.id.replace(" ", "_")), ("alt", motif.id.replace(" ", "_"))]) for motif in members]

		kid.enable_import()
		template_file = os.path.join(self.config.get_template_dir(), "cluster_template_v2.kid")
		template = kid.Template(file=template_file, expname=self.name, motifs=ids, inputfile=self.inputfile, date=datetime.today().strftime("%d/%m/%Y"), version=GM_VERSION)
		# with-blocks close the files even when serialising or writing fails.
		with open(self.cluster_report, "w") as f:
			f.write(template.serialize())

		with open(cluster_pwm, "w") as f:
			if len(clusters) == 1 and len(clusters[0][1]) == 1:
				# Single-motif case: no tree exists, write the motif itself.
				f.write("%s\n" % clusters[0][0].to_pwm())
			else:
				for motif in tree.get_clustered_motifs():
					f.write("%s\n" % motif.to_pwm())

		self.logger.info("Clustering done. See the result in %s" % self.cluster_report)
		return clusters
示例#17
0
def logo(args):
    """Write a PNG logo for each motif in ``args.pwmfile``.

    When ``args.ids`` is set (comma-separated), only motifs whose id is
    listed are drawn. Each image is named after the motif id.
    """
    candidates = pwmfile_to_motifs(args.pwmfile)

    if args.ids:
        requested = args.ids.split(",")
        selection = []
        for candidate in candidates:
            if candidate.id in requested:
                selection.append(candidate)
    else:
        selection = candidates

    for chosen in selection:
        chosen.to_img(chosen.id, fmt="PNG")
示例#18
0
def logo(args):
    """Render every selected motif from ``args.pwmfile`` as a PNG image.

    ``args.ids`` optionally restricts the output to a comma-separated list
    of motif ids; without it all motifs are rendered.
    """
    loaded = pwmfile_to_motifs(args.pwmfile)

    keep = args.ids.split(",") if args.ids else None
    to_draw = loaded if keep is None else [m for m in loaded if m.id in keep]

    for item in to_draw:
        item.to_img(item.id, fmt="PNG")
示例#19
0
    def _run_program(self, bin, fastafile, savedir="", params={}):
        """Run trawler on a copy of ``fastafile`` and collect its motifs.

        bin -- trawler executable
        fastafile -- input sequences; copied to a temporary file in
                     ``self.tmpdir`` before the run
        savedir -- accepted but not used in this method
        params -- overrides for the defaults ``single`` (False) and
                  ``background`` (None); a background file is required and
                  the process exits without one

        Returns ``(motifs, stdout, stderr)``. Motif ids are prefixed with
        ``self.name``. ``motifs`` is empty when no result file is found.
        """
        import os
        import shutil
        import tempfile
        from subprocess import Popen, PIPE

        default_params = {"single":False, "background":None}
        default_params.update(params)

        trawler = bin

        fastafile = os.path.abspath(fastafile)
        if not default_params["background"]:
            print("Background file needed!")
            sys.exit()
        bgfile = os.path.abspath(default_params["background"])

        tmp = tempfile.NamedTemporaryFile(dir=self.tmpdir, delete=False)
        # Only the name is needed; the original kept this handle open.
        tmp.close()

        motifs = []
        try:
            shutil.copy(fastafile, tmp.name)
            fastafile = tmp.name

            strand = "double"
            if default_params["single"]:
                strand = "single"
            cmd = "%s -sample %s -background %s -directory %s -strand %s" % (trawler, fastafile, bgfile, self.tmpdir, strand)

            current_path = os.getcwd()
            os.chdir(self.dir())
            try:
                p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
                stdout, stderr = p.communicate()
            finally:
                # Always restore the caller's working directory.
                os.chdir(current_path)

            # The result is looked up under the last "tmp*" entry of tmpdir.
            out_name = [entry for entry in os.listdir(self.tmpdir) if entry.startswith("tmp")][-1]
            out_file = os.path.join(self.tmpdir, out_name, "result", "%s.pwm" % out_name)
            if os.path.exists(out_file):
                motifs = pwmfile_to_motifs(out_file)
        finally:
            # Remove the temporary copy, also when the run failed.
            if os.path.exists(tmp.name):
                os.unlink(tmp.name)

        for motif in motifs:
            motif.id = "%s_%s" % (self.name, motif.id)

        return motifs, stdout, stderr
示例#20
0
def pwmscan(args):
    """Scan sequences with the motifs in ``args.pwmfile`` and print the matches.

    Output depends on the flags in ``args``:

    * ``table``: one row per sequence with the match count for each motif;
    * ``bed``: one six-column line per match; when the sequence id looks like
      ``chrom:start-end``, positions are shifted by ``start``;
    * otherwise: one GFF-style line per match (1-based start).
    """
    inputfile, nreport, cutoff = args.inputfile, args.nreport, args.cutoff
    bed, scan_rc = args.bed, args.scan_rc

    motifs = pwmfile_to_motifs(args.pwmfile)
    scan_results = scan_it(inputfile, motifs, cutoff, nreport, scan_rc)

    region_pattern = re.compile(r'([^\s:]+):(\d+)-(\d+)')
    fa = Fasta(inputfile)

    if args.table:
        counts_by_seq = dict((seq_id, {}) for seq_id in fa.ids)
        for motif, hits in scan_results:
            for seq_id, matches in hits.items():
                counts_by_seq[seq_id][motif] = len(matches)

        print("\t{}".format("\t".join([m.id for m in motifs])))
        for seq_id in fa.ids:
            row = [str(counts_by_seq[seq_id].get(m, 0)) for m in motifs]
            print("{}\t{}".format(seq_id, "\t".join(row)))
        return

    strand_symbol = {-1:"-",1:"+"}
    bed_line = "%s\t%s\t%s\t%s\t%s\t%s"
    for motif, hits in scan_results:
        width = len(motif)
        for seq_id, matches in hits.items():
            # Sequence ids of the form chrom:start-end carry coordinates.
            region = region_pattern.search(seq_id) if bed else None
            for (pos, score, strand) in matches:
                if bed:
                    if region:
                        name = region.group(1)
                        first = int(region.group(2)) + pos
                    else:
                        name = seq_id
                        first = pos
                    print(bed_line % (name, first, first + width, motif.id, score, strand_symbol[strand]))
                else:
                    print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tmotif_name \"%s\" ; motif_instance \"%s\"" % (
                        seq_id,
                        "pwmscan",
                        "misc_feature",
                        pos + 1,            # GFF is 1-based
                        pos + width,
                        score,
                        strand_symbol[strand],
                        ".",
                        motif.id,
                        fa[seq_id][pos: pos + width]
                    ))
示例#21
0
def scan_fasta_file_with_motifs(fastafile, motiffile, threshold, gfffile, scan_rc=True):
	"""Scan a FASTA file with all motifs in ``motiffile``, appending to a GFF file.

	The best match per sequence (``nreport=1``) is written for every motif.
	Exceptions are caught rather than raised. The original stored the
	exception but never returned it, so failures vanished silently.

	Returns None on success, otherwise the exception that stopped the scan.
	"""
	error = None
	try:
		from gimmemotifs.fasta import Fasta
		from gimmemotifs.motif import pwmfile_to_motifs
		motifs = pwmfile_to_motifs(motiffile)
		fa = Fasta(fastafile)
		for motif in motifs:
			motif.pwm_scan_to_gff(fa, gfffile, nreport=1, cutoff=float(threshold), scan_rc=scan_rc, append=True)
	except Exception as e:
		error = e
	return error
示例#22
0
	def _create_report(self, pwm, background):
		"""Render the motif report with the kid template.

		An image is written for every closest-match motif. One report entry
		is then built per motif in ``pwm``, ordered by descending MNCP for
		the sort background ("genomic_matched" when present, else "random").
		The rendered template is written to ``self.motif_report``.

		Relies on ``self.e``, ``self.p``, ``self.auc``, ``self.mncp`` and
		``self.closest_match`` having been filled beforehand.

		pwm -- motif file to report on
		background -- background names; "random" and "genomic_matched"
		              each add their own statistics to the report
		"""
		class ReportMotif:
			pass

		motifs = pwmfile_to_motifs(pwm)
		for m,match in self.closest_match.items():
			match[0].to_img(os.path.join(self.imgdir,"%s.png" % match[0].id), format="PNG")

		# Not named ``random`` so the local cannot be mistaken for the module.
		has_random = "random" in background
		genomic = "genomic_matched" in background
		sort_key = "random"
		if genomic:
			sort_key = "genomic_matched"

		roc_img_file = "%s_%s_roc"
		report_motifs = []
		# key/reverse gives the same stable, descending order as the old cmp sort.
		for motif in sorted(motifs, key=lambda m: self.mncp[sort_key][m.id], reverse=True):
			rm = ReportMotif()
			rm.id = motif.id
			rm.id_href = {"href": "#%s" % motif.id}
			rm.id_name = {"name": motif.id}
			rm.img = {"src":  os.path.join("images", "%s.png" % motif.id)}

			rm.consensus = motif.to_consensus()

			if has_random:
				roc_path = "images/" + os.path.basename(roc_img_file % (motif.id, "random")) + ".png"
				rm.random_e = "%0.2f" % self.e["random"][motif.id]
				rm.random_p = "%0.2f" % self.p["random"][motif.id]
				rm.random_auc = "%0.3f" % self.auc["random"][motif.id]
				rm.random_mncp = "%0.3f" % self.mncp["random"][motif.id]
				rm.random_roc_img = {"src": roc_path}
				rm.random_roc_img_link = {"href": roc_path}
			if genomic:
				roc_path = "images/" + os.path.basename(roc_img_file % (motif.id, "genomic_matched")) + ".png"
				rm.genomic_e = "%0.2f" % self.e["genomic_matched"][motif.id]
				rm.genomic_p = "%0.2f" % self.p["genomic_matched"][motif.id]
				rm.genomic_auc = "%0.3f" % self.auc["genomic_matched"][motif.id]
				rm.genomic_mncp = "%0.3f" % self.mncp["genomic_matched"][motif.id]
				rm.genomic_roc_img = {"src": roc_path}
				rm.genomic_roc_img_link = {"href": roc_path}
			rm.histogram_img = {"data":"images/%s_histogram.svg" % motif.id}
			rm.histogram_link= {"href":"images/%s_histogram.svg" % motif.id}
			closest, closest_pval = self.closest_match[motif.id][0], self.closest_match[motif.id][1]
			rm.match_img = {"src":  "images/%s.png" % closest.id}
			rm.match_id = closest.id
			rm.match_pval = "%0.2e" % closest_pval

			report_motifs.append(rm)

		total_report = self.motif_report
		kid.enable_import()
		template_file = os.path.join(self.config.get_template_dir(), "report_template_v2.kid")
		template = kid.Template(file=template_file, expname=self.name, motifs=report_motifs, random=has_random, genomic=genomic, inputfile=self.inputfile, date=datetime.today().strftime("%d/%m/%Y"), version=self.VERSION)
		# The with-block closes the file even when serialising or writing fails.
		with open(total_report, "w") as f:
			f.write(template.serialize())
示例#23
0
    def _run_program(self, bin, fastafile, savedir="", params={}):
        """Run ``homer denovo`` on ``fastafile`` and load the motifs it writes.

        ``params`` may override the defaults ``single``, ``background``,
        ``analysis``, ``number`` and ``width``. A background file is
        mandatory; without it a message is printed and the process exits.
        ``savedir`` is not used.

        Returns ``(motifs, stdout, stderr)``. ``stdout`` starts with the
        command line that was run, and motif ids are rewritten to
        ``<self.name>_<width>_<rank>``.
        """
        import os, tempfile, shutil
        from subprocess import Popen, PIPE

        settings = {"single":False, "background":None, "analysis":"medium", "number":5, "width":10}
        settings.update(params)

        input_fasta = os.path.abspath(fastafile)

        # Background file is essential!
        if not settings["background"]:
            print("Background file needed!")
            sys.exit()

        background_fasta = os.path.abspath(settings["background"])

        # Only the generated path is used; the temporary file object is
        # deliberately not kept, so it must stay a single expression.
        result_path = tempfile.NamedTemporaryFile(
                dir=self.tmpdir,
                prefix= "homer_w{}.".format(settings["width"])
                ).name

        strand_option = " -strand + " if settings["single"] else ""

        cmd = "%s denovo -i %s -b %s -len %s -S %s %s -o %s -p 8" % (
            bin,
            input_fasta,
            background_fasta,
            settings["width"],
            settings["number"],
            strand_option,
            result_path)

        stdout = "Running command:\n{}\n".format(cmd)
        stderr = ""

        process = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE, cwd=self.tmpdir)
        captured_out, captured_err = process.communicate()
        stdout += captured_out
        stderr += captured_err

        if not os.path.exists(result_path):
            return [], stdout, stderr

        found = pwmfile_to_motifs(result_path)
        for rank, found_motif in enumerate(found, 1):
            found_motif.id = "{}_{}_{}".format(self.name, settings["width"], rank)

        return found, stdout, stderr
示例#24
0
	def _create_report(self, pwm, background):
		"""Render the graphical motif report with the kid template.

		An image is written for every closest-match motif. One report entry
		is then built per motif in ``pwm``, ordered by descending MNCP for
		the sort background ("genomic_matched" when present, else the first
		entry of ``background``). The rendered template is written to
		``self.motif_report``.

		Relies on ``self.e``, ``self.p``, ``self.auc``, ``self.mncp`` and
		``self.closest_match`` having been filled beforehand.

		pwm -- motif file to report on
		background -- non-empty sequence of background names
		"""
		self.logger.info("Creating graphical report")
		class ReportMotif:
			pass

		motifs = pwmfile_to_motifs(pwm)
		for m,match in self.closest_match.items():
			match[0].to_img(os.path.join(self.imgdir,"%s.png" % match[0].id), format="PNG")

		sort_key = background[0]
		if "genomic_matched" in background:
			sort_key = "genomic_matched"

		roc_img_file = "%s_%s_roc"
		report_motifs = []
		# key/reverse gives the same stable, descending order as the old cmp sort.
		for motif in sorted(motifs, key=lambda m: self.mncp[sort_key][m.id], reverse=True):
			rm = ReportMotif()
			rm.id = motif.id
			rm.id_href = {"href": "#%s" % motif.id}
			rm.id_name = {"name": motif.id}
			rm.img = {"src":  os.path.join("images", "%s.png" % motif.id)}

			rm.consensus = motif.to_consensus()

			rm.bg = {}
			for bg in background:
				roc_path = "images/" + os.path.basename(roc_img_file % (motif.id, bg)) + ".png"
				rm.bg[bg] = {
					"e": "%0.2f" % self.e[bg][motif.id],
					"p": "%0.2f" % self.p[bg][motif.id],
					"auc": "%0.3f" % self.auc[bg][motif.id],
					"mncp": "%0.3f" % self.mncp[bg][motif.id],
					"roc_img": {"src": roc_path},
					"roc_img_link": {"href": roc_path},
				}

			rm.histogram_img = {"data":"images/%s_histogram.svg" % motif.id}
			rm.histogram_link= {"href":"images/%s_histogram.svg" % motif.id}
			closest, closest_pval = self.closest_match[motif.id][0], self.closest_match[motif.id][1]
			rm.match_img = {"src":  "images/%s.png" % closest.id}
			rm.match_id = closest.id
			rm.match_pval = "%0.2e" % closest_pval

			report_motifs.append(rm)

		total_report = self.motif_report
		kid.enable_import()
		template_file = os.path.join(self.config.get_template_dir(), "report_template_v2.kid")
		template = kid.Template(file=template_file, expname=self.name, motifs=report_motifs, inputfile=self.inputfile, date=datetime.today().strftime("%d/%m/%Y"), version=GM_VERSION)
		# The with-block closes the file even when serialising or writing fails.
		with open(total_report, "w") as f:
			f.write(template.serialize())
示例#25
0
	def create_location_plots(self, motif_file, fasta_file, params):
		"""Create one localization histogram per motif in ``motif_file``.

		The validation regions are first written to ``self.location_fa``,
		extended on both sides by half the difference between
		``params["lwidth"]`` and ``params["width"]``. Then
		``motif_localization`` is run for every motif, writing
		``<motif id>_histogram.svg`` into ``self.imgdir``.

		motif_file -- motif file to plot
		fasta_file -- sequences handed to ``motif_localization``
		params -- needs "genome", "lwidth", "width" and "use_strand"
		"""
		self.logger.info("Creating localization plots")
		index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
		lwidth = int(params["lwidth"])
		width = int(params["width"])
		# Floor division: an integer flank size on Python 2 and 3 alike
		# (plain "/" would yield a float under Python 3).
		extend = (lwidth - width) // 2

		genome_index.track2fasta(index_dir, self.validation_bed, self.location_fa, extend_up=extend, extend_down=extend, use_strand=params["use_strand"])

		for motif in pwmfile_to_motifs(motif_file):
			outfile = os.path.join(self.imgdir, "%s_histogram.svg" % motif.id)
			motif_localization(fasta_file, motif, lwidth, outfile)
示例#26
0
def roc(args):
    """Calculate ROC AUC and other metrics, and optionally plot the ROC curves.

    Sample and background files are scanned for the best match per sequence.
    For every selected motif, the ROC AUC, MNCP, enrichment at 5% FDR and
    maximum enrichment are printed as a tab-separated row.

    Parameters
    ----------
    args : object
        Must provide ``pwmfile``, ``sample``, ``background``, ``ids``
        (comma-separated; falsy selects all motifs) and ``outfile`` (image
        path; ".png" is appended when missing; falsy skips the plot).
    """
    outputfile = args.outfile
    # Default extension for image
    if outputfile and not outputfile.endswith(".png"):
        outputfile += ".png"

    motifs = dict([(x.id, x) for x in pwmfile_to_motifs(args.pwmfile)])

    if args.ids:
        ids = args.ids.split(",")
    else:
        ids = list(motifs.keys())
    selected = [motifs[x] for x in ids]

    def best_scores(fasta_file):
        """Map each motif id to the best score found in every sequence."""
        totals = {}
        for motif, hits in scan(fasta_file, selected, 0.0, 1).items():
            # Only the part of the id before the first tab identifies the motif.
            totals[motif.id.split("\t")[0]] = [matches[0][1] for matches in hits.values()]
        return totals

    fg_total = best_scores(args.sample)
    bg_total = best_scores(args.background)

    plot_x = []
    plot_y = []
    # Print the metrics
    print("Motif\tROC AUC\tMNCP\tEnr. at 5% FDR\tMax enr.")
    for motif_id in ids:
        fg_vals = fg_total[motif_id]
        bg_vals = bg_total[motif_id]
        (x, y) = ROC_values(fg_vals, bg_vals)
        plot_x.append(x)
        plot_y.append(y)
        auc = ROC_AUC(fg_vals, bg_vals)
        mncp = MNCP(fg_vals, bg_vals)
        enr_fdr = enr_at_fdr(fg_vals, bg_vals)
        max_enr, _score = max_enrichment(fg_vals, bg_vals)
        # MNCP now uses %0.3f like the AUC column; the original "%03f" was a
        # typo that printed six decimals.
        print("%s\t%0.3f\t%0.3f\t%0.2f\t%0.2f" % (motif_id, auc, mncp, enr_fdr, max_enr))

    # Plot the ROC curve
    if outputfile:
        roc_plot(outputfile, plot_x, plot_y, ids=ids)
示例#27
0
	def determine_closest_match(self, motifs):
		"""Find, for every motif, its closest match in the bundled database.

		The database is the first file in the motif directory whose name
		starts with "jaspar"; it is read as pwm/pfm or transfac depending on
		its extension. Returns a dict that maps each motif id to
		``[matching database motif, p-value]``.
		"""
		motif_dir = self.config.get_motif_dir()
		candidates = [name for name in os.listdir(motif_dir) if name.startswith("jaspar")]
		db_path = os.path.join(motif_dir, candidates[0])

		if db_path.endswith(("pwm", "pfm")):
			database = pwmfile_to_motifs(db_path)
		elif db_path.endswith("transfac"):
			database = transfac_to_motifs(db_path)
		else:
			database = []

		comparer = MotifComparer()
		by_id = dict((entry.id, entry) for entry in database)
		best = comparer.get_closest_match(motifs, database, "partial", "wic", "mean")

		results = {}
		for query in motifs:
			target = by_id[best[query.id][0]]
			# Only the p-value is kept; position and orientation are not needed.
			pval, pos, orient = comparer.compare_motifs(query, target, "partial", "wic", "mean", pval=True)
			results[query.id] = [target, pval]
		return results
示例#28
0
def cluster(args):
    """Cluster the motifs in ``args.inputfile`` and write images and a report.

    Parameters
    ----------
    args : object
        Must provide ``outdir`` (created when missing), ``inputfile``,
        ``threshold`` and ``ncpus``.
    """
    outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    ncpus = args.ncpus

    # Bound up front: with exactly one input motif no clustering is run, and
    # the original hit an UnboundLocalError when passing ``tree`` on.
    # NOTE(review): confirm _write_report accepts tree=None for that case.
    tree = None
    motifs = pwmfile_to_motifs(args.inputfile)
    if len(motifs) == 1:
        # A single motif is its own cluster.
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(args.inputfile, "total", "wic", "mean", True, threshold=args.threshold, include_bg=True, ncpus=ncpus)
        clusters = tree.getResult()

    ids = _create_images(outdir, clusters)
    _write_report(outdir, ids, tree, clusters)
示例#29
0
	def determine_closest_match(self, motifs):
		"""Return the closest database motif and its p-value for every motif.

		Uses the first file in the motif directory whose name starts with
		"jaspar" (pwm/pfm or transfac format, chosen by extension). The
		closest-match search is run with ``parallel=False``. The result maps
		motif id to ``[database motif, p-value]``.
		"""
		self.logger.info("Determining closest matching motifs in database (JASPAR)")
		db_dir = self.config.get_motif_dir()
		db_name = [fname for fname in os.listdir(db_dir) if fname.startswith("jaspar")][0]
		db_file = os.path.join(db_dir, db_name)

		reference = []
		if db_file.endswith("pwm") or db_file.endswith("pfm"):
			reference = pwmfile_to_motifs(db_file)
		elif db_file.endswith("transfac"):
			reference = transfac_to_motifs(db_file)

		comparer = MotifComparer()
		lookup = {}
		for ref_motif in reference:
			lookup[ref_motif.id] = ref_motif
		nearest = comparer.get_closest_match(motifs, reference, "partial", "wic", "mean", parallel=False)

		matched = {}
		for current in motifs:
			partner = lookup[nearest[current.id][0]]
			# Calculate p-value
			pval, pos, orient = comparer.compare_motifs(current, partner, "partial", "wic", "mean", pval=True)
			matched[current.id] = [partner, pval]
		return matched
示例#30
0
	def _create_text_report(self, pwm, background):
		"""Write a tab-separated statistics table to ``self.text_report``.

		One row per motif in ``pwm``: id, consensus, closest database match
		and its p-value, followed by enrichment, p-value, ROC AUC and MNCP
		for every background. Rows are ordered by descending MNCP for the
		sort background ("genomic_matched" when present, else the first
		entry of ``background``).

		pwm -- motif file to report on
		background -- non-empty sequence of background names
		"""
		self.logger.info("Creating text report")
		motifs = pwmfile_to_motifs(pwm)

		sort_key = background[0]
		if "genomic_matched" in background:
			sort_key = "genomic_matched"

		header = "ID\tconsensus\tBest match JASPAR\tp-value JASPAR\t" + "\t".join("Enrichment (%s)\tp-value (%s)\tROC AUC (%s)\tMNCP (%s)" % (b,b,b,b) for b in background)
		# key/reverse gives the same stable, descending order as the old cmp sort.
		ranked = sorted(motifs, key=lambda m: self.mncp[sort_key][m.id], reverse=True)

		# The with-block closes the report even when a lookup or write fails.
		with open(self.text_report, "w") as f:
			f.write("%s\n" % header)
			for motif in ranked:
				vals = [motif.id, motif.to_consensus(), self.closest_match[motif.id][0].id, self.closest_match[motif.id][1]]
				for bg in background:
					vals += [self.e[bg][motif.id], self.p[bg][motif.id], self.auc[bg][motif.id], self.mncp[bg][motif.id]]
				f.write("%s\n" % "\t".join([str(x) for x in vals]))
示例#31
0
def threshold(args):
    """
    Print a per-motif score threshold for the requested FDR.

    Parameters
    ----------
    args : argparse object
        Command line arguments; ``fdr``, ``pwmfile`` and ``inputfile``
        are read.

    For every motif with at least one hit, one tab-separated line is
    printed: motif id, the score at percentile ``100 - 100 * fdr`` of the
    per-sequence scores, and that score rescaled between the motif's
    minimum and maximum PWM score. Motifs without hits produce a warning
    on stderr. Exits with status 1 when ``fdr`` is outside [0, 1].
    """
    if args.fdr < 0 or args.fdr > 1:
        print("Please specify a FDR between 0 and 1")
        sys.exit(1)

    motifs = pwmfile_to_motifs(args.pwmfile)
    result = scan(args.inputfile, motifs, 0.0, 1)

    print("Motif\tScore\tCutoff")
    for motif in result.keys():
        min_score = motif.pwm_min_score()
        # Second field of the first reported hit of every sequence that has one
        scores = [x[0][1] for x in result[motif].values() if len(x) > 0]
        if len(scores) > 0:
            opt_score = scoreatpercentile(scores, 100 - (100 * args.fdr))
            cutoff = (opt_score - min_score) / (motif.pwm_max_score() - min_score)
            print("{0}\t{1}\t{2}".format(motif.id, opt_score, cutoff))
        else:
            sys.stderr.write("Warning: no matches for {0}\n".format(motif.id))
示例#32
0
def command_scan(inputfile,
                 pwmfile,
                 nreport=1,
                 fpr=0.01,
                 cutoff=None,
                 bed=False,
                 scan_rc=True,
                 table=False,
                 score_table=False,
                 moods=False,
                 pvalue=None,
                 bgfile=None,
                 genome=None,
                 ncpus=None):
    """
    Scan `inputfile` with the motifs from `pwmfile` and yield output rows.

    This is a generator. The rows come from scan_table() when `table` is
    set, from scan_score_table() when only `score_table` is set, and from
    scan_normal() otherwise.
    """
    motifs = pwmfile_to_motifs(pwmfile)
    fa = as_fasta(inputfile, genome)

    scanner = Scanner(ncpus=ncpus)
    scanner.set_motifs(pwmfile)

    # A threshold is only configured when no score table was requested.
    if not score_table:
        scanner.set_threshold(
            fpr=fpr,
            threshold=cutoff,
            genome=genome,
            length=fa.median_length(),
            filename=bgfile,
        )

    if table:
        rows = scan_table(
            scanner, inputfile, fa, motifs, cutoff, bgfile, nreport,
            scan_rc, pvalue, moods,
        )
    elif score_table:
        rows = scan_score_table(scanner, fa, motifs, scan_rc)
    else:
        rows = scan_normal(
            scanner, inputfile, fa, motifs, cutoff, bgfile, nreport,
            scan_rc, pvalue, moods, bed,
        )

    for row in rows:
        yield row
示例#33
0
	def create_location_plots(self, motif_file, params):
		"""Create a "<motif id>_histogram.svg" localization plot per motif.

		motif_file -- PWM file, parsed with pwmfile_to_motifs
		params     -- dict; "genome", "lwidth", "width" and "use_strand"
		              are read when self.input_type is "BED"

		For BED input the validation regions are first written to
		self.location_fa, extended on both sides by half the difference
		between "lwidth" and "width". For any other input the validation
		FASTA is used directly and the length of its first sequence is
		taken as the plot width.
		"""
		self.logger.info("Creating localization plots")
		if self.input_type == "BED":
			index_dir = os.path.join(self.config.get_index_dir(), params["genome"])
			lwidth = int(params["lwidth"])
			width = int(params["width"])
			# Floor division keeps the integer result that "/" produced for
			# two ints on Python 2; on Python 3 "/" would yield a float.
			extend = (lwidth - width) // 2

			genome_index.track2fasta(index_dir, self.validation_bed, self.location_fa, extend_up=extend, extend_down=extend, use_strand=params["use_strand"])
		else:
			self.location_fa = self.validation_fa
			seqs = Fasta(self.location_fa).seqs
			lwidth = len(seqs[0])
			if not all(len(seq) == lwidth for seq in seqs):
				# Logger.warning replaces the deprecated Logger.warn alias.
				self.logger.warning("PLEASE NOTE: FASTA file contains sequences of different lengths. Positional preference plots will be incorrect!")

		motifs = pwmfile_to_motifs(motif_file)
		for motif in motifs:
			outfile = os.path.join(self.imgdir, "%s_histogram.svg" % motif.id)
			motif_localization(self.location_fa, motif, lwidth, outfile)
示例#34
0
	def _roc_metrics(self, pwm, sample_fa, bg_fa, roc_file):
		"""Compute ROC statistics for every motif in `pwm` and write them to `roc_file`.

		pwm       -- PWM file, parsed with pwmfile_to_motifs
		sample_fa -- passed to get_scores as the sample FASTA
		bg_fa     -- passed to get_scores as the background FASTA
		roc_file  -- output path; receives a header plus one tab-separated
		             line per motif (id, ROC AUC, MNCP, max f-measure,
		             sensitivity at max f-measure)

		Returns a tuple of two dicts keyed by motif id: (ROC AUC, MNCP).
		Logs the error and exits with status 1 as soon as a job reports one.
		"""
		motifs = dict([(m.id, m) for m in pwmfile_to_motifs(pwm)])

		# Submit one scoring job per motif before collecting any result.
		jobs = {}
		for motif_id, motif in motifs.items():
			jobs[motif_id] = self.job_server().submit(get_scores, (motif,sample_fa,bg_fa,))

		all_auc = {}
		all_mncp = {}
		# The context manager also closes the file on the sys.exit() path below.
		with open(roc_file, "w") as f:
			f.write("Motif\tROC AUC\tMNCP\tMax f-measure\tSens @ max f-measure\n")
			for motif_id in motifs:
				error, auc, mncp, max_f, y = jobs[motif_id]()
				if error:
					self.logger.error("Error in thread: %s" % error)
					sys.exit(1)
				f.write("%s\t%s\t%s\t%s\t%s\n" % (motif_id,auc,mncp,max_f,y))
				all_auc[motif_id] = auc
				all_mncp[motif_id] = mncp

		return all_auc,all_mncp
示例#35
0
def location(args):
    """
    Creates histogram of motif location.

    Parameters
    ----------
    args : argparse object
        Command line arguments; ``fastafile``, ``pwmfile``, ``width``,
        ``ids`` and ``cutoff`` are read.

    One motif_localization job per selected motif is run in a worker
    pool; the output name of each job is "<motif id>_histogram". When no
    width is given, the length of the first FASTA sequence is used.
    """
    fastafile = args.fastafile
    pwmfile = args.pwmfile

    lwidth = args.width
    if not lwidth:
        # list() keeps the lookup working when items() hands back an
        # iterator rather than a list.
        lwidth = len(list(Fasta(fastafile).items())[0][1])

    motifs = pwmfile_to_motifs(pwmfile)
    if args.ids:
        ids = set(args.ids.split(","))
    else:
        ids = set(motif.id for motif in motifs)

    n_cpus = int(MotifConfig().get_default_params()["ncpus"])
    pool = Pool(processes=n_cpus, maxtasksperchild=1000)
    try:
        jobs = []
        for motif in motifs:
            if motif.id in ids:
                outfile = "%s_histogram" % motif.id
                jobs.append(
                    pool.apply_async(
                        motif_localization,
                        (fastafile, motif, lwidth, outfile, args.cutoff)
                    ))

        # get() re-raises any exception that occurred in a worker
        for job in jobs:
            job.get()
    finally:
        # Release the worker processes instead of leaving them to the
        # garbage collector.
        pool.close()
        pool.join()
示例#36
0
def cluster(args):
    """
    Cluster the motifs of ``args.inputfile`` and write a report.

    Parameters
    ----------
    args : argparse object
        Command line arguments; ``inputfile``, ``outdir`` and
        ``threshold`` are read.

    Files written to the output directory (created when missing):
    a PNG per cluster motif, a PNG per member motif of every cluster with
    more than one member, ``cluster_report.html``, ``cluster_key.txt``
    (cluster id and its member ids) and ``clustered_motifs.pwm``.
    """
    outdir = os.path.abspath(args.outdir)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    trim_ic = 0.2
    motifs = pwmfile_to_motifs(args.inputfile)
    if len(motifs) == 1:
        # Nothing to cluster: the single motif is its own cluster
        clusters = [[motifs[0], motifs]]
    else:
        tree = cluster_motifs(args.inputfile, "total", "wic", "mean", True, threshold=args.threshold, include_bg=True)
        clusters = tree.getResult()

    ids = []
    mc = MotifComparer()

    sys.stderr.write("Creating images\n")
    for cluster_motif, members in clusters:
        cluster_motif.trim(trim_ic)
        cluster_motif.to_img(os.path.join(outdir, "%s.png" % cluster_motif.id), format="PNG")
        ids.append([cluster_motif.id, {"src": "%s.png" % cluster_motif.id}, []])
        if len(members) > 1:
            scores = {}
            for motif in members:
                scores[motif] = mc.compare_motifs(cluster_motif, motif, "total", "wic", "mean", pval=True)
            # Smallest alignment position of all members; every member is
            # shifted relative to it. min() replaces a cmp-based sort that
            # only existed on Python 2.
            add_pos = min(result[1] for result in scores.values())
            for motif in members:
                _score, pos, strand = scores[motif]
                add = pos - add_pos

                if strand not in [1, "+"]:
                    # Draw the reverse complement, but keep the original id
                    rc = motif.rc()
                    rc.id = motif.id
                    motif = rc
                motif.to_img(os.path.join(outdir, "%s.png" % motif.id.replace(" ", "_")), format="PNG", add_left=add)
        ids[-1][2] = [dict([("src", "%s.png" % motif.id.replace(" ", "_")), ("alt", motif.id.replace(" ", "_"))]) for motif in members]

    config = MotifConfig()
    env = jinja2.Environment(loader=jinja2.FileSystemLoader([config.get_template_dir()]))
    template = env.get_template("cluster_template.jinja.html")
    result = template.render(motifs=ids)

    # Binary mode: the report is encoded explicitly, and a text-mode file
    # object rejects bytes on Python 3.
    with open(os.path.join(outdir, "cluster_report.html"), "wb") as f:
        f.write(result.encode('utf-8'))

    with open(os.path.join(outdir, "cluster_key.txt"), "w") as f:
        for cluster_id, _img, member_imgs in ids:
            f.write("%s\t%s\n" % (cluster_id, ",".join([x["alt"] for x in member_imgs])))

    with open(os.path.join(outdir, "clustered_motifs.pwm"), "w") as f:
        if len(clusters) == 1 and len(clusters[0][1]) == 1:
            f.write("%s\n" % clusters[0][0].to_pwm())
        else:
            for motif in tree.get_clustered_motifs():
                f.write("%s\n" % motif.to_pwm())
示例#37
0
def command_scan(inputfile, pwmfile, nreport=1, cutoff=0.9, bed=False, 
        scan_rc=True, table=False, score_table=False, moods=False, 
        pvalue=None, bgfile=None, genome=None):
    """
    Scan `inputfile` with the motifs from `pwmfile` and yield output lines.

    This is a generator with three output modes:

    * table       -- a header line, then per sequence one line with the
                     count for every motif
    * score_table -- a header line, then per sequence one line with the
                     best score for every motif
    * otherwise   -- one format_line() result per individual match
    """
    motifs = pwmfile_to_motifs(pwmfile)

    if genome is None:
        index_dir = None
    else:
        index_dir = os.path.join(MotifConfig().get_index_dir(), genome)

    scanner = Scanner()
    scanner.set_motifs(pwmfile)

    fa = as_fasta(inputfile, index_dir)

    # NOTE(review): this iterator is requested in every mode although only
    # the per-match mode at the bottom consumes it; the table modes obtain
    # their own iterators.
    if moods:
        match_it = scan_it_moods(inputfile, motifs, cutoff, bgfile, nreport, scan_rc, pvalue, table)
    else:
        match_it = scanner.scan(fa, nreport, scan_rc, cutoff)

    if table:
        yield "\t{}".format("\t".join([m.id for m in motifs]))

        if moods:
            count_it = scan_it_moods(inputfile, motifs, cutoff, bgfile,  nreport, scan_rc, pvalue, table)
            for seq_id, counts in count_it:
                yield "{}\t{}".format(seq_id, "\t".join(map(str, counts)))
        else:
            for i, counts in enumerate(scanner.count(fa, nreport, scan_rc, cutoff)):
                yield "{}\t{}".format(fa.ids[i], "\t".join(map(str, counts)))

    elif score_table:
        score_it = scanner.best_score(fa, scan_rc)
        yield "\t{}".format("\t".join([m.id for m in motifs]))
        for i, scores in enumerate(score_it):
            yield "{}\t{}".format(fa.ids[i], "\t".join(map(str, scores)))

    elif moods:
        # MOODS results: (motif, {sequence id: [(pos, score, strand), ...]})
        for motif, per_seq in match_it:
            for seq_id, matches in per_seq.items():
                for pos, score, strand in matches:
                    yield format_line(fa, seq_id, motif,
                            score, pos, strand, bed=bed)

    else:
        # Scanner results: per sequence one list of matches for every
        # motif, each match being (score, pos, strand)
        for i, result in enumerate(match_it):
            seq_id = fa.ids[i]
            for motif, matches in zip(motifs, result):
                for (score, pos, strand) in matches:
                    yield format_line(fa, seq_id, motif,
                            score, pos, strand, bed=bed)
示例#38
0
# Script setup: copy the parsed command line options into module-level names.
tiny = options.tiny

pwmfile = options.pwmfile
fg_file = options.sample
bg_file = options.background
outputfile = options.output

# Make sure the output file name ends with ".png".
# NOTE(review): both branches are identical, so options.tiny has no effect
# on the file name here.
if options.tiny:
	if not outputfile.endswith(".png"):
		outputfile += ".png"
else:
	if not outputfile.endswith(".png"):
		outputfile += ".png"

# Motif lookup keyed by motif id
motifs = dict([(x.id, x) for x in pwmfile_to_motifs(pwmfile)])

# Restrict to the comma-separated ids from the command line, or use all motifs
ids = []
if options.ids:
	ids = options.ids.split(",")
else:
	ids = motifs.keys()
	
fg_jobs = {}
bg_jobs = {}

for id in ids:
	if motifs.has_key(id):
		bg_jobs[id] = job_server.submit(get_scores, (motifs[id],bg_file,))
		fg_jobs[id] = job_server.submit(get_scores, (motifs[id],fg_file,))
	else:
示例#39
0
def diff(args):
    """
    Compare motif frequencies between input sets and plot the result.

    Parameters
    ----------
    args : argparse object
        Command line arguments; ``inputfiles`` (comma-separated),
        ``bgfile``, ``outputfile``, ``pwmfile``, ``cutoff``, ``genome``,
        ``minenr`` and ``minfreq`` are read.

    A single input file whose name ends with "bed" is split on its fifth
    column; every group is converted to a FASTA file in a temporary
    directory, which requires ``genome``. Motif counts are then collected
    for the background and for every input file and passed to diff_plot().
    """
    infiles = args.inputfiles.split(",")
    bgfile = args.bgfile
    outfile = args.outputfile
    pwmfile = args.pwmfile
    cutoff = args.cutoff
    genome = args.genome
    minenr = float(args.minenr)
    minfreq = float(args.minfreq)

    tmpdir = mkdtemp()
    # try/finally: the temporary directory is removed on errors and on the
    # sys.exit() below as well, not only after a successful run.
    try:
        # Retrieve FASTA clusters from BED file
        if len(infiles) == 1 and infiles[0].endswith("bed"):
            if not args.genome:
                sys.stderr.write("Can't convert BED file without genome!\n")
                sys.exit(1)

            # Group the regions (first three columns) by the fifth column
            clusters = {}
            with open(infiles[0]) as bed:
                for line in bed:
                    vals = line.strip().split("\t")
                    clusters.setdefault(vals[4], []).append(vals[:3])

            infiles = []

            for cluster, regions in clusters.items():
                sys.stderr.write("Creating FASTA file for {0}\n".format(cluster))
                inbed = os.path.join(tmpdir, "{0}.bed".format(cluster))
                outfa = os.path.join(tmpdir, "{0}.fa".format(cluster))
                with open(inbed, "w") as f:
                    for vals in regions:
                        f.write("{0}\t{1}\t{2}\n".format(*vals))
                Genome(genome).track2fasta(inbed, outfa)
                infiles.append(outfa)

        pwms = dict([(m.id, m) for m in pwmfile_to_motifs(pwmfile)])
        # NOTE(review): the id order comes from the dict; confirm that it
        # lines up with the per-motif order of the scanner counts below.
        motifs = [m for m in pwms.keys()]
        names = [os.path.basename(os.path.splitext(fname)[0]) for fname in infiles]

        s = Scanner()
        s.set_motifs(pwmfile)
        s.set_threshold(threshold=cutoff)

        # Get background frequencies
        nbg = float(len(Fasta(bgfile).seqs))

        bgcounts = s.total_count(bgfile, nreport=1)
        # The 0.01 pseudocount keeps every frequency above zero
        bgfreq = [(c + 0.01) / nbg for c in bgcounts]

        # Get frequencies in input files
        freq = {}
        counts = {}
        for fname in infiles:
            mcounts = s.total_count(fname, nreport=1)
            n = float(len(Fasta(fname).seqs))
            counts[fname] = mcounts
            freq[fname] = [(c + 0.01) / n for c in mcounts]

        # One row per motif, one column per input file
        freq = np.array([freq[fname] for fname in infiles]).transpose()
        counts = np.array([counts[fname] for fname in infiles]).transpose()

        diff_plot(motifs,
                  pwms,
                  names,
                  freq,
                  counts,
                  bgfreq,
                  bgcounts,
                  outfile,
                  minenr=minenr,
                  minfreq=minfreq)
    finally:
        shutil.rmtree(tmpdir)
示例#40
0
                  dest="fpr",
                  help="Desired fpr",
                  type="float",
                  metavar="FLOAT")

(options, args) = parser.parse_args()

# All three options are required.
# NOTE(review): an fpr of 0 is falsy and therefore also ends up here.
if not options.pwmfile or not options.inputfile or not options.fpr:
    parser.print_help()
    # sys.exit() instead of exit(): the latter is only injected by the
    # site module and is not guaranteed to exist.
    sys.exit()

if options.fpr < 0 or options.fpr > 1:
    print("Please specify a FPR between 0 and 1")
    sys.exit()

f = Fasta(options.inputfile)
motifs = pwmfile_to_motifs(options.pwmfile)

# print(...) with a single argument behaves the same on Python 2 and 3
print("Motif\tScore\tCutoff")
for motif in motifs:
    pwm = motif.pwm
    min_score = motif.pwm_min_score()
    # First field of the first pwmscan result of every sequence
    scores = []
    for name, seq in f.items():
        result = pwmscan(seq.upper(), pwm, min_score, 1, True)
        scores.append(result[0][0])
    opt_score = scoreatpercentile(scores, 100 - (100 * options.fpr))
    # Rescale the score between the motif's minimum and maximum PWM score
    cutoff = (opt_score - min_score) / (motif.pwm_max_score() - min_score)
    print("%s\t%s\t%s" % (motif.id, opt_score, cutoff))
示例#41
0
def maxenr(args):
    """
    Print, per motif, the score threshold with the highest enrichment.

    Parameters
    ----------
    args : argparse object
        Command line arguments; ``sample``, ``background``, ``pwmfile``
        and ``ids`` (comma-separated, optional) are read.

    Scores for the sample and the background are computed by get_scores
    through the module-level ``pool``. For every motif one tab-separated
    line is printed: id, number of sample scores at or above the chosen
    score, the maximum enrichment, the chosen score, and that score
    rescaled between the motif's minimum and maximum PWM score.

    Exits with status 1 when the sample or background file does not
    exist, and via a bare sys.exit() when an unknown motif id is requested.
    """
    if not os.path.exists(args.sample):
        print("File %s does not exist!" % args.sample)
        sys.exit(1)

    if not os.path.exists(args.background):
        print("File %s does not exist!" % args.background)
        sys.exit(1)

    pwmfile = args.pwmfile
    fg_file = args.sample
    bg_file = args.background

    motifs = dict([(x.id, x) for x in pwmfile_to_motifs(pwmfile)])

    if args.ids:
        ids = args.ids.split(",")
    else:
        ids = motifs.keys()

    fg_jobs = {}
    bg_jobs = {}

    for motif_id in ids:
        # "in" replaces dict.has_key(), which no longer exists on Python 3
        if motif_id in motifs:
            bg_jobs[motif_id] = pool.apply_async(get_scores, (
                motifs[motif_id],
                bg_file,
            ))
            fg_jobs[motif_id] = pool.apply_async(get_scores, (
                motifs[motif_id],
                fg_file,
            ))
        else:
            print("Wrong id: %s" % motif_id)
            sys.exit()

    print("Motif\t# matches\tMax. enrichment\tScore\tCutoff")

    for motif_id in ids:
        pos = array(fg_jobs[motif_id].get())
        neg = array(bg_jobs[motif_id].get())
        # Corrects the enrichment for the difference in set size
        factor = len(neg) / float(len(pos))

        # Only scores reached by more than one background value are
        # candidates, so the enrichment denominator is never zero.
        scores = array([s for s in hstack((pos, neg)) if sum(neg >= s) > 1])
        enr = array([(sum(pos >= x) / float(sum(neg >= x))) * factor
                     for x in scores])

        max_score = scores[enr.argmax()]
        min_pwm_score = motifs[motif_id].pwm_min_score()
        cutoff = (max_score - min_pwm_score) / (
            motifs[motif_id].pwm_max_score() - min_pwm_score)

        print("%s\t%s\t%0.2f\t%0.2f\t%0.3f" % (
            motif_id, sum(pos >= max_score), max(enr),
            max_score, cutoff))
示例#42
0

def get_scores(motif, file):
    """Return the highest score of `motif` for every sequence in `file`.

    `file` is opened as a FASTA file and scanned with cutoff 0.0 and one
    reported hit per sequence; the result is a list with one value per
    scanned sequence.
    """
    # Imported locally, as in the original, so the function is
    # self-contained when it is shipped to a worker.
    from gimmemotifs.fasta import Fasta
    per_sequence = motif.pwm_scan_score(Fasta(file), cutoff=0.0, nreport=1)
    best = []
    for seq_scores in per_sequence.values():
        best.append(sorted(seq_scores)[-1])
    return best


# Job server used to run the get_scores jobs.
# NOTE(review): the secret is hard-coded in the source — confirm this is acceptable.
job_server = pp.Server(secret="pumpkinrisotto")

# Copy the parsed command line options into module-level names
pwmfile = options.pwmfile
fg_file = options.sample
bg_file = options.background

# Motif lookup keyed by motif id
motifs = dict([(x.id, x) for x in pwmfile_to_motifs(pwmfile)])

# Restrict to the comma-separated ids from the command line, or use all motifs
ids = []
if options.ids:
    ids = options.ids.split(",")
else:
    ids = motifs.keys()

# Submitted jobs per motif id, for the sample (fg) and the background (bg)
fg_jobs = {}
bg_jobs = {}

for id in ids:
    if motifs.has_key(id):
        bg_jobs[id] = job_server.submit(get_scores, (
            motifs[id],
            bg_file,