def location(args):
    """
    Queue a positional histogram job for every selected motif.

    The FASTA file, PWM file, width, optional comma-separated id list and
    cutoff are all read from ``args``. When ``args.width`` is not set, the
    length of the first sequence in the FASTA file is used instead. Jobs
    are submitted to ``pool``, which is not created here and has to exist
    outside this function; the call blocks until every job has finished.
    """
    fastafile = args.fastafile
    pwmfile = args.pwmfile

    lwidth = args.width
    if not lwidth:
        # No explicit width: use the length of the first sequence.
        lwidth = len(Fasta(fastafile).items()[0][1])

    motifs = pwmfile_to_motifs(pwmfile)
    if args.ids:
        ids = args.ids.split(",")
    else:
        ids = [motif.id for motif in motifs]

    jobs = []
    for motif in motifs:
        if motif.id not in ids:
            continue
        outfile = os.path.join("%s_histogram" % motif.id)
        task_args = (fastafile, motif, lwidth, outfile, args.cutoff)
        jobs.append(pool.apply_async(motif_localization, task_args))

    # Wait for every submitted job; get() re-raises worker exceptions.
    for job in jobs:
        job.get()
def location(args):
    """
    Create a histogram of motif locations.

    Parameters
    ----------
    args : argparse object
        Command line arguments. The attributes read here are
        ``fastafile``, ``pwmfile``, ``width``, ``ids`` and ``cutoff``.
    """
    fastafile, pwmfile = args.fastafile, args.pwmfile

    lwidth = args.width
    if not lwidth:
        # Width not given: take the length of the first FASTA sequence.
        seqs = Fasta(fastafile)
        lwidth = len(seqs.items()[0][1])
        seqs = None

    motifs = pwmfile_to_motifs(pwmfile)
    wanted = args.ids.split(",") if args.ids else [m.id for m in motifs]

    # ``pool`` is not defined in this function; it must exist at an outer
    # scope. All jobs are submitted before any result is collected.
    jobs = [
        pool.apply_async(
            motif_localization,
            (fastafile, motif, lwidth,
             os.path.join("%s_histogram" % motif.id), args.cutoff))
        for motif in motifs
        if motif.id in wanted
    ]

    for job in jobs:
        job.get()
class TestMotifPwm(unittest.TestCase): """ A test class to test Motif pwmscan functionality and related things """ def setUp(self): self.data_dir = "test/data/pwmscan" self.motif = pwmfile_to_motifs(os.path.join(self.data_dir, "TATA.pwm"))[0] self.prom = Fasta(os.path.join(self.data_dir, "promoters.fa")) self.prom_gff = os.path.join(self.data_dir, "promoters_result.gff") self.random = Fasta(os.path.join(self.data_dir, "random_sequences.fa")) self.random_gff = os.path.join(self.data_dir, "random_result.gff") self.enrichment = os.path.join(self.data_dir, "enrichment.txt") self.tmp = NamedTemporaryFile().name def test1_pwm_scan(self): """ Scan a FASTA file with PWM of motif """ result = self.motif.pwm_scan(self.prom, nreport=1) # Every sequence should have a TATA match self.assertEquals(len(result.keys()), len(self.prom.items())) def test2_pwm_scan_to_gff(self): """ Scan a FASTA file with PWM of motif, and produce GFF """ self.motif.pwm_scan_to_gff(self.prom, self.tmp) self.assertEquals(open(self.prom_gff).read(), open(self.tmp).read()) def test3_gff_enrichment(self): """ Test gff_enrichment """ self.motif.pwm_scan_to_gff(self.random, self.random_gff) gff_enrichment(self.prom_gff, self.random_gff, 316, 3160, self.tmp) self.assertEquals(open(self.enrichment).read(), open(self.tmp).read()) def tearDown(self): pass
def _run_program(self, bin, fastafile, savedir, params=None):
    """
    Run the external program on a re-headered copy of fastafile.

    Every sequence is copied to "munk_in.fa" in self.tmpdir with its
    header replaced by a space-separated list of per-position values that
    rise from 0 to the middle of the sequence and fall back towards 1.
    The program is started from self.dir() through the shell and its
    redirected output file is handed to self.parse().

    Parameters
    ----------
    bin : str
        Executable to run.
    fastafile : str
        Input FASTA file.
    savedir : str
        Not used by this method.
    params : dict
        Must contain "width"; the default of None is not usable here.

    Returns
    -------
    tuple
        (motifs, stdout, stderr); motifs is an empty list when the
        program produced no output file.
    """
    fastafile = os.path.abspath(fastafile)

    new_file = os.path.join(self.tmpdir, "munk_in.fa")
    with open(new_file, "w") as out:
        for name, seq in Fasta(fastafile).items():
            # Floor division and explicit lists keep this working on
            # both Python 2 and Python 3.
            half = len(seq) // 2
            weights = list(range(half)) + list(range(half, 0, -1))
            header = " ".join(["%0.1f" % x for x in weights])
            out.write(">%s\n" % header)
            out.write("%s\n" % seq)

    fastafile = new_file
    outfile = fastafile + ".out"

    # NOTE(review): the command goes through the shell with unquoted
    # paths; file names with spaces or shell metacharacters will break it.
    cmd = "%s %s %s yes 1.0 p:%s > %s" % (
        bin, params["width"], params["width"], fastafile, outfile)

    current_path = os.getcwd()
    os.chdir(self.dir())
    try:
        p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
        stdout, stderr = p.communicate()

        motifs = []
        if os.path.exists(outfile):
            with open(outfile) as handle:
                motifs = self.parse(handle)
    finally:
        # Always restore the caller's working directory.
        os.chdir(current_path)

    return motifs, stdout, stderr
class TestMotifPwm(unittest.TestCase): """ A test class to test Motif pwmscan functionality and related things """ def setUp(self): self.data_dir = "test/data/pwmscan" self.motif = pwmfile_to_motifs(os.path.join(self.data_dir, "TATA.pwm"))[0] self.prom = Fasta(os.path.join(self.data_dir, "promoters.fa")) self.prom_gff = os.path.join(self.data_dir, "promoters_result.gff") self.random = Fasta(os.path.join(self.data_dir, "random_sequences.fa")) self.random_gff = os.path.join(self.data_dir, "random_result.gff") self.enrichment = os.path.join(self.data_dir, "enrichment.txt") self.tmp = NamedTemporaryFile().name def test1_pwm_scan(self): """ Scan a FASTA file with PWM of motif """ result = self.motif.pwm_scan(self.prom, nreport=1) # Every sequence should have a TATA match self.assertEquals(len(result.keys()), len(self.prom.items())) def test2_pwm_scan_to_gff(self): """ Scan a FASTA file with PWM of motif, and produce GFF """ self.motif.pwm_scan_to_gff(self.prom, self.tmp) self.assertEquals(open(self.prom_gff).read(), open(self.tmp).read()) def test3_gff_enrichment(self): """ Test gff_enrichment """ self.motif.pwm_scan_to_gff(self.random, self.random_gff) gff_enrichment(self.prom_gff, self.random_gff, 316, 3160, self.tmp) self.assertEquals(open(self.enrichment).read(), open(self.tmp).read()) def tearDown(self): pass
def download_genome(genomebuild, genome_dir):
    """
    Download a genome archive and unpack it into genome_dir.

    Each URL template in UCSC_GENOME_URLS is tried in order until a
    download passes check_genome_file(). The archive is unpacked with tar,
    unzip or gunzip depending on its file name. If the directory then
    contains exactly one ".fa" file, it is split into one FASTA file per
    sequence.

    Parameters
    ----------
    genomebuild : str
        Build name substituted into each URL template.
    genome_dir : str
        Directory that receives the download and the unpacked files.

    Exits the process with status 1 when no download succeeds.
    """
    # download genome based on URL + genomebuild
    sys.stderr.write("Downloading {} genome\n".format(genomebuild))

    genome_fa = None
    for genome_url in UCSC_GENOME_URLS:
        remote = genome_url.format(genomebuild)
        genome_fa = os.path.join(genome_dir, os.path.split(remote)[-1])
        sys.stderr.write("Trying to download {}\n".format(remote))
        try:
            urlretrieve(remote, genome_fa)
            if check_genome_file(genome_fa):
                break
            os.unlink(genome_fa)
        except Exception as err:
            # Best effort: report the problem and try the next URL.
            # Unlike a bare except this lets Ctrl-C and sys.exit through.
            sys.stderr.write("Download failed: {}\n".format(err))

    # genome_fa is still None when the URL list is empty.
    if genome_fa is None or not check_genome_file(genome_fa):
        sys.stderr.write("Failed to download genome\n")
        sys.exit(1)

    sys.stderr.write("Unpacking\n")
    # The command runs with cwd=genome_dir, so the archive is addressed by
    # its base name and tar gets an absolute target directory.
    archive = os.path.basename(genome_fa)
    if archive.endswith("tar.gz"):
        cmd = "tar -C {0} -xvzf {1} && rm {1}".format(
            os.path.abspath(genome_dir), archive)
    elif archive.endswith(".zip"):
        cmd = "unzip {0}".format(archive)
    else:
        cmd = "gunzip {0}".format(archive)
    sp.call(cmd, shell=True, cwd=genome_dir)

    fa_files = glob("{}/*.fa".format(genome_dir))
    if len(fa_files) == 1:
        source = fa_files[0]
        written = set()
        # Assumes Fasta has read all sequences before the loop starts
        # writing - TODO confirm.
        for n, s in Fasta(source).items():
            target = "{}/{}.fa".format(genome_dir, n)
            with open(target, "w") as out:
                out.write(">{}\n{}\n".format(n, s))
            written.add(os.path.abspath(target))
        # Do not delete the source when a sequence file was just written
        # to that very path (single sequence named like the file).
        if os.path.abspath(source) not in written:
            os.unlink(source)

    # Remove the archive in case the unpack command left it behind.
    if os.path.exists(genome_fa):
        os.unlink(genome_fa)
class TestMotifPwm(unittest.TestCase): """ A test class to test Motif pwmscan functionality and related things """ def setUp(self): self.data_dir = "test/data/pwmscan" self.motif = read_motifs(open(os.path.join(self.data_dir, "TATA.pwm")), fmt="pwm")[0] self.prom = Fasta(os.path.join(self.data_dir, "promoters.fa")) self.prom_gff = os.path.join(self.data_dir, "promoters_result.gff") self.random = Fasta(os.path.join(self.data_dir, "random_sequences.fa")) self.random_gff = os.path.join(self.data_dir, "random_result.gff") self.enrichment = os.path.join(self.data_dir, "enrichment.txt") self.tmp = NamedTemporaryFile().name def test1_pwm_scan(self): """ Scan a FASTA file with PWM of motif """ result = self.motif.pwm_scan(self.prom, nreport=1) # Every sequence should have a TATA match self.assertEquals(len(result.keys()), len(self.prom.items())) def test2_pwm_scan_to_gff(self): """ Scan a FASTA file with PWM of motif, and produce GFF """ self.motif.pwm_scan_to_gff(self.prom, self.tmp) for line in open(self.tmp): vals = line.strip().split("\t") self.assertEquals(9, len(vals)) self.assertTrue(int(vals[3]) > 0) self.assertTrue(int(vals[4]) > 0) self.assertTrue(float(vals[5]) > 5.25) self.assertTrue(float(vals[5]) < 9.06) self.assertIn(vals[6], ["+", "-"]) def test3_gff_enrichment(self): """ Test gff_enrichment """ self.motif.pwm_scan_to_gff(self.random, self.random_gff) gff_enrichment(self.prom_gff, self.random_gff, 316, 3160, self.tmp) f = open(self.tmp) f.readline() # Header vals = f.readline().strip().split("\t") self.assertEquals(vals[0], "TATA-box") self.assertLess(float(vals[2]), 1e-60) self.assertGreater(float(vals[5]), 1.5) def tearDown(self): pass
class TestMotifPwm(unittest.TestCase): """ A test class to test Motif pwmscan functionality and related things """ def setUp(self): self.data_dir = "test/data/pwmscan" self.motif = pwmfile_to_motifs(os.path.join(self.data_dir, "TATA.pwm"))[0] self.prom = Fasta(os.path.join(self.data_dir, "promoters.fa")) self.prom_gff = os.path.join(self.data_dir, "promoters_result.gff") self.random = Fasta(os.path.join(self.data_dir, "random_sequences.fa")) self.random_gff = os.path.join(self.data_dir, "random_result.gff") self.enrichment = os.path.join(self.data_dir, "enrichment.txt") self.tmp = NamedTemporaryFile().name def test1_pwm_scan(self): """ Scan a FASTA file with PWM of motif """ result = self.motif.pwm_scan(self.prom, nreport=1) # Every sequence should have a TATA match self.assertEquals(len(result.keys()), len(self.prom.items())) def test2_pwm_scan_to_gff(self): """ Scan a FASTA file with PWM of motif, and produce GFF """ self.motif.pwm_scan_to_gff(self.prom, self.tmp) for line in open(self.tmp): vals = line.strip().split("\t") self.assertEquals(9, len(vals)) self.assertTrue(int(vals[3]) > 0) self.assertTrue(int(vals[4]) > 0) self.assertTrue(float(vals[5]) > 5.25) self.assertTrue(float(vals[5]) < 9.06) self.assertIn(vals[6], ["+", "-"]) def test3_gff_enrichment(self): """ Test gff_enrichment """ self.motif.pwm_scan_to_gff(self.random, self.random_gff) gff_enrichment(self.prom_gff, self.random_gff, 316, 3160, self.tmp) f = open(self.tmp) f.readline() # Header vals = f.readline().strip().split("\t") self.assertEquals(vals[0], "TATA-box") self.assertLess(float(vals[2]), 1e-60) self.assertGreater(float(vals[5]), 1.5) def tearDown(self): pass
def divide_fa_file(fname, sample, rest, fraction, abs_max):
    """
    Split a FASTA file at random into a sample file and a rest file.

    Parameters
    ----------
    fname : str
        Input FASTA file.
    sample : str
        Output file for the randomly selected sequences.
    rest : str
        Output file for all remaining sequences.
    fraction : float
        Fraction of the sequences to put into the sample.
    abs_max : int
        Upper limit on the number of sampled sequences.

    Returns
    -------
    tuple
        (number of sampled sequences, number of remaining sequences).
    """
    fa = Fasta(fname)
    ids = fa.ids[:]

    x = int(fraction * len(ids))
    if x > abs_max:
        x = abs_max

    # A set gives constant-time membership tests in the loop below.
    sample_seqs = set(random.sample(ids, x))

    # Both files are closed even when writing fails half-way.
    with open(sample, "w") as f_sample, open(rest, "w") as f_rest:
        for name, seq in fa.items():
            target = f_sample if name in sample_seqs else f_rest
            target.write(">%s\n%s\n" % (name, seq))

    return x, len(ids[x:])
def divide_fa_file(fname, sample, rest, fraction, abs_max):
    """
    Randomly divide the sequences of a FASTA file over two output files.

    int(fraction * number of sequences) sequences, capped at abs_max, are
    drawn without replacement and written to ``sample``; every other
    sequence is written to ``rest``.

    Returns a tuple of (sample size, number of remaining sequences).
    """
    fa = Fasta(fname)
    ids = fa.ids[:]

    x = int(fraction * len(ids))
    if x > abs_max:
        x = abs_max

    # Set lookup instead of scanning a list for every sequence.
    sample_seqs = set(random.sample(ids, x))

    # The with-block closes both files on success and on error.
    with open(sample, "w") as f_sample, open(rest, "w") as f_rest:
        for name, seq in fa.items():
            record = ">%s\n%s\n" % (name, seq)
            if name in sample_seqs:
                f_sample.write(record)
            else:
                f_rest.write(record)

    return x, len(ids[x:])
def location(args):
    """
    Creates histogram of motif location.

    When ``args.width`` is not set, the length of the first sequence in
    the FASTA file is used as width. One ``motif_localization`` job per
    selected motif is run in a process pool; the function returns once
    all jobs have finished.

    Parameters
    ----------
    args : argparse object
        Command line arguments. The attributes read here are
        ``fastafile``, ``pwmfile``, ``width``, ``ids`` and ``cutoff``.
    """
    fastafile = args.fastafile
    pwmfile = args.pwmfile

    lwidth = args.width
    if not lwidth:
        f = Fasta(fastafile)
        lwidth = len(f.items()[0][1])
        f = None

    motifs = pwmfile_to_motifs(pwmfile)
    ids = [motif.id for motif in motifs]
    if args.ids:
        ids = args.ids.split(",")

    n_cpus = int(MotifConfig().get_default_params()["ncpus"])
    pool = Pool(processes=n_cpus, maxtasksperchild=1000)
    try:
        jobs = []
        for motif in motifs:
            if motif.id in ids:
                outfile = os.path.join("%s_histogram" % motif.id)
                jobs.append(
                    pool.apply_async(
                        motif_localization,
                        (fastafile, motif, lwidth, outfile, args.cutoff)))

        for job in jobs:
            job.get()
    finally:
        # Shut the worker processes down instead of leaving the pool
        # to be cleaned up at interpreter exit.
        pool.close()
        pool.join()
def _run_program(self, bin, fastafile, savedir, params=None):
    """
    Run the external program on a re-headered copy of fastafile.

    Every sequence is copied to "munk_in.fa" in self.tmpdir with its
    header replaced by a space-separated list of per-position values that
    rise from 0 to the middle of the sequence and fall back towards 1.
    The program is started from self.dir() through the shell and its
    redirected output file is handed to self.parse().

    Parameters
    ----------
    bin : str
        Executable to run.
    fastafile : str
        Input FASTA file.
    savedir : str
        Not used by this method.
    params : dict
        Must contain "width"; the default of None is not usable here.

    Returns
    -------
    tuple
        (motifs, stdout, stderr); motifs is an empty list when the
        program produced no output file.
    """
    fastafile = os.path.abspath(fastafile)

    new_file = os.path.join(self.tmpdir, "munk_in.fa")
    with open(new_file, "w") as out:
        for name, seq in Fasta(fastafile).items():
            # Floor division and explicit lists keep this working on
            # both Python 2 and Python 3.
            half = len(seq) // 2
            weights = list(range(half)) + list(range(half, 0, -1))
            header = " ".join(["%0.1f" % x for x in weights])
            out.write(">%s\n" % header)
            out.write("%s\n" % seq)

    fastafile = new_file
    outfile = fastafile + ".out"

    # NOTE(review): the command goes through the shell with unquoted
    # paths; file names with spaces or shell metacharacters will break it.
    cmd = "%s %s %s yes 1.0 p:%s > %s" % (
        bin, params["width"], params["width"], fastafile, outfile)

    current_path = os.getcwd()
    os.chdir(self.dir())
    try:
        p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
        stdout, stderr = p.communicate()

        motifs = []
        if os.path.exists(outfile):
            with open(outfile) as handle:
                motifs = self.parse(handle)
    finally:
        # Always restore the caller's working directory.
        os.chdir(current_path)

    return motifs, stdout, stderr
def location(args):
    """
    Creates histogram of motif location.

    When ``args.size`` is not set, the length of the first sequence in
    the FASTA file is used as size. One ``motif_localization`` job per
    selected motif is run in a process pool; the function returns once
    all jobs have finished.

    Parameters
    ----------
    args : argparse object
        Command line arguments. The attributes read here are
        ``fastafile``, ``pfmfile``, ``size``, ``ids`` and ``cutoff``.
    """
    fastafile = args.fastafile
    pfmfile = args.pfmfile

    lsize = args.size
    if not lsize:
        f = Fasta(fastafile)
        lsize = len(f.items()[0][1])
        f = None

    motifs = read_motifs(pfmfile)
    ids = [motif.id for motif in motifs]
    if args.ids:
        ids = args.ids.split(",")

    n_cpus = int(MotifConfig().get_default_params()["ncpus"])
    pool = Pool(processes=n_cpus, maxtasksperchild=1000)
    try:
        jobs = []
        for motif in motifs:
            if motif.id in ids:
                outfile = os.path.join("%s_histogram" % motif.id)
                jobs.append(
                    pool.apply_async(
                        motif_localization,
                        (fastafile, motif, lsize, outfile, args.cutoff)))

        for job in jobs:
            job.get()
    finally:
        # Shut the worker processes down instead of leaving the pool
        # to be cleaned up at interpreter exit.
        pool.close()
        pool.join()
def nmer_predict(fastafile):
    """
    Predict motifs from n-mers that are enriched around the sequence centre.

    Steps, as implemented below:
      1. Record the start positions of all 6-, 8-, 10- and 12-mers.
      2. Keep n-mers that occur often enough and whose position histogram
         has more counts in bins 3-5 than in the flanking bins, and write
         each of them out as a one-hot count matrix.
      3. Cluster the matrices, refine the clusters by scanning the input
         with "pwmscan.py", cluster again and refine once more.

    Returns a tuple (motifs, "", "").
    """
    from tempfile import NamedTemporaryFile, mkdtemp
    from gimmemotifs.fasta import Fasta
    from numpy import sum, histogram
    from subprocess import Popen, PIPE
    from gimmemotifs.motif import Motif, motif_from_align
    from gimmemotifs.cluster import cluster_motifs
    from string import maketrans

    def rc(seq):
        """Return the reverse complement of seq (upper-case ACGT only)."""
        table = maketrans("ATCG", "TAGC")
        return seq[::-1].translate(table)

    fasta = Fasta(fastafile)
    positions = {}
    # n-mer length -> factor applied to the flanking histogram counts
    factors = {6: 4, 8: 3, 10: 2, 12: 1}
    pfm_file = NamedTemporaryFile()
    abs_cutoff = len(fasta.items()) / 100.0 * 2

    for size in factors:
        for _, seq in fasta.items():
            for start in range(len(seq) - size):
                kmer = seq[start:start + size].upper()
                positions.setdefault(kmer, []).append(start)

    for kmer, hits in positions.items():
        if len(hits) <= abs_cutoff:
            continue
        hist = histogram(hits, bins=9, range=(0, 200))[0]
        factor = factors[len(kmer)]
        centre = sum(hist[3:6])
        if not (centre > sum(hist[0:3] * factor)
                and centre > sum(hist[7:]) * factor):
            continue
        pfm_file.write(">%s\n" % kmer)
        for char in kmer:
            # One matrix row per position: all counts on the observed base.
            row = [len(hits) if base == char else 0
                   for base in ["A", "C", "G", "T"]]
            pfm_file.write("\t".join([str(v) for v in row]) + "\n")

    pfm_file.flush()

    tree = cluster_motifs(pfm_file.name, "subtotal", "ed", "mean", False,
                          threshold=-0.1, include_bg=False)
    clusters = tree.getResult()

    def refine_by_scanning(motifs, fastafile):
        """Scan fastafile with motifs and rebuild them from their hits."""
        tmp_gff = NamedTemporaryFile()
        file_in = NamedTemporaryFile()
        for m in motifs:
            file_in.write("%s\n" % m.to_pfm())
        file_in.flush()

        cmd = "pwmscan.py -i %s -p %s -c 0.8 > %s" % (
            fastafile, file_in.name, tmp_gff.name)
        proc = Popen(cmd, shell=True)
        stdout, stderr = proc.communicate()

        aligns = {}
        for line in open(tmp_gff.name):
            vals = line.strip().split("\t")
            motif, instance = [
                field.split(" ")[1].replace('"', "")
                for field in vals[8].split(" ; ")
            ]
            site = instance.upper()
            if vals[6] != "+":
                # Minus-strand hits are stored as reverse complement.
                site = rc(site)
            aligns.setdefault(motif, []).append(site)

        # Unused scratch file, kept so the side effects stay the same.
        tmp_out = NamedTemporaryFile()

        # Only motifs with more than 10 aligned instances are rebuilt.
        return [motif_from_align(align)
                for align in aligns.values() if len(align) > 10]

    motifs = refine_by_scanning([x[0] for x in clusters], fastafile)

    refined_file = NamedTemporaryFile()
    for m in motifs:
        refined_file.write("%s\n" % m.to_pfm())
    refined_file.flush()

    motifs = []
    tree = cluster_motifs(refined_file.name, "total", "wic", "mean", True,
                          threshold=0.95, include_bg=True)
    clusters = tree.getResult()
    for i, (cluster, members) in enumerate(clusters):
        cluster.id = "Nmer_%s" % (i + 1)
        motifs.append(cluster)

    refined_motifs = refine_by_scanning(motifs, fastafile)
    for i, m in enumerate(refined_motifs):
        m.id = "WannaMotif_%s" % (i + 1)

    return refined_motifs, "", ""
sys.exit(0) inputfile = options.inputfile if options.nreport: nreport = int(options.nreport) cutoff = float(options.cutoff) motifs = pwmfile_to_motifs(options.pwmfile) bed = options.bed f = Fasta(inputfile) strandmap = {-1:"-",1:"+"} for (id,seq) in f.items(): for motif in motifs: pwm = motif.pwm c = motif.pwm_min_score() + (motif.pwm_max_score() - motif.pwm_min_score()) * cutoff result = pwmscan(seq.upper(), pwm, c, nreport, options.scan_rc) for (score, pos, strand) in result: if bed: first = id.split(" ")[0] (chr,loc) = first.split(":") if loc: (start, end) = map(int, loc.split("-")) print "%s\t%s\t%s\t%s" % (chr, start + pos, start + pos + len(pwm) , score) else: print "%s\t%s\t%s\t%s" % (id, pos, pos + len(pwm), score) else: print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tmotif_name \"%s\" ; motif_instance \"%s\"" % (
dest="fpr", help="Desired fpr", type="float", metavar="FLOAT") (options, args) = parser.parse_args() if not options.pwmfile or not options.inputfile or not options.fpr: parser.print_help() exit() if options.fpr < 0 or options.fpr > 1: print "Please specify a FPR between 0 and 1" sys.exit() f = Fasta(options.inputfile) motifs = pwmfile_to_motifs(options.pwmfile) print "Motif\tScore\tCutoff" for motif in motifs: pwm = motif.pwm scores = [] min_score = motif.pwm_min_score() for name, seq in f.items(): result = pwmscan(seq.upper(), pwm, min_score, 1, True) score = result[0][0] scores.append(score) opt_score = scoreatpercentile(scores, 100 - (100 * options.fpr)) cutoff = (opt_score - min_score) / (motif.pwm_max_score() - min_score) print "%s\t%s\t%s" % (motif.id, opt_score, cutoff)
return motif.id, p else: return motif.id, 1.0 if not options.fastafile and not options.pwmfile: parser.print_help() sys.exit() fastafile = options.fastafile pwmfile = options.pwmfile lwidth = options.width if not lwidth: f = Fasta(fastafile) lwidth = len(f.items()[0][1]) f = None job_server = pp.Server(secret="pumpkinrisotto") jobs = [] motifs = pwmfile_to_motifs(pwmfile) ids = [motif.id for motif in motifs] if options.ids: ids = options.ids.split(",") for motif in motifs: if motif.id in ids: outfile = os.path.join("%s_histogram" % motif.id) jobs.append(job_server.submit(motif_localization, (fastafile, motif, lwidth, outfile, options.cutoff), (), ())) for job in jobs:
def nmer_predict(fastafile):
    """
    Predict motifs from n-mers that are enriched around the sequence centre.

    Steps, as implemented below:
      1. Record the start positions of all 6-, 8-, 10- and 12-mers.
      2. Keep n-mers that occur often enough and whose position histogram
         has more counts in bins 3-5 than in the flanking bins, and write
         each of them out as a one-hot count matrix.
      3. Cluster the matrices, refine the clusters by scanning the input
         with "pwmscan.py", cluster again and refine once more.

    Returns a tuple (motifs, "", "").
    """
    from tempfile import NamedTemporaryFile, mkdtemp
    from gimmemotifs.fasta import Fasta
    from numpy import sum, histogram
    from subprocess import Popen, PIPE
    from gimmemotifs.motif import Motif, motif_from_align
    from gimmemotifs.cluster import cluster_motifs
    from string import maketrans

    def rc(seq):
        """Return the reverse complement of seq (upper-case ACGT only)."""
        table = maketrans("ATCG", "TAGC")
        return seq[::-1].translate(table)

    fasta = Fasta(fastafile)
    positions = {}
    # n-mer length -> factor applied to the flanking histogram counts
    factors = {6: 4, 8: 3, 10: 2, 12: 1}
    pfm_file = NamedTemporaryFile()
    abs_cutoff = len(fasta.items()) / 100.0 * 2

    for size in factors:
        for _, seq in fasta.items():
            for start in range(len(seq) - size):
                kmer = seq[start:start + size].upper()
                positions.setdefault(kmer, []).append(start)

    for kmer, hits in positions.items():
        if len(hits) <= abs_cutoff:
            continue
        hist = histogram(hits, bins=9, range=(0, 200))[0]
        factor = factors[len(kmer)]
        centre = sum(hist[3:6])
        if not (centre > sum(hist[0:3] * factor)
                and centre > sum(hist[7:]) * factor):
            continue
        pfm_file.write(">%s\n" % kmer)
        for char in kmer:
            # One matrix row per position: all counts on the observed base.
            row = [len(hits) if base == char else 0
                   for base in ["A", "C", "G", "T"]]
            pfm_file.write("\t".join([str(v) for v in row]) + "\n")

    pfm_file.flush()

    tree = cluster_motifs(pfm_file.name, "subtotal", "ed", "mean", False,
                          threshold=-0.1, include_bg=False)
    clusters = tree.getResult()

    def refine_by_scanning(motifs, fastafile):
        """Scan fastafile with motifs and rebuild them from their hits."""
        tmp_gff = NamedTemporaryFile()
        file_in = NamedTemporaryFile()
        for m in motifs:
            file_in.write("%s\n" % m.to_pfm())
        file_in.flush()

        cmd = "pwmscan.py -i %s -p %s -c 0.8 > %s" % (
            fastafile, file_in.name, tmp_gff.name)
        proc = Popen(cmd, shell=True)
        stdout, stderr = proc.communicate()

        aligns = {}
        for line in open(tmp_gff.name):
            vals = line.strip().split("\t")
            motif, instance = [
                field.split(" ")[1].replace('"', "")
                for field in vals[8].split(" ; ")
            ]
            site = instance.upper()
            if vals[6] != "+":
                # Minus-strand hits are stored as reverse complement.
                site = rc(site)
            aligns.setdefault(motif, []).append(site)

        # Unused scratch file, kept so the side effects stay the same.
        tmp_out = NamedTemporaryFile()

        # Only motifs with more than 10 aligned instances are rebuilt.
        return [motif_from_align(align)
                for align in aligns.values() if len(align) > 10]

    motifs = refine_by_scanning([x[0] for x in clusters], fastafile)

    refined_file = NamedTemporaryFile()
    for m in motifs:
        refined_file.write("%s\n" % m.to_pfm())
    refined_file.flush()

    motifs = []
    tree = cluster_motifs(refined_file.name, "total", "wic", "mean", True,
                          threshold=0.95, include_bg=True)
    clusters = tree.getResult()
    for i, (cluster, members) in enumerate(clusters):
        cluster.id = "Nmer_%s" % (i + 1)
        motifs.append(cluster)

    refined_motifs = refine_by_scanning(motifs, fastafile)
    for i, m in enumerate(refined_motifs):
        m.id = "WannaMotif_%s" % (i + 1)

    return refined_motifs, "", ""
def get_genome(genomebuild, fastadir, indexdir=None):
    """
    Download gene annotation and genome FASTA for a build, then index it.

    Parameters
    ----------
    genomebuild : str
        Build name substituted into the download URL templates.
    fastadir : str
        Directory in which a sub-directory named after the build is
        created for the FASTA files.
    indexdir : str, optional
        Base directory for the index; taken from the MotifConfig when
        not given.

    Exits the process with status 1 when "genePredToBed" is not on the
    path, when the genome directory cannot be created or when no valid
    genome file could be downloaded.
    """
    config = MotifConfig()
    if not indexdir:
        indexdir = config.get_index_dir()

    genome_dir = os.path.join(fastadir, genomebuild)
    index_dir = os.path.join(indexdir, genomebuild)

    pred_bin = "genePredToBed"
    pred = find_executable(pred_bin)
    if not pred:
        sys.stderr.write("{} not found in path!\n".format(pred_bin))
        sys.exit(1)

    # Check for rights to write to directory
    if not os.path.exists(genome_dir):
        try:
            os.mkdir(genome_dir)
        except OSError:
            sys.stderr.write(
                "Could not create genome dir {}\n".format(genome_dir))
            sys.exit(1)

    # Download gene file based on URL + genomebuild
    gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genomebuild)

    # Collect the annotation file names mentioned in the remote listing.
    anno = []
    p = re.compile(r'\w+.Gene.txt.gz')
    listing = urllib2.urlopen(UCSC_GENE_URL.format(genomebuild))
    try:
        for line in listing.readlines():
            m = p.search(line)
            if m:
                anno.append(m.group(0))
    finally:
        listing.close()

    sys.stderr.write(
        "Retrieving gene annotation for {}\n".format(genomebuild))
    url = ""
    # ANNOS is searched in order; the first available annotation wins.
    for a in ANNOS:
        if a in anno:
            url = UCSC_GENE_URL.format(genomebuild) + a
            break

    if url:
        # delete=False keeps the file for the shell pipeline; it is
        # removed explicitly once the conversion has run.
        tmp = NamedTemporaryFile(delete=False, suffix=".gz")
        tmp.close()
        try:
            urllib.urlretrieve(url, tmp.name)
            sp.call("zcat {} | cut -f2-11 | {} /dev/stdin {}".format(
                tmp.name, pred, gene_file), shell=True)
        finally:
            os.unlink(tmp.name)
    else:
        sys.stderr.write("No annotation found!")

    # download genome based on URL + genomebuild
    sys.stderr.write("Downloading {} genome\n".format(genomebuild))
    for genome_url in [UCSC_GENOME_URL, ALT_UCSC_GENOME_URL]:
        remote = genome_url.format(genomebuild)
        genome_fa = os.path.join(genome_dir, os.path.split(remote)[-1])
        sys.stderr.write("Trying to download {}\n".format(remote))
        urllib.urlretrieve(remote, genome_fa)
        if not check_genome_file(genome_fa):
            os.unlink(genome_fa)
            continue
        break

    if not check_genome_file(genome_fa):
        sys.stderr.write("Failed to download genome\n")
        sys.exit(1)

    sys.stderr.write("Unpacking\n")
    # The command runs with cwd=genome_dir, so relative paths would be
    # resolved against the wrong directory; use absolute ones.
    archive = os.path.abspath(genome_fa)
    if genome_fa.endswith("tar.gz"):
        cmd = "tar -C {0} -xvzf {1} && rm {1}".format(
            os.path.abspath(genome_dir), archive)
    else:
        cmd = "gunzip {0}".format(archive)
    sp.call(cmd, shell=True, cwd=genome_dir)

    fa_files = glob("{}/*.fa".format(genome_dir))
    if len(fa_files) == 1:
        source = fa_files[0]
        written = set()
        # Assumes Fasta has read all sequences before the loop starts
        # writing - TODO confirm.
        for n, s in Fasta(source).items():
            target = "{}/{}.fa".format(genome_dir, n)
            with open(target, "w") as out:
                out.write(">{}\n{}\n".format(n, s))
            written.add(os.path.abspath(target))
        # Do not delete the source when a sequence file was just written
        # to that very path (single sequence named like the file).
        if os.path.abspath(source) not in written:
            os.unlink(source)

    sys.stderr.write("Creating index\n")
    g = GenomeIndex()
    g = g.create_index(genome_dir, index_dir)
    create_bedtools_fa(index_dir, genome_dir)
# Command line tool: for every motif in the PWM file, report the score
# and the relative cutoff that correspond to the requested false positive
# rate on the supplied background sequences.
parser = OptionParser()
parser.add_option("-p", "--pwmfile", dest="pwmfile",
                  help="File with pwms", metavar="FILE")
parser.add_option("-i", "--inputfile", dest="inputfile",
                  help="FASTA file with background sequences",
                  metavar="FILE")
parser.add_option("-f", "--fpr", dest="fpr", help="Desired fpr",
                  type="float", metavar="FLOAT")

(options, args) = parser.parse_args()

# Test fpr against None: 0 is inside the accepted range checked below
# and must not be mistaken for a missing option.
if not options.pwmfile or not options.inputfile or options.fpr is None:
    parser.print_help()
    sys.exit()

if options.fpr < 0 or options.fpr > 1:
    print("Please specify a FPR between 0 and 1")
    sys.exit()

f = Fasta(options.inputfile)
motifs = pwmfile_to_motifs(options.pwmfile)

print("Motif\tScore\tCutoff")
for motif in motifs:
    pwm = motif.pwm
    scores = []
    min_score = motif.pwm_min_score()
    for name, seq in f.items():
        # Scan both strands with the minimum score as threshold and keep
        # the score of the single reported match.
        result = pwmscan(seq.upper(), pwm, min_score, 1, True)
        score = result[0][0]
        scores.append(score)
    # Score below which (1 - fpr) of the background sequences fall.
    opt_score = scoreatpercentile(scores, 100 - (100 * options.fpr))
    # Express the score as a fraction of the motif's score range.
    cutoff = (opt_score - min_score) / (motif.pwm_max_score() - min_score)
    print("%s\t%s\t%s" % (motif.id, opt_score, cutoff))
sys.exit(0) inputfile = options.inputfile if options.nreport: nreport = int(options.nreport) cutoff = float(options.cutoff) motifs = pwmfile_to_motifs(options.pwmfile) bed = options.bed f = Fasta(inputfile) strandmap = {-1:"-",1:"+"} for (id,seq) in f.items(): for motif in motifs: pwm = motif.pwm c = motif.pwm_min_score() + (motif.pwm_max_score() - motif.pwm_min_score()) * cutoff result = pwmscan(seq.upper(), pwm, c, nreport) for (score, pos, strand) in result: if bed: first = id.split(" ")[0] (chr,loc) = first.split(":") if loc: (start, end) = map(int, loc.split("-")) print "%s\t%s\t%s\t%s" % (chr, start + pos, start + pos + len(pwm) , score) else: print "%s\t%s\t%s\t%s" % (id, pos, pos + len(pwm), score) else: print "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tmotif_name \"%s\" ; motif_instance \"%s\"" % (
def genome(args):
    """
    Download gene annotation and genome FASTA for a build, then index it.

    Parameters
    ----------
    args : argparse object
        Command line arguments. The attributes read here are
        ``indexdir``, ``fastadir`` and ``genomebuild``.

    Exits the process with status 1 when one of the directories does not
    exist, when "genePredToBed" is not on the path, when the genome
    directory cannot be created or when no valid genome file could be
    downloaded.
    """
    config = MotifConfig()

    if not os.path.exists(args.indexdir):
        print("Index_dir %s does not exist!" % (args.indexdir))
        sys.exit(1)

    if not os.path.exists(args.fastadir):
        print("FASTA dir %s does not exist!" % (args.fastadir))
        sys.exit(1)

    pred_bin = "genePredToBed"
    pred = find_executable(pred_bin)
    if not pred:
        sys.stderr.write("{} not found in path!\n".format(pred_bin))
        sys.exit(1)

    fastadir = args.fastadir
    genomebuild = args.genomebuild
    genome_dir = os.path.join(fastadir, genomebuild)
    index_dir = os.path.join(args.indexdir, args.genomebuild)

    # Check for rights to write to directory
    if not os.path.exists(genome_dir):
        try:
            os.mkdir(genome_dir)
        except OSError:
            sys.stderr.write(
                "Could not create genome dir {}\n".format(genome_dir))
            sys.exit(1)

    # Download gene file based on URL + genomebuild
    gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genomebuild)

    # Collect the annotation file names mentioned in the remote listing.
    anno = []
    p = re.compile(r'\w+.Gene.txt.gz')
    listing = urllib2.urlopen(UCSC_GENE_URL.format(genomebuild))
    try:
        for line in listing.readlines():
            m = p.search(line)
            if m:
                anno.append(m.group(0))
    finally:
        listing.close()

    sys.stderr.write(
        "Retrieving gene annotation for {}\n".format(genomebuild))
    url = ""
    # ANNOS is searched in order; the first available annotation wins.
    for a in ANNOS:
        if a in anno:
            url = UCSC_GENE_URL.format(genomebuild) + a
            break

    if url:
        # delete=False keeps the file for the shell pipeline; it is
        # removed explicitly once the conversion has run.
        tmp = NamedTemporaryFile(delete=False, suffix=".gz")
        tmp.close()
        try:
            urllib.urlretrieve(url, tmp.name)
            sp.call("zcat {} | cut -f2-11 | {} /dev/stdin {}".format(
                tmp.name, pred, gene_file), shell=True)
        finally:
            os.unlink(tmp.name)
    else:
        sys.stderr.write("No annotation found!")

    # download genome based on URL + genomebuild
    sys.stderr.write("Downloading {} genome\n".format(genomebuild))
    for genome_url in [UCSC_GENOME_URL, ALT_UCSC_GENOME_URL]:
        remote = genome_url.format(genomebuild)
        genome_fa = os.path.join(genome_dir, os.path.split(remote)[-1])
        sys.stderr.write("Trying to download {}\n".format(remote))
        urllib.urlretrieve(remote, genome_fa)
        if not check_genome_file(genome_fa):
            # Drop the unusable download before trying the next URL.
            if os.path.exists(genome_fa):
                os.unlink(genome_fa)
            continue
        break

    if not check_genome_file(genome_fa):
        sys.stderr.write("Failed to download genome\n")
        sys.exit(1)

    sys.stderr.write("Unpacking\n")
    # The command runs with cwd=genome_dir, so relative paths would be
    # resolved against the wrong directory; use absolute ones.
    archive = os.path.abspath(genome_fa)
    if genome_fa.endswith("tar.gz"):
        cmd = "tar -C {0} -xvzf {1} && rm {1}".format(
            os.path.abspath(genome_dir), archive)
    else:
        # gunzip removes the archive itself on success; an extra "rm"
        # would only fail on the already missing file.
        cmd = "gunzip {0}".format(archive)
    sp.call(cmd, shell=True, cwd=genome_dir)

    fa_files = glob("{}/*.fa".format(genome_dir))
    if len(fa_files) == 1:
        source = fa_files[0]
        written = set()
        # Assumes Fasta has read all sequences before the loop starts
        # writing - TODO confirm.
        for n, s in Fasta(source).items():
            # Both placeholders need a value, the file has to be opened
            # for writing and a FASTA header starts with ">".
            target = "{}/{}.fa".format(genome_dir, n)
            with open(target, "w") as out:
                out.write(">{}\n{}\n".format(n, s))
            written.add(os.path.abspath(target))
        # Do not delete the source when a sequence file was just written
        # to that very path (single sequence named like the file).
        if os.path.abspath(source) not in written:
            os.unlink(source)

    sys.stderr.write("Creating index\n")
    g = GenomeIndex()
    g = g.create_index(genome_dir, index_dir)
plot_histogram(matches - width / 2 + len(motif) / 2, outfile, xrange=(-width / 2, width / 2), breaks=21, title="%s (p=%0.2e)" % (motif.id, p), xlabel="Position") return motif.id, p else: return motif.id, 1.0 if not options.fastafile and not options.pwmfile: parser.print_help() sys.exit() fastafile = options.fastafile pwmfile = options.pwmfile lwidth = options.width if not lwidth: f = Fasta(fastafile) lwidth = len(f.items()[0][1]) f = None job_server = pp.Server(secret="pumpkinrisotto") jobs = [] motifs = pwmfile_to_motifs(pwmfile) ids = [motif.id for motif in motifs] if options.ids: ids = options.ids.split(",") for motif in motifs: if motif.id in ids: outfile = os.path.join("%s_histogram" % motif.id) jobs.append(job_server.submit(motif_localization, (fastafile,motif,lwidth,outfile, options.cutoff), (),())) for job in jobs: