def as_fasta(seqs, index_dir=None):
    """Return the input sequences as a Fasta object.

    Parameters
    ----------
    seqs : object
        Input as classified by ``get_seqs_type``: "fasta" (returned
        unchanged), "fastafile", "bedfile", "regionfile", or anything else,
        which is treated as an iterable of region strings whose fields are
        separated by ``:`` and ``-``.
    index_dir : str, optional
        Passed to ``track2fasta``. Required for every input type except
        "fasta" and "fastafile".

    Returns
    -------
    Fasta

    Raises
    ------
    ValueError
        If a conversion is required and `index_dir` is None.
    """
    ftype = get_seqs_type(seqs)
    if ftype == "fasta":
        return seqs
    if ftype == "fastafile":
        return Fasta(seqs)

    if index_dir is None:
        raise ValueError("need index_dir / genome to convert to FASTA")

    tmpfa = NamedTemporaryFile()
    if ftype == "bedfile":
        track2fasta(index_dir, seqs, tmpfa.name)
    else:
        if ftype == "regionfile":
            # Close the region file promptly and ignore blank lines, which
            # would otherwise fail in the three-field format() below.
            with open(seqs) as handle:
                seqs = [line.strip() for line in handle if line.strip()]

        # Text mode: the default binary mode rejects str writes on Python 3.
        tmpbed = NamedTemporaryFile(mode="w")
        for seq in seqs:
            vals = re.split(r'[:-]', seq)
            tmpbed.write("{}\t{}\t{}\n".format(*vals))
        tmpbed.flush()

        track2fasta(index_dir, tmpbed.name, tmpfa.name)
    return Fasta(tmpfa.name)
def location(args):
    """
    Creates histogram of motif location.

    Parameters
    ----------
    args : argparse object
        Command line arguments. The attributes ``fastafile``, ``pwmfile``,
        ``width``, ``ids`` and ``cutoff`` are read.
    """
    fastafile, pwmfile = args.fastafile, args.pwmfile

    # Without an explicit width, fall back to the length of the first sequence.
    lwidth = args.width
    if not lwidth:
        lwidth = len(Fasta(fastafile).items()[0][1])

    motifs = pwmfile_to_motifs(pwmfile)
    ids = args.ids.split(",") if args.ids else [motif.id for motif in motifs]

    # One asynchronous localization job per selected motif.
    jobs = [
        pool.apply_async(
            motif_localization,
            (fastafile, motif, lwidth,
             os.path.join("%s_histogram" % motif.id), args.cutoff))
        for motif in motifs
        if motif.id in ids
    ]

    # Block until every job has finished.
    for job in jobs:
        job.get()
def __init__(self, outfile, fg_file=None, background=None, do_counter=True, job_server=None):
    """Initialise the bookkeeping state and, optionally, the statistics inputs.

    Parameters
    ----------
    outfile : str
        Output file name; the file is created (or truncated) here.
    fg_file : str, optional
        Foreground FASTA file name.
    background : dict, optional
        Maps a background name to a FASTA file name.
    do_counter : bool, optional
        Stored as ``self.do_counter``.
    job_server : object, optional
        Job server to use; a ``Pool(2)`` is created when omitted.
    """
    self.lock = thread.allocate_lock()
    self.motifs = []
    self.finished = []
    self.stats = {}
    self.stat_jobs = []
    self.outfile = outfile
    self.job_server = job_server if job_server else Pool(2)
    self.counter = 0
    self.do_counter = do_counter

    # Create or truncate the output file.
    open(outfile, "w").close()

    # Statistics need both a foreground file and at least one background.
    if not (fg_file and background):
        self.do_stats = False
        return

    self.fg_fa = Fasta(fg_file)
    self.background = {bg: Fasta(fname) for bg, fname in background.items()}
    self.do_stats = True
def calculate_enrichment(self, motif_file, fg, bg):
    """Scan the foreground and background sets, then compute enrichment.

    Parameters
    ----------
    motif_file : str
        Motif file handed to ``scan_fasta_file_with_motifs``.
    fg : sequence
        ``[sample_fa, sample_gff]``
    bg : sequence of sequences
        ``[[bg1_fa, bg1_gff, bg1_enrichment],
        [bg2_fa, bg2_gff, bg2_enrichment], .. etc]``

    Notes
    -----
    In parallel mode a job that returns an error is logged and the process
    exits with status 1. In serial mode the return value of the scan is
    not inspected.
    """
    self.logger.info("Scanning background sequences with motifs")

    scan_cmd = scan_fasta_file_with_motifs
    jobs = []

    # Foreground first, then every background: (fasta, gff) pairs to scan.
    # Handling them in one loop also removes the misspelled
    # ``self.SCAN_THRESHOD`` the serial foreground scan used to read.
    scan_targets = [(fg[0], fg[1])] + [tuple(x[:2]) for x in bg]
    for fasta_file, gff_file in scan_targets:
        if self.parallel:
            jobs.append(self.job_server().submit(
                scan_cmd,
                (fasta_file, motif_file, self.SCAN_THRESHOLD, gff_file,),
                (), ()))
        else:
            scan_cmd(fasta_file, motif_file, self.SCAN_THRESHOLD, gff_file)

    for job in jobs:
        error = job()
        if error:
            self.logger.error("Error in thread: %s" % error)
            sys.exit(1)

    self.logger.info("Calculating enrichment")
    enrichment_cmd = gff_enrichment
    num_sample = len(Fasta(fg[0]).items())
    for fasta_file, gff_file, out_file in bg:
        num_bg = len(Fasta(fasta_file).items())
        enrichment_cmd(fg[1], gff_file, num_sample, num_bg, out_file)
def peak2fasta(peak_ids, ref_genome):
    '''
    Convert peak_id into fasta object.

    Args:
        peak_ids (str or list of str): Peak_id.  e.g. "chr5_0930303_9499409"
            or it can be a list of peak_id.
            e.g. ["chr5_0930303_9499409", "chr11_123445555_123445577"]

        ref_genome (str): Reference genome name.
            e.g. "mm9", "mm10", "hg19" etc

    Returns:
        gimmemotifs fasta object: DNA sequence in fasta format

    '''
    genome_data = Genome(ref_genome)

    def peak2seq(peak_id):
        """Return (name, sequence) for one peak id."""
        chromosome_name, start, end = decompose_chrstr(peak_id)
        region = genome_data[chromosome_name][int(start):int(end)]
        # The name is rebuilt from the retrieved region rather than reusing
        # the peak id as passed in.
        name = f"{region.name}_{region.start}_{region.end}"
        return (name, region.seq)

    # Accept a single peak id as well as a list of them; isinstance also
    # covers str subclasses, which the exact type check did not.
    if isinstance(peak_ids, str):
        peak_ids = [peak_ids]

    fasta = Fasta()
    for peak_id in peak_ids:
        name, seq = peak2seq(peak_id)
        fasta.add(name, seq)

    return fasta
def __init__(self, fasta, size=None, n=None, k=1, matrix_only=False):
    """Fill this Fasta with sequences generated by ``_generate_sequence``.

    Parameters
    ----------
    fasta : Fasta
        Input sequences used to initialise the transition matrices.
    size : int, optional
        Length of every generated sequence. When omitted, each generated
        sequence gets the length of a randomly chosen input sequence.
    n : int, optional
        Number of sequences to generate; defaults to ``len(fasta)``.
    k : int, optional
        Passed to ``_initialize_matrices`` and used in the sequence names.
    matrix_only : bool, optional
        If True, stop after the matrices have been initialised.
    """
    self.k = k

    # Initialize super Fasta object
    Fasta.__init__(self)

    # Initialize Markov transition matrix
    self._initialize_matrices(fasta.seqs, k=k)

    if matrix_only:
        return

    target = n if n else len(fasta)
    counter = 0
    while len(self) < target:
        # A template is drawn on every iteration, even when `size` is fixed.
        template = choice(fasta.seqs)
        seq_len = size if size else len(template)
        self.add("random_Markov%s_%s" % (k, counter),
                 self._generate_sequence(seq_len))
        counter += 1
def location(args):
    """Run motif localization for the selected motifs.

    Parameters
    ----------
    args : argparse object
        Command line arguments. The attributes ``fastafile``, ``pwmfile``,
        ``width``, ``ids`` and ``cutoff`` are read.
    """
    fastafile = args.fastafile
    pwmfile = args.pwmfile

    lwidth = args.width
    if not lwidth:
        # No width supplied: use the length of the first sequence.
        first_entry = Fasta(fastafile).items()[0]
        lwidth = len(first_entry[1])

    motifs = pwmfile_to_motifs(pwmfile)
    ids = [motif.id for motif in motifs]
    if args.ids:
        ids = args.ids.split(",")

    jobs = []
    for motif in motifs:
        if motif.id not in ids:
            continue
        outfile = os.path.join("%s_histogram" % motif.id)
        job_args = (fastafile, motif, lwidth, outfile, args.cutoff)
        jobs.append(pool.apply_async(motif_localization, job_args))

    # Wait for all submitted jobs to complete.
    for job in jobs:
        job.get()
def _run_program(self, bin, fastafile, savedir, params=None):
    """Run the external program on a re-headered copy of `fastafile`.

    Parameters
    ----------
    bin : str
        Executable to run.
    fastafile : str
        Input FASTA file.
    savedir : str
        Made absolute but otherwise not used by the visible code.
    params : dict
        Must contain "width", which is passed twice on the command line.

    Returns
    -------
    tuple
        ``(motifs, stdout, stderr)``. `motifs` is the result of
        ``self.parse`` on the output file, or an empty list when the
        program produced none.
    """
    fastafile = os.path.abspath(fastafile)
    savedir = os.path.abspath(savedir)

    basename = "munk_in.fa"
    new_file = os.path.join(self.tmpdir, basename)

    # Each header is replaced by one number per position: a ramp that rises
    # to the middle of the sequence and falls again.
    with open(new_file, "w") as out:
        for _, seq in Fasta(fastafile).items():
            # Floor division plus list(): "/" and "range + range" only
            # behaved this way on Python 2.
            half = len(seq) // 2
            weights = list(range(half)) + list(range(half, 0, -1))
            header = " ".join(["%0.1f" % x for x in weights])
            out.write(">%s\n" % header)
            out.write("%s\n" % seq)

    fastafile = new_file
    outfile = fastafile + ".out"

    current_path = os.getcwd()
    os.chdir(self.dir())
    try:
        # NOTE(review): the command is assembled as a shell string; the paths
        # and width must not contain shell metacharacters.
        cmd = "%s %s %s yes 1.0 p:%s > %s" % (bin, params["width"], params["width"], fastafile, outfile)
        p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
        stdout, stderr = p.communicate()

        motifs = []
        if os.path.exists(outfile):
            # NOTE(review): the handle is left to the garbage collector in
            # case self.parse reads it lazily - confirm and close explicitly.
            motifs = self.parse(open(outfile))
    finally:
        # Restore the working directory even when the run or parse fails.
        os.chdir(current_path)

    return motifs, stdout, stderr
def scan_it_moods(infile, motifs, cutoff, bgfile, nreport=1, scan_rc=True, pvalue=None, count=False):
    """Scan a FASTA file with MOODS in parallel and yield the results.

    Parameters
    ----------
    infile : str
        FASTA file to scan.
    motifs : list
        Motif objects; ``motif.id`` and ``motif.pwm`` are read.
    cutoff : float
        Passed to ``calc_threshold_moods`` when `pvalue` is None.
    bgfile : str
        FASTA file whose concatenated sequences define the background.
    nreport : int, optional
        Forwarded to the scan function.
    scan_rc : bool, optional
        Forwarded to the scan function.
    pvalue : float, optional
        If given, thresholds come from ``MOODS.tools.threshold_from_p``.
    count : bool, optional
        Use ``scan_fa_with_motif_moods_count`` instead of
        ``scan_fa_with_motif_moods``.

    Yields
    ------
    object
        Every item of every chunk result, in chunk order.
    """
    import shutil  # local import: only needed to clean up the temp dir

    tmpdir = mkdtemp()
    pool = None
    try:
        matrices = []
        pseudocount = 1e-3
        bg = MOODS.tools.bg_from_sequence_dna("".join(Fasta(bgfile).seqs), 1)

        # MOODS parses matrices from file, so write each one out first.
        for motif in motifs:
            pfmname = os.path.join(tmpdir, "{}.pfm".format(motif.id))
            with open(pfmname, "w") as f:
                matrix = np.array(motif.pwm).transpose()
                for row in matrix:
                    f.write("{}\n".format(" ".join([str(x) for x in row])))
            matrices.append(MOODS.parsers.pfm_log_odds(pfmname, bg, pseudocount))

        if pvalue is not None:
            thresholds = [
                MOODS.tools.threshold_from_p(m, bg, float(pvalue))
                for m in matrices
            ]
        else:
            thresholds = [calc_threshold_moods(m, float(cutoff)) for m in matrices]

        scanner = MOODS.scan.Scanner(7)
        scanner.set_motifs(matrices, bg, thresholds)

        config = MotifConfig()
        ncpus = int(config.get_default_params()["ncpus"])
        fa = Fasta(infile)

        # Integer chunk size of at least 1: true division yields a float on
        # Python 3, and a step of 0 makes range() raise for small inputs.
        chunk = 500
        if (len(fa) // chunk) < ncpus:
            chunk = max(1, len(fa) // (ncpus + 1))

        func = scan_fa_with_motif_moods_count if count else scan_fa_with_motif_moods

        pool = mp.Pool()
        jobs = [
            pool.apply_async(
                func,
                (fa[i:i + chunk], motifs, matrices, bg, thresholds, nreport, scan_rc),
            )
            for i in range(0, len(fa), chunk)
        ]

        for job in jobs:
            for ret in job.get():
                yield ret
    finally:
        # Runs on exhaustion, on error and when the generator is closed early.
        if pool is not None:
            pool.close()
            pool.join()
        shutil.rmtree(tmpdir, ignore_errors=True)
def __init__(self, fasta, length=None, number=None, k=1, matrix_only=False):
    """Fill this Fasta with sequences generated by ``_generate_sequence``.

    Parameters
    ----------
    fasta : Fasta
        Input sequences used to initialise the transition matrices.
    length : int, optional
        Length of every generated sequence. When omitted, the length of a
        randomly chosen input sequence is used for each one.
    number : int, optional
        Number of sequences to generate; defaults to ``len(fasta)``.
    k : int, optional
        Passed to ``_initialize_matrices`` and used in the sequence names.
    matrix_only : bool, optional
        If True, only the matrices are initialised.
    """
    self.k = k

    # Initialize super Fasta object
    Fasta.__init__(self)

    # Initialize Markov transition matrix
    self._initialize_matrices(fasta.seqs, k=k)

    if matrix_only:
        return

    if not number:
        number = len(fasta)

    serial = 0
    while len(self) < number:
        # The draw happens on every iteration, whether or not it is used.
        sampled = choice(fasta.seqs)
        target_len = length if length else len(sampled)
        seq_name = "random_Markov%s_%s" % (k, serial)
        self.add(seq_name, self._generate_sequence(target_len))
        serial += 1
def setUp(self):
    """Load the TATA motif and the FASTA/GFF fixture paths used by the tests."""
    self.data_dir = "test/data/pwmscan"

    # Close the motif file once it has been read instead of leaking the handle.
    with open(os.path.join(self.data_dir, "TATA.pwm")) as handle:
        self.motif = read_motifs(handle, fmt="pwm")[0]

    self.prom = Fasta(os.path.join(self.data_dir, "promoters.fa"))
    self.prom_gff = os.path.join(self.data_dir, "promoters_result.gff")
    self.random = Fasta(os.path.join(self.data_dir, "random_sequences.fa"))
    self.random_gff = os.path.join(self.data_dir, "random_result.gff")
    self.enrichment = os.path.join(self.data_dir, "enrichment.txt")
    # Only the generated name is kept, as a scratch output path.
    self.tmp = NamedTemporaryFile().name
def remove_zero_seq(fasta_object):
    """
    Remove DNA sequence with zero length

    Returns a new Fasta holding every (id, sequence) pair of `fasta_object`
    whose sequence is non-empty, in the original order.
    """
    # Positions of the sequences that are kept.
    kept = [idx for idx, seq in enumerate(fasta_object.seqs) if seq]

    fasta = Fasta()
    for idx in kept:
        fasta.add(fasta_object.ids[idx], fasta_object.seqs[idx])
    return fasta
def download_genome(genomebuild, genome_dir):
    """Download a genome archive and unpack it into `genome_dir`.

    Each URL template in ``UCSC_GENOME_URLS`` is tried in turn until a
    download passes ``check_genome_file``. The archive is then unpacked with
    ``tar``, ``unzip`` or ``gunzip`` depending on its extension. If exactly
    one ``.fa`` file results, it is split into one file per sequence.

    Parameters
    ----------
    genomebuild : str
        Value substituted into every URL template.
    genome_dir : str
        Directory that receives the download and the unpacked files.

    Notes
    -----
    Exits the process with status 1 when no URL yields a valid file.
    """
    # download genome based on URL + genomebuild
    sys.stderr.write("Downloading {} genome\n".format(genomebuild))

    genome_fa = None  # stays None when the URL list is empty
    for genome_url in UCSC_GENOME_URLS:
        remote = genome_url.format(genomebuild)
        genome_fa = os.path.join(genome_dir, os.path.split(remote)[-1])

        sys.stderr.write("Trying to download {}\n".format(remote))
        try:
            urlretrieve(remote, genome_fa)
            if not check_genome_file(genome_fa):
                os.unlink(genome_fa)
                continue
            break
        except Exception as exc:
            # Best effort: report the failure and try the next URL. Unlike a
            # bare except, this lets KeyboardInterrupt/SystemExit through.
            sys.stderr.write("Download failed: {}\n".format(exc))

    if genome_fa is None or not check_genome_file(genome_fa):
        sys.stderr.write("Failed to download genome\n")
        sys.exit(1)

    sys.stderr.write("Unpacking\n")
    archive = os.path.basename(genome_fa)
    # Argument lists instead of shell strings. The command runs inside
    # genome_dir, so no "-C genome_dir" is needed; that flag resolved against
    # the wrong directory when genome_dir was relative.
    if archive.endswith("tar.gz"):
        cmd = ["tar", "-xvzf", archive]
    elif archive.endswith(".zip"):
        cmd = ["unzip", archive]
    else:
        cmd = ["gunzip", archive]
    sp.call(cmd, cwd=genome_dir)

    fa_files = glob("{}/*.fa".format(genome_dir))
    if len(fa_files) == 1:
        combined = fa_files[0]
        written = set()
        for name, seq in Fasta(combined).items():
            target = "{}/{}.fa".format(genome_dir, name)
            with open(target, "w") as out:
                out.write(">{}\n{}\n".format(name, seq))
            written.add(os.path.abspath(target))
        # Do not delete the combined file if a per-sequence file with the
        # same name has just replaced it.
        if os.path.abspath(combined) not in written:
            os.unlink(combined)

    # Remove the archive if the unpacking step left it behind.
    archive_path = os.path.join(genome_dir, archive)
    if os.path.exists(archive_path):
        os.unlink(archive_path)
def _create_background(self, bg_type, bedfile, fafile, outfile, organism="hg18", width=200, nr_times=10):
    """Write a background FASTA file of the requested type.

    Parameters
    ----------
    bg_type : str
        One of "random", "genomic", "gc", "promoter" or "user".
    bedfile : str
        Not used by this method.
    fafile : str
        Foreground FASTA file; the generated backgrounds contain
        ``nr_times * len(foreground)`` sequences.
    outfile : str
        File the background sequences are written to.
    organism : str, optional
        Selects the index directory and gene file.
    width : int, optional
        Sequence length for the "genomic" and "promoter" backgrounds, and
        the reference length for the "user" background check.
    nr_times : int, optional
        Multiplier for the number of background sequences.

    Returns
    -------
    int or None
        Number of background sequences, or None for an unrecognised
        `bg_type`.
    """
    fg = Fasta(fafile)

    if bg_type == "random":
        if int(self.markov_model) >= 6:
            self.logger.warning("Are you sure about the Markov model? It seems too high!")
        else:
            # Fall back to "<n>th" so an order outside 1-5 (e.g. 0) cannot
            # raise KeyError from what is only a log message.
            order = {"1": "1st", "2": "2nd", "3": "3rd", "4": "4th", "5": "5th"}.get(
                str(self.markov_model), "%sth" % self.markov_model)
            self.logger.debug("Creating random background (%s order Markov)" % order)

        m = MarkovFasta(fg, k=int(self.markov_model), n=nr_times * len(fg))
        m.writefasta(outfile)
        self.logger.debug("Random background: %s", outfile)
        # return the number of random sequences created
        return len(m)

    elif bg_type == "genomic":
        self.logger.debug("Creating genomic background")
        index_dir = os.path.join(self.config.get_index_dir(), organism)
        f = RandomGenomicFasta(index_dir, width, nr_times * len(fg))
        f.writefasta(outfile)
        return len(f)

    elif bg_type == "gc":
        self.logger.debug("Creating GC matched background")
        f = MatchedGcFasta(fafile, organism, nr_times * len(fg))
        f.writefasta(outfile)
        self.logger.debug("GC matched background: %s", outfile)
        return len(f)

    elif bg_type == "promoter":
        gene_file = os.path.join(self.config.get_gene_dir(), "%s.bed" % organism)
        index_dir = os.path.join(self.config.get_index_dir(), organism)
        self.logger.info(
            "Creating random promoter background (%s, using genes in %s)",
            organism, gene_file)
        f = PromoterFasta(gene_file, index_dir, width, nr_times * len(fg))
        f.writefasta(outfile)
        self.logger.debug("Random promoter background: %s", outfile)
        return len(f)

    elif bg_type == "user":
        bg_file = self.params["user_background"]
        if not os.path.exists(bg_file):
            self.logger.error(
                "User-specified background file %s does not exist!",
                bg_file)
            sys.exit(1)

        self.logger.info("Copying user-specified background file %s to %s.",
                         bg_file, outfile)
        fa = Fasta(bg_file)
        median_len = median([len(seq) for seq in fa.seqs])
        # Warn when the median length deviates more than 5% from `width`.
        if median_len < width * 0.95 or median_len > width * 1.05:
            self.logger.warning("The user-specified background file %s contains sequences with a median length of %s, while GimmeMotifs predicts motifs in sequences of length %s. This will influence the statistics! It is recommended to use background sequences of the same length.", bg_file, median_len, width)
        fa.writefasta(outfile)
        return len(fa)
def get_roc_values(motif, fg_file, bg_file):
    """Compute ROC curve values for a motif on foreground vs background.

    Parameters
    ----------
    motif : object
        Motif providing ``pwm_scan_score``.
    fg_file : str
        Foreground FASTA file.
    bg_file : str
        Background FASTA file.

    Returns
    -------
    tuple
        ``(None, x, y)`` on success, ``(error, [], [])`` when any exception
        was raised. Exceptions are returned rather than propagated.
    """
    try:
        # Best score per sequence (nreport=1, cutoff 0.0).
        fg_result = motif.pwm_scan_score(Fasta(fg_file), cutoff=0.0, nreport=1)
        fg_vals = [sorted(scores)[-1] for scores in fg_result.values()]

        bg_result = motif.pwm_scan_score(Fasta(bg_file), cutoff=0.0, nreport=1)
        bg_vals = [sorted(scores)[-1] for scores in bg_result.values()]

        (x, y) = ROC_values(fg_vals, bg_vals)
        return None, x, y
    except Exception as e:
        # "except Exception, e" is a syntax error on Python 3; the "as" form
        # works on Python 2.6+ and 3.
        error = e
    return error, [], []
def as_fasta(seqs, genome=None):
    """Return the input sequences as a Fasta object.

    Parameters
    ----------
    seqs : object
        Input as classified by ``get_seqs_type``. "fasta" is returned
        unchanged and "fastafile" is loaded; anything else is converted
        with ``genome.track2fasta``.
    genome : str or Genome, optional
        Genome name or object. Required when a conversion is needed.

    Returns
    -------
    Fasta

    Raises
    ------
    ValueError
        If a conversion is required and `genome` is None.
    """
    ftype = get_seqs_type(seqs)
    if ftype == "fasta":
        return seqs
    if ftype == "fastafile":
        return Fasta(seqs)

    if genome is None:
        raise ValueError("need genome to convert to FASTA")

    tmpfa = NamedTemporaryFile()
    # isinstance also accepts str subclasses, unlike the exact type() check.
    if isinstance(genome, str):
        genome = Genome(genome)
    genome.track2fasta(seqs, tmpfa.name)
    return Fasta(tmpfa.name)
class TestMotifPwm(unittest.TestCase):
    """ A test class to test Motif pwmscan functionality and related things """

    def setUp(self):
        """Load the TATA motif and the FASTA/GFF fixture paths."""
        self.data_dir = "test/data/pwmscan"

        self.motif = pwmfile_to_motifs(os.path.join(self.data_dir, "TATA.pwm"))[0]
        self.prom = Fasta(os.path.join(self.data_dir, "promoters.fa"))
        self.prom_gff = os.path.join(self.data_dir, "promoters_result.gff")
        self.random = Fasta(os.path.join(self.data_dir, "random_sequences.fa"))
        self.random_gff = os.path.join(self.data_dir, "random_result.gff")
        self.enrichment = os.path.join(self.data_dir, "enrichment.txt")
        # Only the generated name is kept, as a scratch output path.
        self.tmp = NamedTemporaryFile().name

    @staticmethod
    def _read(path):
        """Return the full contents of `path`, closing the file afterwards."""
        with open(path) as handle:
            return handle.read()

    def test1_pwm_scan(self):
        """ Scan a FASTA file with PWM of motif """
        result = self.motif.pwm_scan(self.prom, nreport=1)

        # Every sequence should have a TATA match
        self.assertEqual(len(result.keys()), len(self.prom.items()))

    def test2_pwm_scan_to_gff(self):
        """ Scan a FASTA file with PWM of motif, and produce GFF """
        self.motif.pwm_scan_to_gff(self.prom, self.tmp)
        self.assertEqual(self._read(self.prom_gff), self._read(self.tmp))

    def test3_gff_enrichment(self):
        """ Test gff_enrichment """
        self.motif.pwm_scan_to_gff(self.random, self.random_gff)
        gff_enrichment(self.prom_gff, self.random_gff, 316, 3160, self.tmp)
        self.assertEqual(self._read(self.enrichment), self._read(self.tmp))

    def tearDown(self):
        """Remove the scratch output file if a test created it."""
        if os.path.exists(self.tmp):
            os.remove(self.tmp)
def check_denovo_input(inputfile, params):
    """Determine the input type and keep the backgrounds valid for it.

    Parameters
    ----------
    inputfile : str
        Input file; treated as FASTA if ``Fasta`` can load it, else as BED.
    params : dict
        The keys "genome" and "background" are read.

    Returns
    -------
    tuple
        ``(input_type, background)``: `input_type` is "FASTA" or "BED" and
        `background` holds only the backgrounds valid for that type.

    Notes
    -----
    Exits the process with status 1 when no valid background remains.
    """
    genome = params["genome"]
    background = params["background"]

    # If we can load it as fasta then it is a fasta, yeh?
    try:
        Fasta(inputfile)
    except Exception:
        # Leave it to BED
        input_type = "BED"
    else:
        logger.debug("Inputfile is a FASTA file")
        input_type = "FASTA"

    if input_type == "FASTA":
        valid_bg = FA_VALID_BGS
    else:
        valid_bg = BED_VALID_BGS
        if "genomic" in background:
            Genome(genome)
        # is it a valid bed-file etc.
        check_bed_file(inputfile)  # bed-specific

    accepted = []
    for bg in background:
        if bg in valid_bg:
            accepted.append(bg)
        else:
            logger.info("Input type is %s, ignoring background type '%s'",
                        input_type, bg)

    if not accepted:
        logger.error("No valid backgrounds specified!")
        sys.exit(1)

    return input_type, accepted
def prepare_denovo_input_fa(inputfile, params, outdir):
    """Create all the FASTA files for de novo motif prediction and validation.

    Parameters
    ----------
    inputfile : str
        Input FASTA file.
    params : dict
        The keys "fraction" and "abs_max" are read and forwarded to
        ``divide_fa_file``.
    outdir : str
        Directory that receives ``prediction.fa``, ``validation.fa`` and
        ``localization.fa``.
    """
    fraction = float(params["fraction"])
    abs_max = int(params["abs_max"])

    logger.info("preparing input (FASTA)")

    pred_fa = os.path.join(outdir, "prediction.fa")
    val_fa = os.path.join(outdir, "validation.fa")
    loc_fa = os.path.join(outdir, "localization.fa")

    # Split inputfile in prediction and validation set
    logger.debug(
        "Splitting %s into prediction set (%s) and validation set (%s)",
        inputfile,
        pred_fa,
        val_fa,
    )

    divide_fa_file(inputfile, pred_fa, val_fa, fraction, abs_max)

    # File for location plots
    shutil.copy(val_fa, loc_fa)
    seqs = Fasta(loc_fa).seqs

    # More than one distinct length means mixed sizes. Unlike indexing
    # seqs[0], this also copes with an empty validation set.
    if len(set(len(seq) for seq in seqs)) > 1:
        logger.warning(
            "PLEASE NOTE: FASTA file contains sequences of different sizes. "
            "Positional preference plots might be incorrect!")
def test_track2fasta_exons(self):
    """ track2fasta should convert bed12 to fasta"""
    from gimmemotifs.fasta import Fasta

    bedfile = os.path.join(self.fasta_dir, "genes.bed")
    fafile = os.path.join(self.fasta_dir, "genes.out")

    # Create index
    self.g.create_index(self.fasta_dir, self.index_dir)

    # Convert bed to fasta
    track2fasta(self.index_dir, bedfile, self.temp_file, use_strand=True)

    expected = Fasta(fafile)
    observed = Fasta(self.temp_file)

    for gene in observed.ids:
        # The reference file is keyed by the last space-separated field.
        name = gene.split(" ")[-1]
        got = observed[gene]
        want = expected[name]
        self.assertEqual(len(got), len(want))
        self.assertEqual(got.upper(), want.upper())
def test1_scan_sequences(self):
    """ Scanner """
    # (threshold, nreport, scan_rc, expected number of matches per sequence)
    cases = [
        (0.0, 1, False, [1, 1, 1]),
        (0.99, 1, False, [0, 1, 1]),
        (0.99, 10, False, [0, 1, 2]),
        (0.99, 10, True, [0, 2, 4]),
    ]

    for ncpus in [1, 2, 3]:
        s = Scanner(ncpus=ncpus)
        s.set_motifs(self.motifs)
        f = Fasta(self.fa)

        for threshold, nreport, scan_rc, expected in cases:
            s.set_threshold(threshold=threshold)
            nmatches = [
                len(m[0])
                for m in s._scan_sequences(f.seqs, nreport, scan_rc)
            ]
            self.assertEqual(expected, nmatches)
def __init__(self, matchfile, genome="hg19", number=None, size=None):
    """Load sequences for the regions written by ``matched_gc_bedfile``.

    Parameters
    ----------
    matchfile : str
        File passed to ``matched_gc_bedfile``.
    genome : str, optional
        Genome name, used for the BED file and the FASTA conversion.
    number : int, optional
        Forwarded to ``matched_gc_bedfile``.
    size : int, optional
        Forwarded to ``matched_gc_bedfile``.
    """
    # Create temporary files
    tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
    tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name

    try:
        # Create bed-file with coordinates of random sequences
        matched_gc_bedfile(tmpbed, matchfile, genome, number, size=size)

        # Convert track to fasta
        Genome(genome).track2fasta(tmpbed, fastafile=tmpfasta)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)
    finally:
        # Delete the temporary files, also when one of the steps failed.
        for fname in (tmpbed, tmpfasta):
            if os.path.exists(fname):
                os.remove(fname)
def get_roc_values(motif, fg_file, bg_file):
    """Compute ROC curve values for a motif on foreground vs background.

    Parameters
    ----------
    motif : object
        Motif providing ``pwm_scan_score``.
    fg_file : str
        Foreground FASTA file.
    bg_file : str
        Background FASTA file.

    Returns
    -------
    tuple
        ``(error, x, y)``. `error` is None on success, otherwise the
        exception that was raised, in which case `x` and `y` are empty
        lists.
    """
    error = None
    x = []
    y = []
    try:
        # Imported here so that a failing import is reported through `error`.
        from gimmemotifs.fasta import Fasta
        from gimmemotifs.rocmetrics import ROC_values

        # The comprehension variable is not called `x`: on Python 2 it would
        # leak and overwrite the result list initialised above.
        fg_result = motif.pwm_scan_score(Fasta(fg_file), cutoff=0.0, nreport=1)
        fg_vals = [sorted(scores)[-1] for scores in fg_result.values()]

        bg_result = motif.pwm_scan_score(Fasta(bg_file), cutoff=0.0, nreport=1)
        bg_vals = [sorted(scores)[-1] for scores in bg_result.values()]

        (x, y) = ROC_values(fg_vals, bg_vals)
    except Exception as e:
        error = e
    # Hand the results back to the caller; they used to be computed and then
    # discarded.
    return error, x, y
def as_fasta(seqs, genome=None):
    """Return the input sequences as a Fasta object.

    Parameters
    ----------
    seqs : object
        Input as classified by ``get_seqs_type``. "fasta" is returned
        unchanged and "fastafile" is loaded; anything else is converted
        with ``genome.track2fasta`` (a numpy array is turned into a list
        first).
    genome : str or Genome, optional
        Genome name or object. Required when a conversion is needed.

    Returns
    -------
    Fasta

    Raises
    ------
    ValueError
        If a conversion is required and `genome` is None.
    """
    ftype = get_seqs_type(seqs)

    # The two cases that need no genome.
    if ftype == "fasta":
        return seqs
    if ftype == "fastafile":
        return Fasta(seqs)

    if genome is None:
        raise ValueError("need genome to convert to FASTA")

    tmpfa = NamedTemporaryFile()
    genome_obj = Genome(genome) if isinstance(genome, str) else genome
    regions = list(seqs) if isinstance(seqs, np.ndarray) else seqs
    genome_obj.track2fasta(regions, tmpfa.name)
    return Fasta(tmpfa.name)
def __init__(self, fasta, length=None, multiply=10):
    """Generate `multiply` sequences for every input sequence.

    Parameters
    ----------
    fasta : Fasta
        Input sequences used to initialise the transition matrices.
    length : int, optional
        Length of every generated sequence; defaults to the length of the
        input sequence it is generated for.
    multiply : int, optional
        Number of sequences generated per input sequence.
    """
    # Initialize super Fasta object
    Fasta.__init__(self)

    # Initialize Markov transition matrix
    self._initialize_matrices(fasta.seqs)

    serial = 0
    for seq in fasta.seqs:
        seq_len = length if length else len(seq)
        for _ in range(multiply):
            seq_name = "random_1st_order_%s" % (serial)
            self.add(seq_name, self._generate_sequence(seq_len))
            serial += 1
def __init__(self, index="/usr/share/gimmemotifs/genome_index/hg18", length=None, n=None):
    """Load sequences for the regions written by
    ``create_random_genomic_bedfile``.

    Parameters
    ----------
    index : str, optional
        Genome index directory.
    length : int
        Sequence length; must be convertible with ``int()``, so the None
        default has to be overridden.
    n : int, optional
        Forwarded to ``create_random_genomic_bedfile``.
    """
    length = int(length)

    # Create temporary files
    tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
    tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name

    try:
        # Create bed-file with coordinates of random sequences
        create_random_genomic_bedfile(tmpbed, index, length, n)

        # Convert track to fasta
        track2fasta(index, tmpbed, tmpfasta, use_strand=True)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)
    finally:
        # Delete the temporary files, also when one of the steps failed.
        for fname in (tmpbed, tmpfasta):
            if os.path.exists(fname):
                os.remove(fname)
def __init__(
    self,
    outfile,
    genome=None,
    fg_file=None,
    background=None,
    gc=False,
    do_counter=True,
    job_server=None,
):
    """Initialise the bookkeeping state and, optionally, the statistics inputs.

    Parameters
    ----------
    outfile : str
        Output file name; the file is created (or truncated) here.
    genome : str, optional
        Genome; required when `gc` is true and statistics are enabled.
    fg_file : str, optional
        Foreground FASTA file name.
    background : dict, optional
        Maps a background name to a FASTA file name.
    gc : bool, optional
        Stored as ``self.gc`` and ``self.zscore`` when statistics are
        enabled.
    do_counter : bool, optional
        Stored as ``self.do_counter``.
    job_server : object, optional
        Job server to use; a ``Pool(2)`` is created when omitted.

    Raises
    ------
    ValueError
        If statistics are enabled with `gc` but without a genome.
    """
    self.lock = thread.allocate_lock()
    self.motifs = []
    self.finished = []
    self.stats = {}
    self.stat_jobs = []
    self.outfile = outfile
    self.genome = genome
    self.job_server = job_server if job_server else Pool(2)
    self.counter = 0
    self.do_counter = do_counter

    # Create or truncate the output file.
    open(outfile, "w").close()

    # Statistics need both a foreground file and at least one background.
    if not (fg_file and background):
        self.do_stats = False
        return

    self.fg_fa = Fasta(fg_file)
    self.background = {bg: Fasta(fname) for bg, fname in background.items()}
    self.do_stats = True
    self.gc = gc
    self.zscore = self.gc
    if self.gc:
        if genome is None:
            raise ValueError(
                "Need a genome when calculating GC% zscores for motif statistics"
            )
        self.genome = genome
def __init__(self, bedfile, genefile, index="/usr/share/gimmemotifs/genome_index/hg18", length=None, multiply=10, match_chromosome=True):
    """Load sequences for the regions written by ``self._create_bedfile``.

    Parameters
    ----------
    bedfile : str
        Forwarded to ``_create_bedfile``.
    genefile : str
        Forwarded to ``_create_bedfile``.
    index : str, optional
        Genome index directory used for the FASTA conversion.
    length : int, optional
        Forwarded to ``_create_bedfile``.
    multiply : int, optional
        Forwarded to ``_create_bedfile``.
    match_chromosome : bool, optional
        Stored as ``self.match_chromosome``.
    """
    self.match_chromosome = match_chromosome

    # Create temporary files
    tmpbed = NamedTemporaryFile().name
    tmpfasta = NamedTemporaryFile().name

    try:
        # Create bed-file with coordinates of random sequences
        self._create_bedfile(tmpbed, bedfile, genefile, length, multiply)

        # Convert track to fasta
        track2fasta(index, tmpbed, tmpfasta)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)
    finally:
        # Delete the temporary files, also when one of the steps failed.
        for fname in (tmpbed, tmpfasta):
            if os.path.exists(fname):
                os.remove(fname)
def __init__(self, genefile, index="/usr/share/gimmemotifs/genome_index/hg18", length=None, n=None):
    """Load sequences for the regions written by
    ``self._create_promoter_bedfile``.

    Parameters
    ----------
    genefile : str
        Forwarded to ``_create_promoter_bedfile``.
    index : str, optional
        Genome index directory used for the FASTA conversion.
    length : int
        Sequence length; must be convertible with ``int()``, so the None
        default has to be overridden.
    n : int, optional
        Forwarded to ``_create_promoter_bedfile``.
    """
    length = int(length)

    # Create temporary files
    tmpbed = NamedTemporaryFile().name
    tmpfasta = NamedTemporaryFile().name

    try:
        # Create bed-file with coordinates of random sequences
        self._create_promoter_bedfile(tmpbed, genefile, length, n)

        # Convert track to fasta
        track2fasta(index, tmpbed, tmpfasta, use_strand=True)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)
    finally:
        # Delete the temporary files, also when one of the steps failed.
        for fname in (tmpbed, tmpfasta):
            if os.path.exists(fname):
                os.remove(fname)
def __init__(self, genome, size=None, n=None):
    """Load sequences for the regions written by
    ``create_random_genomic_bedfile``.

    Parameters
    ----------
    genome : str
        Genome name, used for the BED file and the FASTA conversion.
    size : int
        Sequence length; must be convertible with ``int()``, so the None
        default has to be overridden.
    n : int, optional
        Forwarded to ``create_random_genomic_bedfile``.
    """
    size = int(size)

    # Create temporary files
    tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
    tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name

    try:
        # Create bed-file with coordinates of random sequences
        create_random_genomic_bedfile(tmpbed, genome, size, n)

        # Convert track to fasta
        Genome(genome).track2fasta(tmpbed, fastafile=tmpfasta, stranded=True)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)
    finally:
        # Delete the temporary files, also when one of the steps failed.
        for fname in (tmpbed, tmpfasta):
            if os.path.exists(fname):
                os.remove(fname)
def setUp(self):
    """Load the TATA motif and the FASTA/GFF fixture paths used by the tests."""
    data_dir = "test/data/pwmscan"

    def in_data_dir(fname):
        # Path of a fixture file inside the data directory.
        return os.path.join(data_dir, fname)

    self.data_dir = data_dir
    self.motif = pwmfile_to_motifs(in_data_dir("TATA.pwm"))[0]
    self.prom = Fasta(in_data_dir("promoters.fa"))
    self.prom_gff = in_data_dir("promoters_result.gff")
    self.random = Fasta(in_data_dir("random_sequences.fa"))
    self.random_gff = in_data_dir("random_result.gff")
    self.enrichment = in_data_dir("enrichment.txt")
    # Only the generated name is kept, as a scratch output path.
    self.tmp = NamedTemporaryFile().name
def __init__(self, bedfile, genefile, index="/usr/share/gimmemotifs/genome_index/hg18", length=None, multiply=10, match_chromosome=True):
    """Load sequences for the regions written by ``self._create_bedfile``.

    Parameters
    ----------
    bedfile : str
        Forwarded to ``_create_bedfile``.
    genefile : str
        Forwarded to ``_create_bedfile``.
    index : str, optional
        Genome index directory used for the FASTA conversion.
    length : int
        Sequence length; must be convertible with ``int()``, so the None
        default has to be overridden.
    multiply : int, optional
        Forwarded to ``_create_bedfile``.
    match_chromosome : bool, optional
        Stored as ``self.match_chromosome``.
    """
    self.match_chromosome = match_chromosome
    length = int(length)

    # Create temporary files
    tmpbed = NamedTemporaryFile().name
    tmpfasta = NamedTemporaryFile().name

    try:
        # Create bed-file with coordinates of random sequences
        self._create_bedfile(tmpbed, bedfile, genefile, length, multiply)

        # Convert track to fasta
        track2fasta(index, tmpbed, tmpfasta)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)
    finally:
        # Delete the temporary files, also when one of the steps failed.
        for fname in (tmpbed, tmpfasta):
            if os.path.exists(fname):
                os.remove(fname)
def get_scores(motif, fg_file, bg_file):
    """Compute ROC AUC, MNCP and max F-measure for a motif.

    Parameters
    ----------
    motif : object
        Motif providing ``pwm_scan_score``.
    fg_file : str
        Foreground FASTA file.
    bg_file : str
        Background FASTA file.

    Notes
    -----
    Any exception is captured in the local `error` instead of being raised.
    """
    error = None
    auc = None
    mncp = None
    max_f = None
    y = None
    try:
        # Best score per sequence (nreport=1, cutoff 0.0).
        fg_result = motif.pwm_scan_score(Fasta(fg_file), cutoff=0.0, nreport=1)
        fg_vals = [sorted(scores)[-1] for scores in fg_result.values()]

        bg_result = motif.pwm_scan_score(Fasta(bg_file), cutoff=0.0, nreport=1)
        bg_vals = [sorted(scores)[-1] for scores in bg_result.values()]

        (x, y) = ROC_values(fg_vals, bg_vals)
        auc = ROC_AUC(fg_vals, bg_vals)
        mncp = MNCP(fg_vals, bg_vals)
        max_f, y = max_fmeasure(x, y)
    except Exception as e:
        # "except Exception, e" is a syntax error on Python 3; the "as" form
        # works on Python 2.6+ and 3.
        error = e
    # NOTE(review): error, auc, mncp, max_f and y are computed but the visible
    # code returns nothing - confirm the intended return value with callers.
def scan_fasta_file_with_motifs(fastafile, motiffile, threshold, gfffile, scan_rc=True):
    """Scan a FASTA file with every motif in `motiffile`, appending to a GFF.

    Parameters
    ----------
    fastafile : str
        FASTA file to scan.
    motiffile : str
        Motif file read with ``pwmfile_to_motifs``.
    threshold : float
        Score cutoff; converted with ``float()``.
    gfffile : str
        GFF output file; results of all motifs are appended to it.
    scan_rc : bool, optional
        Forwarded to ``pwm_scan_to_gff``.

    Returns
    -------
    Exception or None
        The exception that interrupted the scan, or None on success.
        Exceptions are returned rather than raised so that a job runner can
        inspect the result.
    """
    error = None
    try:
        # Imported here so that a failing import is reported through `error`.
        from gimmemotifs.fasta import Fasta
        from gimmemotifs.motif import pwmfile_to_motifs

        motifs = pwmfile_to_motifs(motiffile)
        fa = Fasta(fastafile)
        for motif in motifs:
            motif.pwm_scan_to_gff(fa, gfffile, nreport=1, cutoff=float(threshold), scan_rc=scan_rc, append=True)
    except Exception as e:
        error = e
    # Return the captured error; without this the caller always saw None.
    return error
def set_background(self, fname=None, genome=None, length=200, nseq=10000):
    """Set the background to use for FPR and z-score calculations.

    Background can be specified either as a genome name or as the
    name of a FASTA file.

    Parameters
    ----------
    fname : str, optional
        Name of FASTA file to use as background.

    genome : str, optional
        Name of genome to use to retrieve random sequences. Falls back to
        ``self.genome`` when neither `fname` nor `genome` is given.

    length : int, optional
        Length of genomic sequences to retrieve. The default
        is 200.

    nseq : int, optional
        Number of genomic sequences to retrieve.

    Raises
    ------
    ValueError
        If both `fname` and `genome` are given, or if neither is given and
        ``self.genome`` is not set.
    IOError
        If `fname` does not exist.
    """
    length = int(length)

    if genome and fname:
        raise ValueError("Need either genome or filename for background.")

    if fname:
        if not os.path.exists(fname):
            raise IOError(
                "Background file {} does not exist!".format(fname))

        self.background = Fasta(fname)
        self.background_hash = file_checksum(fname)
        return

    if not genome:
        if self.genome:
            genome = self.genome
            logger.info(
                "Using default background: genome {} with length {}".
                format(genome, length))
        else:
            raise ValueError(
                "Need either genome or filename for background.")

    logger.info("Using background: genome {} with length {}".format(
        genome, length))
    with Cache(CACHE_DIR) as cache:
        # The key keeps its literal backslash separator; it is now written
        # as a valid escape ("\\") instead of the invalid sequence "\{".
        # NOTE(review): nseq is not part of the key, so a cached background
        # is reused even when a different nseq is requested.
        self.background_hash = "{}\\{}".format(genome, int(length))
        fa = cache.get(self.background_hash)
        if not fa:
            fa = RandomGenomicFasta(genome, length, nseq)
            cache.set(self.background_hash, fa)
    self.background = fa
def __init__(self, matchfile, genome="hg19", number=None):
    """Load sequences for the regions written by ``matched_gc_bedfile``.

    Parameters
    ----------
    matchfile : str
        File passed to ``matched_gc_bedfile``.
    genome : str, optional
        Genome name; also selects the index directory below the configured
        index dir.
    number : int, optional
        Forwarded to ``matched_gc_bedfile``.
    """
    config = MotifConfig()
    index = os.path.join(config.get_index_dir(), genome)

    # Create temporary files
    tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
    tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name

    try:
        # Create bed-file with coordinates of random sequences
        matched_gc_bedfile(tmpbed, matchfile, genome, number)

        # Convert track to fasta
        track2fasta(index, tmpbed, tmpfasta)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)
    finally:
        # Delete the temporary files, also when one of the steps failed.
        for fname in (tmpbed, tmpfasta):
            if os.path.exists(fname):
                os.remove(fname)
def divide_fa_file(fname, sample, rest, fraction, abs_max):
    """Randomly split a FASTA file into a sample file and a rest file.

    Parameters
    ----------
    fname : str
        Input FASTA file.
    sample : str
        Output file for the randomly drawn sequences.
    rest : str
        Output file for all remaining sequences.
    fraction : float
        Fraction of the sequences to draw.
    abs_max : int
        Upper limit on the number of sequences to draw.

    Returns
    -------
    tuple of int
        ``(number drawn, number of ids minus number drawn)``. These match
        the file contents as long as the sequence ids are unique.
    """
    fa = Fasta(fname)
    ids = fa.ids[:]

    x = int(fraction * len(ids))
    if x > abs_max:
        x = abs_max

    # A set makes the membership test in the loop O(1) instead of O(n).
    sample_seqs = set(random.sample(ids, x))

    # Both outputs are closed even if writing fails half-way.
    with open(sample, "w") as f_sample, open(rest, "w") as f_rest:
        for name, seq in fa.items():
            out = f_sample if name in sample_seqs else f_rest
            out.write(">%s\n%s\n" % (name, seq))

    return x, len(ids[x:])
def motif_localization(fastafile, motif, width, outfile, cutoff=0.9):
    """Plot a histogram of motif match positions and test their distribution.

    Parameters
    ----------
    fastafile : str
        FASTA file to scan.
    motif : object
        Motif providing ``pwm_scan``, ``id`` and ``len()``.
    width : int
        Sequence width used to centre the positions.
    outfile : str
        Output file passed to ``plot_histogram``.
    cutoff : float, optional
        Scan cutoff.

    Returns
    -------
    tuple
        ``(motif.id, p)`` where `p` comes from ``ks_pvalue``, or
        ``(motif.id, 1.0)`` when there are no matches.
    """
    NR_HIST_MATCHES = 100

    matches = motif.pwm_scan(Fasta(fastafile), cutoff=cutoff, nreport=NR_HIST_MATCHES)
    if not len(matches) > 0:
        return motif.id, 1.0

    # Flatten the per-sequence match positions into one array.
    positions = []
    for hits in matches.values():
        positions.extend(hits)
    matches = np.array(positions)

    p = ks_pvalue(matches, width - len(motif))
    half_width = width / 2
    plot_histogram(
        matches - half_width + len(motif) / 2,
        outfile,
        xrange=(-half_width, half_width),
        breaks=21,
        title="%s (p=%0.2e)" % (motif.id, p),
        xlabel="Position",
    )
    return motif.id, p
def location(args):
    """
    Creates histogram of motif location.

    Parameters
    ----------
    args : argparse object
        Command line arguments. The attributes ``fastafile``, ``pwmfile``,
        ``width``, ``ids`` and ``cutoff`` are read.
    """
    fastafile = args.fastafile
    pwmfile = args.pwmfile

    lwidth = args.width
    if not lwidth:
        # No width supplied: use the length of the first sequence.
        lwidth = len(Fasta(fastafile).items()[0][1])

    motifs = pwmfile_to_motifs(pwmfile)
    ids = [motif.id for motif in motifs]
    if args.ids:
        ids = args.ids.split(",")

    n_cpus = int(MotifConfig().get_default_params()["ncpus"])
    pool = Pool(processes=n_cpus, maxtasksperchild=1000)
    try:
        jobs = []
        for motif in motifs:
            if motif.id in ids:
                outfile = os.path.join("%s_histogram" % motif.id)
                jobs.append(
                    pool.apply_async(
                        motif_localization,
                        (fastafile, motif, lwidth, outfile, args.cutoff)
                    ))

        for job in jobs:
            job.get()
    finally:
        # The pool is created here, so it is shut down here as well instead
        # of leaving its worker processes behind.
        pool.close()
        pool.join()
def __init__(self, fasta, length=None, multiply=10, k=1, matrix_only=False):
    """Generate `multiply` sequences for every input sequence.

    Parameters
    ----------
    fasta : Fasta
        Input sequences used to initialise the transition matrices.
    length : int, optional
        Length of every generated sequence; defaults to the length of the
        input sequence it is generated for.
    multiply : int, optional
        Number of sequences generated per input sequence.
    k : int, optional
        Passed to ``_initialize_matrices`` and used in the sequence names.
    matrix_only : bool, optional
        If True, only the matrices are initialised.
    """
    self.k = k

    # Initialize super Fasta object
    Fasta.__init__(self)

    # Initialize Markov transition matrix
    self._initialize_matrices(fasta.seqs, k=k)

    if matrix_only:
        return

    serial = 0
    for seq in fasta.seqs:
        seq_len = length if length else len(seq)
        for _ in range(multiply):
            seq_name = "random_Markov%s_%s" % (k, serial)
            self.add(seq_name, self._generate_sequence(seq_len))
            serial += 1
class TestMotifPwm(unittest.TestCase):
    """ A test class to test Motif pwmscan functionality and related things """

    def setUp(self):
        """Load the TATA motif and the FASTA/GFF fixture paths."""
        self.data_dir = "test/data/pwmscan"

        self.motif = pwmfile_to_motifs(os.path.join(self.data_dir, "TATA.pwm"))[0]
        self.prom = Fasta(os.path.join(self.data_dir, "promoters.fa"))
        self.prom_gff = os.path.join(self.data_dir, "promoters_result.gff")
        self.random = Fasta(os.path.join(self.data_dir, "random_sequences.fa"))
        self.random_gff = os.path.join(self.data_dir, "random_result.gff")
        self.enrichment = os.path.join(self.data_dir, "enrichment.txt")
        # Only the generated name is kept, as a scratch output path.
        self.tmp = NamedTemporaryFile().name

    def test1_pwm_scan(self):
        """ Scan a FASTA file with PWM of motif """
        result = self.motif.pwm_scan(self.prom, nreport=1)

        # Every sequence should have a TATA match
        self.assertEqual(len(result.keys()), len(self.prom.items()))

    def test2_pwm_scan_to_gff(self):
        """ Scan a FASTA file with PWM of motif, and produce GFF """
        self.motif.pwm_scan_to_gff(self.prom, self.tmp)

        with open(self.tmp) as handle:
            for line in handle:
                vals = line.strip().split("\t")
                # Nine tab-separated columns per line; columns 4-7 hold
                # start, end, score and strand.
                self.assertEqual(9, len(vals))
                self.assertTrue(int(vals[3]) > 0)
                self.assertTrue(int(vals[4]) > 0)
                self.assertTrue(float(vals[5]) > 5.25)
                self.assertTrue(float(vals[5]) < 9.06)
                self.assertIn(vals[6], ["+", "-"])

    def test3_gff_enrichment(self):
        """ Test gff_enrichment """
        self.motif.pwm_scan_to_gff(self.random, self.random_gff)
        gff_enrichment(self.prom_gff, self.random_gff, 316, 3160, self.tmp)

        with open(self.tmp) as f:
            f.readline()  # Header
            vals = f.readline().strip().split("\t")

        self.assertEqual(vals[0], "TATA-box")
        self.assertLess(float(vals[2]), 1e-60)
        self.assertGreater(float(vals[5]), 1.5)

    def tearDown(self):
        """Remove the scratch output file if a test created it."""
        if os.path.exists(self.tmp):
            os.remove(self.tmp)
# Report, for every motif, the score and relative cutoff that give the
# requested false positive rate on a set of background sequences.
parser = OptionParser()
parser.add_option("-p", "--pwmfile", dest="pwmfile", help="File with pwms", metavar="FILE")
parser.add_option("-i", "--inputfile", dest="inputfile", help="FASTA file with background sequences", metavar="FILE")
parser.add_option("-f", "--fpr", dest="fpr", help="Desired fpr", type="float", metavar="FLOAT")

(options, args) = parser.parse_args()

# Compare against None: "not options.fpr" also rejected the value 0, which
# the range check below explicitly allows.
if not options.pwmfile or not options.inputfile or options.fpr is None:
    parser.print_help()
    sys.exit()

if options.fpr < 0 or options.fpr > 1:
    print("Please specify a FPR between 0 and 1")
    sys.exit()

f = Fasta(options.inputfile)
motifs = pwmfile_to_motifs(options.pwmfile)

print("Motif\tScore\tCutoff")
for motif in motifs:
    pwm = motif.pwm
    min_score = motif.pwm_min_score()

    # One pwmscan result per background sequence; element [0][0] is used as
    # that sequence's score.
    scores = []
    for _, seq in f.items():
        result = pwmscan(seq.upper(), pwm, min_score, 1, True)
        scores.append(result[0][0])

    # Score at the (1 - fpr) percentile, then rescaled to the motif's
    # min-max score range to obtain a relative cutoff.
    opt_score = scoreatpercentile(scores, 100 - (100 * options.fpr))
    cutoff = (opt_score - min_score) / (motif.pwm_max_score() - min_score)
    print("%s\t%s\t%s" % (motif.id, opt_score, cutoff))
def nmer_predict(fastafile):
    """Predict motifs from n-mers whose positions pile up in the middle bins.

    Steps, as implemented below:

    1. Record the start positions of every 6-, 8-, 10- and 12-mer
       (upper-cased) over all sequences in ``fastafile``.
    2. Keep n-mers that occur more often than 2% of the number of sequences
       and whose positional histogram is enriched in the central bins.
    3. Write the kept n-mers as count matrices and cluster them.
    4. Refine the cluster motifs by scanning ``fastafile`` with ``pwmscan.py``,
       cluster the refined motifs again and refine once more.

    Parameters
    ----------
    fastafile : str
        Name of the FASTA file to analyse.

    Returns
    -------
    tuple
        ``(motifs, "", "")`` where ``motifs`` is the list of refined motifs,
        with ids ``WannaMotif_1``, ``WannaMotif_2``, ...
    """
    from tempfile import NamedTemporaryFile,mkdtemp
    from gimmemotifs.fasta import Fasta
    from numpy import sum,histogram
    from subprocess import Popen,PIPE
    from gimmemotifs.motif import Motif,motif_from_align
    from gimmemotifs.cluster import cluster_motifs
    from string import maketrans

    def rc(seq):
        """Return the reverse complement of an upper-case DNA string."""
        t = maketrans("ATCG", "TAGC")
        return seq[::-1].translate(t)

    def refine_by_scanning(motifs, fastafile):
        """Scan fastafile with the motifs and rebuild each motif from its hits."""
        tmp_gff = NamedTemporaryFile()
        file_in = NamedTemporaryFile()
        for m in motifs:
            file_in.write("%s\n" % m.to_pfm())
        file_in.flush()

        # Pass the arguments as a list and redirect stdout directly, so file
        # names with spaces or shell metacharacters cannot break the command.
        cmd = ["pwmscan.py", "-i", fastafile, "-p", file_in.name, "-c", "0.8"]
        p = Popen(cmd, stdout=tmp_gff)
        p.communicate()

        # Collect the matched instances per motif, all on the forward strand.
        aligns = {}
        for line in open(tmp_gff.name):
            vals = line.strip().split("\t")
            motif_name, instance = [x.split(" ")[1].replace('"', "") for x in vals[8].split(" ; ")]

            if vals[6] == "+":
                aligns.setdefault(motif_name, []).append(instance.upper())
            else:
                aligns.setdefault(motif_name, []).append(rc(instance.upper()))

        # Only motifs supported by more than 10 instances are rebuilt.
        refined_motifs = []
        for align in aligns.values():
            if len(align) > 10:
                refined_motifs.append(motif_from_align(align))

        return refined_motifs

    f = Fasta(fastafile)
    nmer = {}
    # n-mer length -> factor by which the central bins must exceed the flanks
    N = {6:4, 8:3,10:2,12:1}
    tmp = NamedTemporaryFile()

    # An n-mer needs more occurrences than 2% of the number of sequences.
    abs_cutoff = len(f.items()) / 100.0 * 2

    for check_n in N:
        for _, seq in f.items():
            # NOTE(review): this range stops one short of the last possible
            # start position, so the final n-mer of each sequence is skipped.
            # Kept as in the original - confirm whether that is intended.
            for i in range(len(seq) - check_n):
                n = seq[i: i + check_n]
                nmer.setdefault(n.upper(), []).append(i)

    for n, pos in nmer.items():
        if len(pos) > abs_cutoff:
            # NOTE(review): range=(0,200) presumably assumes 200 bp input
            # sequences - confirm against callers.
            hist = histogram(pos, bins=9, range=(0,200))[0]
            factor = N[len(n)]
            center = sum(hist[3:6])
            # NOTE(review): the right flank uses bins 7-8 only (bin 6 is in
            # neither group). Kept as in the original - confirm.
            if center > sum(hist[0:3]) * factor and center > sum(hist[7:]) * factor:
                tmp.write(">%s\n" % n)
                # One row per position: the occurrence count on the matching
                # nucleotide column (A, C, G, T), zero elsewhere.
                for char in n:
                    w = [len(pos) if x == char else 0 for x in ["A", "C", "G", "T"]]
                    tmp.write("\t".join([str(x) for x in w]) + "\n")

    tmp.flush()

    tree = cluster_motifs(tmp.name, "subtotal", "ed", "mean", False, threshold=-0.1, include_bg=False)
    clusters = tree.getResult()

    motifs = refine_by_scanning([x[0] for x in clusters], fastafile)

    tmp4 = NamedTemporaryFile()
    for m in motifs:
        tmp4.write("%s\n" % m.to_pfm())
    tmp4.flush()

    motifs = []
    tree = cluster_motifs(tmp4.name, "total", "wic", "mean", True, threshold=0.95, include_bg=True)
    clusters = tree.getResult()
    for i, (cluster, members) in enumerate(clusters):
        cluster.id = "Nmer_%s" % (i + 1)
        motifs.append(cluster)

    refined_motifs = refine_by_scanning(motifs, fastafile)
    for i, m in enumerate(refined_motifs):
        m.id = "WannaMotif_%s" % (i + 1)

    return refined_motifs, "", ""
) return motif.id, p else: return motif.id, 1.0 if not options.fastafile and not options.pwmfile: parser.print_help() sys.exit() fastafile = options.fastafile pwmfile = options.pwmfile lwidth = options.width if not lwidth: f = Fasta(fastafile) lwidth = len(f.items()[0][1]) f = None job_server = pp.Server(secret="pumpkinrisotto") jobs = [] motifs = pwmfile_to_motifs(pwmfile) ids = [motif.id for motif in motifs] if options.ids: ids = options.ids.split(",") for motif in motifs: if motif.id in ids: outfile = os.path.join("%s_histogram" % motif.id) jobs.append(job_server.submit(motif_localization, (fastafile, motif, lwidth, outfile, options.cutoff), (), ()))
# Scan every sequence of the input FASTA file with every motif and print the
# matches as tab-separated lines when BED output is requested.
if not (options.inputfile and (options.pwmfile or options.mdmodulefile)):
    parser.print_help()
    sys.exit(0)

inputfile = options.inputfile

# NOTE(review): nreport is only bound when the option is set; presumably the
# option parser supplies a default - confirm, otherwise the scan below fails.
if options.nreport:
    nreport = int(options.nreport)
cutoff = float(options.cutoff)

# NOTE(review): the check above also accepts --mdmodulefile without a pwmfile,
# but only options.pwmfile is read here - confirm.
motifs = pwmfile_to_motifs(options.pwmfile)

bed = options.bed

f = Fasta(inputfile)
strandmap = {-1:"-",1:"+"}

for (seq_id, seq) in f.items():
    # Upper-case once per sequence instead of once per motif.
    seq_upper = seq.upper()
    for motif in motifs:
        pwm = motif.pwm
        min_score = motif.pwm_min_score()
        # Absolute score threshold: `cutoff` is a fraction of the min..max range.
        c = min_score + (motif.pwm_max_score() - min_score) * cutoff
        result = pwmscan(seq_upper, pwm, c, nreport)
        for (score, pos, strand) in result:
            if bed:
                first = seq_id.split(" ")[0]
                # partition() instead of split(): an id without ":" no longer
                # raises on unpacking but falls through to the plain output.
                chrom, _, loc = first.partition(":")
                if loc:
                    # id looks like "chrom:start-end": report genomic coordinates
                    (start, end) = map(int, loc.split("-"))
                    print("%s\t%s\t%s\t%s" % (chrom, start + pos, start + pos + len(pwm) , score))
                else:
                    # No location in the id: report positions within the sequence
                    print("%s\t%s\t%s\t%s" % (seq_id, pos, pos + len(pwm), score))
def _download_gene_annotation(genomebuild, pred, gene_file):
    """Download the first available gene table for genomebuild and convert it to gene_file."""
    listing = urllib2.urlopen(UCSC_GENE_URL.format(genomebuild))
    try:
        p = re.compile(r'\w+.Gene.txt.gz')
        anno = []
        for line in listing.readlines():
            m = p.search(line)
            if m:
                anno.append(m.group(0))
    finally:
        listing.close()

    sys.stderr.write("Retrieving gene annotation for {}\n".format(genomebuild))
    # ANNOS is walked in order; the first table present on the server wins.
    url = ""
    for a in ANNOS:
        if a in anno:
            url = UCSC_GENE_URL.format(genomebuild) + a
            break

    if not url:
        sys.stderr.write("No annotation found!")
        return

    # delete=False because the download and the shell pipeline access the file
    # by name; it is removed explicitly once the conversion is done.
    tmp = NamedTemporaryFile(delete=False, suffix=".gz")
    tmp.close()
    try:
        urllib.urlretrieve(url, tmp.name)
        sp.call("zcat {} | cut -f2-11 | {} /dev/stdin {}".format(tmp.name, pred, gene_file), shell=True)
    finally:
        os.unlink(tmp.name)


def _download_genome(genomebuild, genome_dir):
    """Try each genome URL in turn; return the path of the first valid download, else None."""
    for genome_url in [UCSC_GENOME_URL, ALT_UCSC_GENOME_URL]:
        remote = genome_url.format(genomebuild)
        genome_fa = os.path.join(genome_dir, os.path.split(remote)[-1])

        sys.stderr.write("Trying to download {}\n".format(remote))
        urllib.urlretrieve(remote, genome_fa)

        if check_genome_file(genome_fa):
            return genome_fa
    return None


def _unpack_genome(genome_fa, genome_dir):
    """Unpack the downloaded archive into genome_dir and remove the archive."""
    # The commands run from the current directory (no cwd=genome_dir), so
    # relative genome_dir / genome_fa paths keep resolving correctly.
    if genome_fa.endswith("tar.gz"):
        # Only remove the tarball after a successful extraction.
        if sp.call(["tar", "-C", genome_dir, "-xvzf", genome_fa]) == 0:
            os.unlink(genome_fa)
    else:
        # gunzip replaces the .gz file itself; no separate removal is needed.
        sp.call(["gunzip", genome_fa])


def _split_single_fasta(genome_dir):
    """If genome_dir holds exactly one .fa file, split it into one file per sequence."""
    fa_files = glob("{}/*.fa".format(genome_dir))
    if len(fa_files) != 1:
        return

    combined = fa_files[0]
    written = set()
    for n, s in Fasta(combined).items():
        out_name = "{}/{}.fa".format(genome_dir, n)
        with open(out_name, "w") as out:
            out.write(">{}\n{}\n".format(n, s))
        written.add(os.path.normpath(out_name))

    # Do not delete the source file if a sequence was just written to that
    # very path (a single-sequence FASTA named after its sequence).
    if os.path.normpath(combined) not in written:
        os.unlink(combined)


def genome(args):
    """Download a genome and its gene annotation, then index the genome.

    Parameters
    ----------
    args : argparse object
        Command line arguments; ``indexdir``, ``fastadir`` and ``genomebuild``
        are read.

    The gene annotation is converted with ``genePredToBed`` and written to
    ``<gene dir>/<genomebuild>.bed``; the genome FASTA files end up in
    ``<fastadir>/<genomebuild>`` and the index in ``<indexdir>/<genomebuild>``.
    The process exits with status 1 when a required directory or executable is
    missing, the genome directory cannot be created, or no valid genome could
    be downloaded.
    """
    config = MotifConfig()

    if not os.path.exists(args.indexdir):
        print("Index_dir %s does not exist!" % (args.indexdir))
        sys.exit(1)

    if not os.path.exists(args.fastadir):
        print("FASTA dir %s does not exist!" % (args.fastadir))
        sys.exit(1)

    pred_bin = "genePredToBed"
    pred = find_executable(pred_bin)
    if not pred:
        sys.stderr.write("{} not found in path!\n".format(pred_bin))
        sys.exit(1)

    fastadir = args.fastadir
    genomebuild = args.genomebuild
    genome_dir = os.path.join(fastadir, genomebuild)
    index_dir = os.path.join(args.indexdir, args.genomebuild)

    # Check for rights to write to directory
    if not os.path.exists(genome_dir):
        try:
            os.mkdir(genome_dir)
        except OSError:
            sys.stderr.write("Could not create genome dir {}\n".format(genome_dir))
            sys.exit(1)

    # Download gene file based on URL + genomebuild
    gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genomebuild)
    _download_gene_annotation(genomebuild, pred, gene_file)

    # download genome based on URL + genomebuild
    sys.stderr.write("Downloading {} genome\n".format(genomebuild))
    genome_fa = _download_genome(genomebuild, genome_dir)
    if genome_fa is None:
        sys.stderr.write("Failed to download genome\n")
        sys.exit(1)

    sys.stderr.write("Unpacking\n")
    _unpack_genome(genome_fa, genome_dir)
    _split_single_fasta(genome_dir)

    sys.stderr.write("Creating index\n")
    GenomeIndex().create_index(genome_dir, index_dir)