def draw_plot(motiffile):
    """Plot a histogram of motif hit positions for peaks versus control.

    The JASPAR PFM in *motiffile* is scanned (through ``search_motif``)
    against every FASTA record in ``sys.argv[2]`` (drawn as a green
    histogram) and ``sys.argv[3]`` (drawn as a red line).  Records for which
    ``search_motif`` returns ``None`` are skipped.  The figure is saved as
    ``<motif file basename up to the first dot>.png`` in the current
    directory.
    """
    # Close the matrix file as soon as it is parsed; the handle used to leak.
    with open(motiffile) as handle:
        mf = Motif.read(handle, 'jaspar-pfm')

    hits_per_file = []
    for fasta_path in (sys.argv[2], sys.argv[3]):
        hits = []
        for record in SeqIO.parse(fasta_path, 'fasta'):
            hit = search_motif(mf, record.seq)
            # Identity test: ``== None`` would be ambiguous for array hits.
            if hit is not None:
                hits.append(hit)
        hits_per_file.append(hits)
    count, control = hits_per_file

    # assume the sequence length is 201, center base +/- 100bp
    edges = np.linspace(-100, 100, 101)
    pylab.figure()
    pylab.hist(count, edges, color='g')
    # Only the per-bin counts are needed; the returned edges are discarded
    # (the old local name ``bin`` shadowed the builtin).
    num, _ = np.histogram(control, edges)
    pylab.plot(np.linspace(-100, 100, 100), num, color='r')
    pylab.xlabel('Distance relative to Stat5 motif')
    pylab.ylabel('No. Stat5 peaks')

    stem = os.path.basename(motiffile).split('.')[0]
    pylab.title(stem)
    pylab.savefig(stem + '.png')
def __call__(self, fasta):
    """Run MEME on *fasta* and return the predicted sites.

    The MEME results are stored on ``self`` (``meme_cmd_args``,
    ``stdoutdata``, ``starts``, ``Zs``, ``thetas``, ``lambdas``) and
    ``meme.txt`` in ``self.options.output_dir`` is parsed afterwards.

    Returns a list of ``(sequence_name, Interval, is_reverse_strand)``
    tuples, one per motif instance.
    """
    start_time = time.time()
    ensure_dir_exists(self.options.output_dir)
    predictions = []

    # run MEME
    (self.meme_cmd_args, self.stdoutdata, self.starts, self.Zs,
     self.thetas, self.lambdas) = run_meme(fasta, self.options)

    # parse output
    from Bio import Motif
    meme_path = os.path.join(self.options.output_dir, 'meme.txt')
    # Motif.parse is consumed inside the ``with`` so the handle is still
    # open while iterating and is closed afterwards (it used to leak).
    with open(meme_path) as handle:
        for motif in Motif.parse(handle, "MEME"):
            for instance in motif.instances:
                # MEME parser seems to count from 1, not 0
                start = instance.start - 1
                predictions.append((
                    instance.sequence_name,
                    Interval(start, start + motif.length),
                    instance.strand == '-',
                ))

    logger.info('MEME took %.1f seconds', time.time() - start_time)
    return predictions
def readPwmFile(pwmFileName, outputLocation, pseudocounts=0.0):
    """Reads a PWM file in Jaspar format and returns a Biopython PWM object.

    A copy of the matrix with *pseudocounts* added to every cell is written
    to a temporary file, parsed with ``Motif.read(..., "jaspar-pfm")`` and
    then deleted.

    Keyword arguments:
    pwmFileName -- The path + name of the PWM file.
    outputLocation -- Path prefix for the temporary pseudocounted PWM; it is
                      concatenated directly with the file name, so it should
                      end with a path separator.
    pseudocounts -- Amount of pseudocounts to add in each matrix cell.
                    (default 0.0)

    Returns:
    pwm -- Biopython PWM object.
    """
    tempFileName = outputLocation + pwmFileName.split("/")[-1] + "temp"

    # Adding pseudocounts
    with open(pwmFileName, "r") as pwmFile:
        with open(tempFileName, "w") as pwmFileT:
            for line in pwmFile:
                # split() tolerates tabs and repeated spaces; blank lines are
                # skipped instead of failing on float('').
                cells = line.split()
                if not cells:
                    continue
                pwmFileT.write(
                    " ".join(str(float(e) + pseudocounts) for e in cells)
                    + "\n")

    # Creating PWM from pseudocounted input
    try:
        with open(tempFileName, "r") as pwmFile:
            pwm = Motif.read(pwmFile, "jaspar-pfm")
    finally:
        # os.remove replaces the shell call ``rm <name>``: no quoting
        # problems, and a failed delete is no longer silent.
        os.remove(tempFileName)
    return pwm
def makePWMscorefiles(fastafiles, pwmfiles, destdir, both_strands=True): for fastaf in fastafile: ### seqence only needed for length here. MOODS does this parsing again later but without reporting length. thisseqname = fastaf.split('/')[-1].split('.')[0] thisseq = Bio.SeqIO.read(fastaf, "fasta", alphabet=IUPAC.unambiguous_dna) #thisseq = Bio.SeqIO.parse(thisseqname, "fasta", alphabet=IUPAC.unambiguous_dna) print 'Doing sequence ', thisseqname, 'length=', len(thisseq) for pwmf in pwmfiles: thispwmname = pwmf.split('/')[-1] thispwm = Motif.read(open(pwmf), "jaspar-pfm") print ' Doing scanPWM one strands for pwm ', thispwmname, ', length=', len(thispwm[0]), datetime.now() onestrandsindexvector = thispwm.scanPWM(thisseq.seq) x = onestrandsindexvector[0:len(thispwm)-1].copy() # adding missing bp-values on the end to get the same length as seq. x[:]=np.NAN onestrandsindexvector=np.append(onestrandsindexvector, x) onestrandsindexvector = np.array([onestrandsindexvector]) # takes long time. print ' bp with nan score is ', np.isnan(onestrandsindexvector).sum(), ' expected ', (len(thispwm)-1) print ' finding best score per bp, ', datetime.now() bestscorevector = getMaxPWMScore( onestrandsindexvector, len(thispwm)) print ' writing wiggle for score per start index.', datetime.now() vegardswritewiggle(onestrandsindexvector[0,], name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'start_index_score/'+ thispwmname) print ' writing wiggle for bestscore. ', datetime.now() vegardswritewiggle(bestscorevector, name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'best_score_in_window/'+thispwmname)
def findPFM(jobID, motifObjList, wordObjDict, numMotifs, outputDir):
    """
    find the PFM(Position Frequency Matrix) for the top motifs

    For every motif object the words of the motif are written, each repeated
    ``wordObjDict[word].O`` times, to a scratch file in JASPAR "sites"
    format (see
    https://github.com/biopython/biopython/blob/master/Doc/cookbook/motif/Arnt.sites).
    The scratch file is read back with ``Motif.read(..., 'jaspar-sites')``
    and the per-letter counts are appended to ``<jobID>_PFM``, which is
    finally moved into *outputDir*.

    numMotifs is accepted but not used here.  Returns None.
    """
    alphaList = ['A', 'C', 'G', 'T']
    siteFileName = ''.join([jobID, '_jasparWordFile'])
    pfmFileName = ''.join([jobID, '_PFM'])
    counter = 1  # running site number across all motifs

    with open(pfmFileName, 'wb') as pfmFile:
        for motifObj in motifObjList:
            seedWord = motifObj.seedWord
            # write the words (the scratch file is overwritten per motif)
            with open(siteFileName, 'wb') as siteFile:
                for word in motifObj.wordList:
                    wordCount = wordObjDict[word].O
                    for _ in range(int(wordCount)):
                        siteFile.write(
                            '>site ' + str(counter) + '\n' + word + '\n')
                        counter += 1
            with open(siteFileName) as siteHandle:
                srf = Motif.read(siteHandle, 'jaspar-sites')
            srf.make_counts_from_instances()
            pfmFile.write('\n>' + seedWord + '\n')
            for alpha in alphaList:
                pfmFile.write(alpha + ' ' + str(srf.counts[alpha]) + '\n')

    # The PFM file is closed (and flushed) before it is moved; it used to be
    # moved while still open, which can lose buffered data.
    shutil.move(pfmFileName, outputDir)
    # The scratch file only exists if at least one motif was processed.
    if os.path.exists(siteFileName):
        os.remove(siteFileName)
    return
def yield_motifs(path='/home/will/LTRtfAnalysis/Jaspar_PWMs.txt'):
    """Yield ``(name, motif)`` pairs from a JASPAR matrix file plus 'coup2'.

    *path* is a file in which ``>`` header lines alternate with matrix
    lines; it defaults to the location that used to be hard-coded.  The
    motif name is the lower-cased last whitespace-separated field of the
    header.  Every motif is yielded twice: as ``name`` and, reverse
    complemented, as ``name + '-R'``.  A built-in 'coup2' matrix is yielded
    last, in the same two forms.
    """
    is_header = methodcaller('startswith', '>')
    with open(path) as handle:
        for key, lines in groupby(handle, is_header):
            if key:
                # next() instead of the Python-2-only ``.next()`` method.
                name = next(lines).strip().split()[-1].lower()
            else:
                tmp = ''.join(lines)
                mot = Motif.read(StringIO(tmp), 'jaspar-pfm')
                yield name, mot
                yield name + '-R', mot.reverse_complement()

    # NOTE(review): the A row below has 13 counts while C, G and T have 16;
    # verify this matrix against its source.
    tmp = u"""A 0 0 6 1 0 0 0 4 2 2 0 0 3
C 1 1 1 0 5 6 4 1 0 0 0 3 5 5 4 0
G 0 6 0 1 1 0 0 0 0 7 1 1 0 0 1 0
T 6 0 0 0 1 1 3 5 7 0 0 0 0 2 2 4"""
    mot = Motif.read(StringIO(tmp), 'jaspar-pfm')
    yield 'coup2', mot
    yield 'coup2-R', mot.reverse_complement()
def yield_motifs():
    """Generate ``(name, motif)`` pairs from the LTR JASPAR matrix file.

    Header lines start with ``>``; the lower-cased last field of a header
    names the matrix that follows.  Each matrix is yielded as ``name`` and,
    reverse complemented, as ``name + '-R'``.
    """
    is_header = methodcaller('startswith', '>')
    with open('/home/will/LTRtfAnalysis/Jaspar_PWMs.txt') as handle:
        for header_group, block in groupby(handle, is_header):
            if header_group:
                name = next(block).strip().split()[-1].lower()
                continue
            mot = Motif.read(StringIO(''.join(block)), 'jaspar-pfm')
            yield name, mot
            yield name + '-R', mot.reverse_complement()
def yield_motifs():
    """Generate ``(name, motif)`` pairs from ``matrix_only.txt``.

    Header lines start with ``>``; the lower-cased last field of a header
    names the matrix that follows.  Each matrix is yielded as ``name`` and,
    reverse complemented, as ``name + '-R'``.
    """
    motifdir = '/home/will/Tip60Data/TFdata/'
    is_header = methodcaller('startswith', '>')
    with open(motifdir + 'matrix_only.txt') as handle:
        for header_group, block in groupby(handle, is_header):
            if header_group:
                name = next(block).strip().split()[-1].lower()
                continue
            mot = Motif.read(StringIO(''.join(block)), 'jaspar-pfm')
            yield name, mot
            yield name + '-R', mot.reverse_complement()
def _compute(self):
    """Return, per position, the best PWM score of any covering window.

    Row ``2*n`` of the work matrix holds the forward-strand start-index
    scores shifted right by ``n`` positions and row ``2*n + 1`` the
    complement-strand scores shifted likewise, for ``n`` below the motif
    length.  The NaN-aware column maximum is returned, so NaN entries are
    ignored wherever a real score exists.
    """
    # Only the motif length is needed; close the file right after parsing
    # (the handle used to leak).
    with open(self._pfmFileName) as pfmFile:
        windowLen = len(Motif.read(pfmFile, "jaspar-pfm"))
    pwmScores = self._pwmScoreArrayStat.getResult()
    complementPwmScores = self._complementPwmScoreArrayStat.getResult()

    ret = np.zeros((windowLen * 2, len(pwmScores)),
                   dtype='float32') + np.float32(np.nan)
    # Shifts larger than the score array would turn the slice end negative
    # and raise a shape mismatch; those rows simply stay NaN.
    for n in range(min(windowLen, len(pwmScores) + 1)):
        ret[2 * n, n:] = pwmScores[0:len(pwmScores) - n]
        ret[2 * n + 1, n:] = \
            complementPwmScores[0:len(complementPwmScores) - n]
    return np.nanmax(ret, axis=0)
def setUp(self):
    """Open the motif fixture files and build a one-instance motif."""
    # Input fixtures, left open for the individual tests to read.
    self.ACin = open("Motif/alignace.out")
    self.MEMEin = open("Motif/meme.out")
    self.PFMin = open("Motif/SRF.pfm")
    self.SITESin = open("Motif/Arnt.sites")

    # Output paths (PFMout uses the same path as FAout).
    self.TFout = "Motif/tf.out"
    self.FAout = "Motif/fa.out"
    self.PFMout = "Motif/fa.out"

    from Bio.Seq import Seq
    self.m = motif = Motif.Motif()
    instance = Seq("ATATA", motif.alphabet)
    motif.add_instance(instance)
def build_motif(seqs):
    """Create motif from sequences

    Each string in *seqs* is added as an instance of a new
    ``unambiguous_dna`` motif.  If any instance cannot be added, a message
    is printed and ``None`` is returned; otherwise the counts are built
    from the instances and the motif is returned.
    """
    m = Motif.Motif(alphabet=IUPAC.unambiguous_dna)
    for seq in seqs:
        try:
            m.add_instance(Seq(seq, m.alphabet))
        except Exception:
            # Was a bare ``except:``, which also swallowed KeyboardInterrupt
            # and SystemExit.
            print("Diff motif size length?")
            return None
    m.make_counts_from_instances()
    return m
def _compute(self):
    """Scan the sequence with the PWM read from ``self._pfmFileName``.

    The motif is reverse complemented first when ``self._complement`` is
    set.  Returns ``None`` when ``scanPWM`` raises ``MemoryError``.
    """
    sequence = self._sequenceStat.getResult().valsAsNumpyArray()
    bioSeq = Seq(sequence.tostring(), alphabet=IUPAC.unambiguous_dna)

    # Close the matrix file after parsing (the handle used to leak).
    with open(self._pfmFileName) as pfmFile:
        thisPwm = Motif.read(pfmFile, "jaspar-pfm")
    if self._complement:
        thisPwm = thisPwm.reverse_complement()

    try:
        # NOTE(review): the visible code never returns or stores this
        # array; confirm whether the function continues past this point.
        pwmScoreArray = thisPwm.scanPWM(bioSeq)
    except MemoryError:
        # when sequence is shorter than pwm
        return
def parse_meme_output_for_sites(meme_output):
    """Parse MEME-like output

    Returns a ``defaultdict`` mapping each sequence name to a
    ``P.IntIntervalSet`` holding one ``P.IntInterval(start, start + len)``
    per motif instance found in the file *meme_output*.
    """
    logging.info('Parsing predictions from %s', meme_output)
    predicted_sites = defaultdict(P.IntIntervalSet)

    # Materialise the motifs inside the ``with`` so the file is closed
    # afterwards (the handle used to leak).
    with open(meme_output) as handle:
        motifs = list(Motif.parse(handle, "MEME"))

    for motif in motifs:
        for instance in motif.instances:
            logging.info('Prediction: sequence = %s; site = %s; pos = %3d',
                         instance.sequence_name, instance, instance.start)
            # NOTE(review): instance.start is used as reported by the
            # parser; confirm whether it is 0- or 1-based.
            predicted_sites[instance.sequence_name].add(
                P.IntInterval(instance.start,
                              instance.start + len(instance)))
    return predicted_sites
def _compute(self):
    """Return, per position, the best PWM score of any covering window.

    Forward and complement start-index scores are stacked in alternating
    rows, each pair shifted right by one more position, and the NaN-aware
    column maximum is returned.
    """
    from Bio import Motif

    motif_width = len(Motif.read(open(self._pfmFileName), "jaspar-pfm"))
    forward = self._pwmScoreArrayStat.getResult()
    reverse = self._complementPwmScoreArrayStat.getResult()

    # Start from an all-NaN float32 matrix with two rows per shift.
    stacked = np.empty((motif_width * 2, len(forward)), dtype='float32')
    stacked.fill(np.nan)
    for shift in range(motif_width):
        row = 2 * shift
        stacked[row, shift:] = forward[:len(forward) - shift]
        stacked[row + 1, shift:] = reverse[:len(reverse) - shift]
    return np.nanmax(stacked, axis=0)
def search_motif(motiflist, seq, col, extend):
    """search motif PWM from sequence list

    motiflist -- path of a JASPAR PFM file.
    seq, col -- foreground and control sequences, walked in lockstep.
    extend -- when non-zero, each hit position ``p`` contributes
              ``abs(extend - p)`` if ``p >= 0`` and ``abs(-extend - p)``
              otherwise to the shift list.

    Returns ``(freq_list, background, len(motif), shift_list)`` where
    ``freq_list`` holds the number of hits (threshold 7.0) per foreground
    sequence and ``background`` the total number of hits in the controls.
    """
    # Close the matrix file after parsing (the handle used to leak).
    with open(motiflist) as handle:
        mf = Motif.read(handle, 'jaspar-pfm')

    freq_list = []
    shift_list = []
    background = 0
    for sequence, control in itertools.izip(seq, col):
        hits = list(mf.search_pwm(sequence, threshold=7.0))
        freq_list.append(len(hits))
        # Distances are only defined for a non-zero ``extend``; previously
        # ``dist`` was undefined (or stale from the last sequence) when
        # ``extend`` was 0.
        if extend != 0:
            shift_list += [
                abs(extend - pos) if pos >= 0 else abs(-1 * extend - pos)
                for pos, _ in hits
            ]
        background += sum(
            1 for _ in mf.search_pwm(control, threshold=7.0))
    return freq_list, background, len(mf), shift_list
def _compute(self):
    """Scan the sequence with the PWM read from ``self._pfmFileName``.

    The reverse-complemented motif is used when ``self._complement`` is
    set.  Returns ``None`` in every visible path.
    """
    from Bio.Alphabet import IUPAC
    from Bio.Seq import Seq
    from Bio import Motif

    raw = self._sequenceStat.getResult().valsAsNumpyArray()
    bioSeq = Seq(raw.tostring(), alphabet=IUPAC.unambiguous_dna)
    pwm = Motif.read(open(self._pfmFileName), "jaspar-pfm")
    pwm = pwm.reverse_complement() if self._complement else pwm
    try:
        pwmScoreArray = pwm.scanPWM(bioSeq)
    except MemoryError:
        # when sequence is shorter than pwm
        return
def _count_by_cutoff(mf, sequences, cutoff):
    """Return (n_above, n_not_above): sequences whose best hit beats cutoff."""
    above = 0
    not_above = 0
    for sequence in sequences:
        scores = [score for _, score in mf.search_pwm(sequence)]
        # Sequences without any hit get a sentinel far below any cutoff.
        best = max(scores) if scores else -1e5
        if best > cutoff:
            above += 1
        else:
            not_above += 1
    return above, not_above


def search_motif(motiflist, seq1, seq2):
    """search pwm for each motif in the motiflist form sequence

    motiflist -- path of a JASPAR PFM file.
    seq1, seq2 -- two collections of sequences to compare.

    Returns a 2x2 integer array: row 0 counts sequences whose best PWM
    score exceeds 0.8 * the motif's maximum score, row 1 the remaining
    sequences; column 0 is *seq1*, column 1 is *seq2*.
    """
    # Close the matrix file after parsing (the handle used to leak).
    with open(motiflist) as handle:
        mf = Motif.read(handle, 'jaspar-pfm')
    cutoff = 0.8 * mf.max_score()

    count_all = np.array([[0, 0], [0, 0]])
    # The seq2 loop used to store its maximum in ``max_score3`` but test the
    # stale ``max_score`` left over from seq1; both sets now go through the
    # same helper.
    count_all[0, 0], count_all[1, 0] = _count_by_cutoff(mf, seq1, cutoff)
    count_all[0, 1], count_all[1, 1] = _count_by_cutoff(mf, seq2, cutoff)
    return count_all
def makePWMscorefiles(fastafiles, pwmfiles, destdir, both_strands=True): for fastaf in fastafile: ### seqence only needed for length here. MOODS does this parsing again later but without reporting length. thisseqname = fastaf.split('/')[-1].split('.')[0] thisseq = Bio.SeqIO.read(fastaf, "fasta", alphabet=IUPAC.unambiguous_dna) #thisseq = Bio.SeqIO.parse(thisseqname, "fasta", alphabet=IUPAC.unambiguous_dna) print 'Doing sequence ', thisseqname, 'length=', len(thisseq) for pwmf in pwmfiles: thispwmname = pwmf.split('/')[-1] thispwm = Motif.read(open(pwmf), "jaspar-pfm") print ' Doing scanPWM one strands for pwm ', thispwmname, ', length=', len( thispwm[0]), datetime.now() onestrandsindexvector = thispwm.scanPWM(thisseq.seq) x = onestrandsindexvector[0:len(thispwm) - 1].copy( ) # adding missing bp-values on the end to get the same length as seq. x[:] = np.NAN onestrandsindexvector = np.append(onestrandsindexvector, x) onestrandsindexvector = np.array([onestrandsindexvector ]) # takes long time. print ' bp with nan score is ', np.isnan( onestrandsindexvector).sum(), ' expected ', (len(thispwm) - 1) print ' finding best score per bp, ', datetime.now() bestscorevector = getMaxPWMScore(onestrandsindexvector, len(thispwm)) print ' writing wiggle for score per start index.', datetime.now() vegardswritewiggle(onestrandsindexvector[0, ], name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'start_index_score/' + thispwmname) print ' writing wiggle for bestscore. ', datetime.now() vegardswritewiggle(bestscorevector, name=thispwmname, chr=thisseqname, destpath=destdir + '/' + 'best_score_in_window/' + thispwmname)
def get_pwm_from_clustalw(clustalw_fname):
    """
    Build a motif from the sequences of a CLUSTALW alignment file.

    Raises ``Exception`` if *clustalw_fname* is not an existing file.
    Returns the tuple ``(pwm, motif_obj)``.
    """
    from Bio import Motif
    from Bio.Alphabet import IUPAC
    import Bio.Seq as bio_seq
    import Bio.AlignIO as align_io

    # Load CLUSTALW file
    if not os.path.isfile(clustalw_fname):
        raise Exception("CLUSTALW file %s does not exist" % (clustalw_fname))
    alignment = align_io.read(clustalw_fname, "clustal")

    # One motif instance per aligned sequence
    motif_obj = Motif.Motif(alphabet=IUPAC.unambiguous_dna)
    for record in alignment.get_all_seqs():
        motif_obj.add_instance(
            bio_seq.Seq(str(record.seq), IUPAC.unambiguous_dna))

    # Compute PWM
    return motif_obj.pwm(), motif_obj
import sys
import random
import os
import numpy as np
from Bio import SeqIO
from Bio import Motif
from scipy.stats import fisher_exact

# For every matrix named in logolist.txt: load it from ``loc``, transpose
# it, save the transposed integer matrix under the same name in the current
# directory, read that file as a JASPAR PFM and render ``<name>.png``.
loc = '/compbio/data/motif/human-mouse/'
with open('logolist.txt') as ifp:  # closed when the loop ends
    for line in ifp:
        name = line.rstrip()
        if not name:
            # A blank line would make np.loadtxt try to read the directory.
            continue
        mat = np.loadtxt(loc + name)
        pfm = np.transpose(mat)
        np.savetxt(name, pfm, fmt='%d')
        with open(name) as pfm_handle:
            mymotif = Motif.read(pfm_handle, 'jaspar-pfm')
        mymotif.weblogo(name + '.png')
def test_pfm_parsing(self):
    """Check that a pfm file loaded via ``from_jaspar_pfm`` has length 12.
    """
    parsed = Motif.Motif()
    parsed.from_jaspar_pfm(self.PFMin)
    observed_length = parsed.length
    assert observed_length == 12
def test_alignace_parsing(self):
    """Check that the AlignAce parser finds 16 motifs in the fixture.
    """
    record = Motif.AlignAceParser().parse(self.ACin)
    motif_count = len(record.motifs)
    assert motif_count == 16
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'
#fastafile='/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr21.fa'
#### test of Biopython's motif package with scanPWM
#matrix = MOODS.load_matrix('/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest_virker.pfm') #9
import Bio.SeqIO  # makes the ``Bio.SeqIO.parse`` call below resolvable
from Bio import Motif
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from datetime import datetime

# Let's create an instance of the E2F1 motif (downloaded from the
# jaspar database):
testpwm = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest_virker.pfm'
with open(testpwm) as pfm_handle:  # closed after parsing
    motif = Motif.read(pfm_handle, "jaspar-pfm")

# the format method displays the motif in a variety of formats:
print(motif.format('transfac'))

# The first path is immediately overridden by the second.
fastafile = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'
fastafile = '/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr1.fa'
with open(fastafile, "r") as handle:
    records = list(Bio.SeqIO.parse(handle, "fasta",
                                   alphabet=IUPAC.unambiguous_dna))
thisseq = records[0].seq

# Print wall-clock time before and after the scan.
print(datetime.now())
hits = motif.scanPWM(thisseq)
print(datetime.now())
def test_pfm_parsing(self):
    """Check that ``Motif.read`` parses the pfm fixture to length 12.
    """
    parsed = Motif.read(self.PFMin, "jaspar-pfm")
    observed_length = parsed.length
    assert observed_length == 12
def __init__(self, regions, genome_fasta, jaspar_file=None,
             jaspar_thresh=9999, annotations=None, motif_positions=None,
             method='motility'):
    """
    Adds motif tracks to BasePrinter, using motility and a file containing
    a JASPAR-format definition of a motif.

    :param regions: An iterable of pybedtools.Interval objects
    :param genome_fasta: FASTA file from which sequences for `regions`
        will be extracted
    :param jaspar_file: A file in JASPAR format holding exactly one motif.
        Although the default is None, the file is read unconditionally.
    :param jaspar_thresh: Score threshold below which motifs will be
        ignored.  With method "biopython" it is replaced by the PATSER
        threshold of the motif's score distribution.
    :param annotations: Optional dict; each value is turned into a saved
        pybedtools.BedTool stored under the same key.
    :param motif_positions: If this is a list of integer indexes, these
        positions will be converted to uppercase.
    :param method: "motility" or "biopython"
    """
    super(MotifPrinter, self).__init__(regions=regions,
                                       genome_fasta=genome_fasta)
    import os
    import tempfile

    import motility
    assert method in ['biopython', 'motility']
    self.method = method

    pwm = list(helpers.pwm_from_jaspar(jaspar_file))
    assert len(pwm) == 1
    self.pwm = motility.PWM(pwm[0][1])

    # Bio.Motif gets a stripped copy of the matrix: header lines are
    # dropped and the characters "[]ATCG" removed.  The copy goes to a
    # unique temporary file that is deleted afterwards, instead of a fixed
    # file named 'tmp' in the working directory that was never cleaned up.
    tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
    try:
        with tmp:
            with open(jaspar_file) as source:
                for line in source:
                    if line.startswith('>'):
                        continue
                    for i in '[]ATCG':
                        line = line.replace(i, '')
                    tmp.write(line)
        with open(tmp.name) as stripped:
            self.motif = Motif.read(stripped, 'jaspar-pfm')
    finally:
        os.remove(tmp.name)

    if method == 'biopython':
        sd = Motif.ScoreDistribution(self.motif)
        jaspar_thresh = sd.threshold_patser()
    self.jaspar_thresh = jaspar_thresh

    if motif_positions is None:
        motif_positions = []
    self.motif_positions = motif_positions

    self._annotations = {}
    if annotations:
        for k, v in annotations.items():
            self._annotations[k] = pybedtools.BedTool(v).saveas()

    self.trackfuncs.append(self.motifs)
    self.trackfuncs.append(self.annotations)
    self.intervals = []
##########testing .......
#calculate_both_strands=True
#outputdir = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/tempoutput'
#fastafile=['/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr1.fa']
#pwmfiles=['/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm']
#fastaf = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'
fastaf = '/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr1.fa'
pwmf = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm'
# NOTE(review): ``outputdir`` is only assigned in a commented-out line in
# this fragment; it must already be defined elsewhere.
destdir = outputdir
jaspar_file = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/JASPAR/all_data/matrix_only/matrix_only.txt'
with open(jaspar_file) as jaspar_handle:  # closed after parsing
    thispwm = Motif.read(jaspar_handle, "jaspar-pfm")
#pwmfiles=['/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm', '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest1.pfm']
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/string_in_ex1_as_fasta.txt'
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/dummychrom.fasta'
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'
#fastafile='/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr21.fa'
#### test of Biopython's motif package with scanPWM
# The message is Norwegian for "done in pwmtest4".
print("ferdig i pwmtest4")
# Score the first FASTA file against the first PWM, then print the time.
makePWMscorefiles(fastafile[0:1], pwmfiles[0:1], outputdir)
print(datetime.now())

##########testing .......
#calculate_both_strands=True
#outputdir = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/tempoutput'
#fastafile=['/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr1.fa']
#pwmfiles=['/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm']
#fastaf = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'
fastaf = '/usit/invitro/hyperbrowser/standardizedTracks/spombe2007/Sequence/DNA/chr1.fa'
#fastaf = ''
#pwmf = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm'
pwmf = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/matrix/JASPAR_CORE_2008/MA0086.pwf'
destdir = outputdir
jaspar_file = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/JASPAR/all_data/matrix_only/matrix_only.txt'
with open(jaspar_file) as jaspar_handle:  # closed after parsing
    thispwm = Motif.read(jaspar_handle, "jaspar-pfm")
#pwmfiles=['/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest2.pfm', '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest1.pfm']
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/string_in_ex1_as_fasta.txt'
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/dummychrom.fasta'
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'
#fastafile='/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr21.fa'
#### test of Biopython's motif package with scanPWM
# The message is Norwegian for "done in pwmtest4".
print("ferdig i pwmtest4")
# Run stempy with motif width fixed at 8 and MEME-like output enabled.
options = stempy.get_default_options()
options.output_dir = os.path.join('output', 'test-meme-like-output')
options.min_w = options.max_w = 8
options.meme_like_output = 'meme.out'
algorithm = stempy.Algorithm(options)
fasta = os.path.join(os.path.dirname(__file__), 'fasta', 'T00759-tiny.fa')
algorithm(fasta)

logging.info('Showing MEME output from %s', algorithm.meme_like_output_file)
# Echo the file from Python instead of ``os.system('cat %s')``: no shell,
# so paths containing spaces or shell metacharacters are handled.
import sys
with open(algorithm.meme_like_output_file) as meme_output:
    sys.stdout.write(meme_output.read())

#
# Test BioPython parser
#
from Bio import Motif
with open(algorithm.meme_like_output_file) as meme_output:
    motifs = list(Motif.parse(meme_output, "MEME"))

#
# Doesn't quite work with pycogent yet. Pycogent expects a summary section
# that contains sites in all the sequences
#
#from cogent import LoadSeqs
#from cogent.parse.meme import MemeParser
#results = MemeParser(open(algorithm.meme_like_output_file, 'U'))
#seqs = LoadSeqs(fasta, aligned=False)
#results.Alignment = seqs
#for motif in results.Motifs:
#    module = motif.Modules[0]
#    print module.ID, module.Evalue, len(module.NamedSeqs)
#
ostr += ' ' + str(count[l]) ostr += '\n' count_dict[name] = ostr # <codecell> print(count_dict['COUP2']) # <codecell> from io import StringIO motif_dict = {} for key, tmp in count_dict.items(): print key, type(tmp) motif_dict[key] = Motif.read(StringIO(tmp), 'jaspar-pfm') tmp = u"""A 0 0 16 5 3 0 16 C 1 0 2 12 0 15 0 G 0 15 0 1 1 3 1 T 17 3 0 0 14 0 1""" motif_dict['AP1'] = Motif.read(StringIO(tmp), 'jaspar-pfm') tmp = u"""A 3 1 4 2 4 2 18 18 0 C 0 1 1 9 2 15 0 0 6 G 0 4 6 2 10 0 0 0 2 T 15 12 7 5 2 1 0 0 10""" motif_dict['CEBP'] = Motif.read(StringIO(tmp), 'jaspar-pfm')
def test_sites_parsing(self):
    """Check that ``Motif.read`` parses the sites fixture to length 6.
    """
    parsed = Motif.read(self.SITESin, "jaspar-sites")
    observed_length = parsed.length
    assert observed_length == 6
def test_pfm_parsing(self):
    """Check that ``Motif.read`` parses the pfm fixture to length 12.
    """
    parsed = Motif.read(self.PFMin, "jaspar-pfm")
    observed_length = parsed.length
    assert observed_length == 12
def __init__(self, type, score_type, upstr=None, invariant=None,
             score_dict={}, bounds=None, filter_score=None,
             note_str_func=default_note_str,
             # mutant_check= _mutant_check
             **attribs):
    """Set up a motif type and register it in ``motif_types``.

    type -- key under which the instance is stored in ``motif_types``; for
            'max_ent' it also selects the scoring program and for 'pfm' it
            is shown in the progress message.
    score_type -- 'acora', 'tst', 'max_ent' or 'pfm'; selects how
            ``score_dict`` is interpreted and which method becomes
            ``self.score``.
    upstr, bounds -- stored unchanged.
    invariant -- stored unchanged; None (the default) becomes a fresh
            empty list per instance.
    score_dict -- path of a two-column file ('acora', 'tst') or a glob
            pattern for ``.pfm`` files ('pfm').  The default dict is never
            mutated here.
    filter_score -- number: keep scores greater than it; tuple
            ``(low, high)``: keep scores below ``low`` or above ``high``;
            None: keep everything.  Stored as a predicate.
    note_str_func -- stored as ``self.note_str``.
    attribs -- extra keyword arguments, stored as ``self.attribs``.

    Raises NeedScoreDictFileException when score_type is 'acora' and
    score_dict is not a string.
    """
    self.type = type
    self.upstr = upstr
    # A fresh list per instance; the former ``[]`` default was shared by
    # every instance created without this argument.
    self.invariant = [] if invariant is None else invariant
    self.filter_score = filter_score
    self.note_str = note_str_func
    self.bounds = bounds
    self.score_type = score_type
    # self.mutant_check = mutant_check

    #------------------------------------------------
    # FIRST: figure out what score type this motif is.
    #------------------------------------------------
    # aho-corasick search tree
    if self.score_type == 'acora':
        # we need a score dict from a file and a separate acora object
        if not isinstance(score_dict, str):
            raise NeedScoreDictFileException
        with open(score_dict) as records:
            self.score_dict = dict(record.split() for record in records)
        self.acora_tree = acora.AcoraBuilder(self.score_dict.keys()).build()
        self.score = self.acora

    # ternary search tree
    elif score_type == 'tst':
        # make our file of nmers and scores a tst object; an explicit loop
        # replaces map() that was called only for its side effects
        self.score_dict = tst.TST()
        with open(score_dict) as records:
            for record in records:
                self.score_dict.put(*record.split())
        self.score = self.tst

    # max ent nmer score
    elif score_type == 'max_ent':
        self.score_dict = {}
        self.score = self.max_ent
        # open up an 'interactive' pipe to the maxent software for this
        # motif
        programs = {'me_splice_donor': 'score5',
                    'me_splice_acceptor': 'score3'}
        self.command = cfg.programTemplate.substitute(
            path=cfg.maxEntPath, program=programs[self.type])

    # position frequency matrix list
    elif score_type == 'pfm':
        print("Loading position frequency matrices from {}...".format(self.type))
        pfm_glob = glob.glob(score_dict)
        name_pattern = re.compile(r'.*/(\w+).pfm')
        self.score_dict = {}
        for motif_file in pfm_glob:
            with open(motif_file) as motif_handle:
                motif_obj = Motif.read(motif_handle, 'jaspar-pfm')
            motif_name = name_pattern.match(motif_file).group(1)
            print("\t{}...".format(motif_name))
            motif_obj.name = motif_name
            # Only motifs longer than 7 are kept.
            # NOTE(review): the score distribution and threshold are
            # computed only for kept motifs here; confirm that matches the
            # original nesting.
            if len(motif_obj) > 7:
                self.score_dict[motif_name] = motif_obj
                motif_obj.sd = \
                    ScoreDistribution(motif_obj, precision=10 ** 3)
                # low false-positive rate to make sure motifs are real
                motif_obj.thresh = max(1, motif_obj.sd.threshold_fpr(0.01))
        self.score = self.pfm
        print("Motif matrices done.")

    #------------------------------------------------
    # SECOND: parse filter score information.
    #------------------------------------------------
    # Turn filter_score into a predicate that returns true if the score
    # should be kept and false if it should not.  The current value is
    # bound as a lambda default so later rebinding cannot affect it.
    if isinstance(self.filter_score, (float, int)):
        self.filter_score = \
            lambda val, lower=self.filter_score: val > lower
    elif isinstance(self.filter_score, tuple):
        # keeps values OUTSIDE the closed range (low, high)
        self.filter_score = \
            lambda val, minmax=self.filter_score: \
            val < minmax[0] or val > minmax[1]
    # if filter_score is none, always return true
    elif self.filter_score is None:
        self.filter_score = lambda val: True

    #------------------------------------------------
    # THIRD: add motif to motif type dict and cleanup
    #------------------------------------------------
    motif_types[self.type] = self
    self.attribs = attribs
#!/usr/bin/env python
# counts all dinucleotides in a DNA fasta file
import sys
from Bio.Seq import Seq
from Bio import SeqIO
from Bio import Motif
from Bio.Alphabet import IUPAC

fastafile = sys.argv[1]


def _single_word_motif(word):
    """Return an unambiguous-DNA motif holding *word* as its only instance."""
    motif = Motif.Motif(alphabet=IUPAC.unambiguous_dna)
    motif.add_instance(Seq(word, motif.alphabet))
    return motif


# One single-instance motif per dinucleotide, created in the original order.
AA = _single_word_motif("AA")
CA = _single_word_motif("CA")
GA = _single_word_motif("GA")
TA = _single_word_motif("TA")
AC = _single_word_motif("AC")
CC = _single_word_motif("CC")
GC = _single_word_motif("GC")
TC = _single_word_motif("TC")
AG = _single_word_motif("AG")
CG = _single_word_motif("CG")
GG = _single_word_motif("GG")
#fastafile='/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'
#fastafile='/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr21.fa'
#### test of Biopython's motif package with scanPWM
#matrix = MOODS.load_matrix('/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest_virker.pfm') #9
import Bio.SeqIO  # makes the ``Bio.SeqIO.parse`` call below resolvable
from Bio import Motif
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from datetime import datetime

# Let's create an instance of the E2F1 motif (downloaded from the
# jaspar database):
testpwm = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/vegardtest_virker.pfm'
with open(testpwm) as pfm_handle:  # closed after parsing
    motif = Motif.read(pfm_handle, "jaspar-pfm")

# the format method displays the motif in a variety of formats:
print(motif.format('transfac'))

# The first path is immediately overridden by the second.
fastafile = '/xanadu/home/vegardny/prosjekter/hyperbrowser/pwm_vs_snp/vegard_debug_MOODS/examples/data/sequence/dnaRAND.txt'
fastafile = '/usit/invitro/hyperbrowser/standardizedTracks/hg19/Sequence/DNA/chr1.fa'
with open(fastafile, "r") as handle:
    records = list(Bio.SeqIO.parse(handle, "fasta",
                                   alphabet=IUPAC.unambiguous_dna))
thisseq = records[0].seq

# Print wall-clock time before and after the scan.
print(datetime.now())
hits = motif.scanPWM(thisseq)
print(datetime.now())
def bitScoreMM(pwmFileName, genomeDict, mpbsDict, scoringMethod,
               tempLocation, pseudocounts=0.1, bitscore=12.0, fpr=0.01,
               precision=10**4, highCutoff=0.7, functionalDepth=0.9):
    """Performs basic motif matching algorithm and writes the results to a
    dictionary indexed by chromosome.

    Every window of every chromosome is scored on both strands; windows
    scoring above the threshold are appended to ``mpbsDict[chrom]`` as
    ``[start, end, pwmName, scaledScore, strand]``.

    Keyword arguments:
    pwmFileName -- PWM file name.
    genomeDict -- Genome dictionary.
    mpbsDict -- Dictionary of MPBSs to insert the results.
    scoringMethod -- Method to evaluate which MPBSs are enriched:
                     "bitscore", "fpr" or "boyle".
    tempLocation -- Location to write temporary PWM files in order to help
                    PWM creation and pseudocounting.
    pseudocounts -- Amount of pseudocounts to add in each PWM matrix's
                    cell. (default 0.1)
    bitscore -- The cutoff bitscore value. (default 12.0)
    fpr -- False positive rate to determine the cutoff value.
           (default 0.01)
    precision -- Motif score distribution precision. (default 10**4)
    highCutoff -- High cutoff for Boyle's rule. (default 0.7)
    functionalDepth -- Functional depth for Boyle's rule. (default 0.9)

    Returns:
    0 -- results are delivered by appending to mpbsDict.  An unknown
    scoringMethod writes a message to stderr and exits the process.
    """
    # Reading PWM
    pwm = createPwmDict(pwmFileName, pseudocounts)
    pwmName = pwmFileName.split("/")[-1].split(".")[0]
    pwmLen = len(pwm["A"])
    background = math.log(0.25, 2) * pwmLen

    # Evaluating threshold
    pwmThreshold = 0.0
    if scoringMethod == "bitscore":
        pwmThreshold = bitscore
    elif scoringMethod == "fpr":
        bioPwm = biopythonMM.readPwmFile(pwmFileName, tempLocation,
                                         pseudocounts)
        sd = Motif.ScoreDistribution(bioPwm, precision=precision)
        pwmThreshold = sd.threshold_fpr(fpr)
    elif scoringMethod == "boyle":
        maxScore = 0.0
        minScore = 0.0  # TODO Boyle's rule is not suited for negative values.
        for i in range(0, pwmLen):
            maxScore += max(pwm["A"][i], pwm["C"][i],
                            pwm["G"][i], pwm["T"][i])
        maxScore -= background
        pwmThreshold = min(highCutoff * maxScore,
                           functionalDepth * (maxScore - minScore))
    else:
        sys.stderr.write("Choose a valid scoring method.\n")
        # Non-zero status: this is an error exit (it used to report 0).
        sys.exit(1)

    # Creating additional parameters
    chrList = constants.getChromList(reference=[mpbsDict])
    tempMpbsDict = dict((e, []) for e in chrList)
    maxValue = -99.0
    revDict = {"A": "T", "T": "A", "C": "G", "G": "C", "N": "N"}

    # Iterating on chromosomes
    for chrName in chrList:

        # Reading genome
        sequence = genomeDict[chrName].upper()

        # Performing motif matching
        for pos in xrange(0, len(sequence) - pwmLen + 1):
            scoreF = -background
            scoreR = -background
            for i in range(0, pwmLen):
                scoreF += pwm[sequence[pos + i]][i]
                # Reverse strand: walk the window backwards, complemented.
                scoreR += pwm[revDict[sequence[pos + pwmLen - i - 1]]][i]
            if scoreF > pwmThreshold:
                if scoreF > maxValue:
                    maxValue = scoreF
                tempMpbsDict[chrName].append(
                    [pos, pos + pwmLen, pwmName, scoreF, "+"])
            if scoreR > pwmThreshold:
                if scoreR > maxValue:
                    maxValue = scoreR
                tempMpbsDict[chrName].append(
                    [pos, pos + pwmLen, pwmName, scoreR, "-"])

    # Update scores - new scores are within [0,1000]
    for chrName in chrList:
        for e in tempMpbsDict[chrName]:
            mpbsDict[chrName].append([
                e[0], e[1], e[2],
                int(1000 * (e[3] - pwmThreshold) / (maxValue - pwmThreshold)),
                e[4]
            ])

    return 0
def biopythonMM(pwmFileName, genomeDict, mpbsDict, scoringMethod, tempLocation,
                pseudocounts=0.1, bitscore=12.0, fpr=0.01, precision=10**4,
                highCutoff=0.7, functionalDepth=0.9):
    """Performs Biopython based motif matching and writes the results to a
    dictionary indexed by chromosome.

    Hits come from pwm.search_pwm(sequence, threshold=...). A non-negative
    position is recorded as a "+" hit; a negative position is negated and
    recorded as a "-" hit. Scores are rescaled to
    int(1000 * (score - threshold) / (best - threshold)), where best is the
    highest score over all chromosomes.

    Keyword arguments:
    pwmFileName -- PWM file name.
    genomeDict -- Genome dictionary.
    mpbsDict -- Dictionary of MPBSs to insert the results.
    scoringMethod -- Method to evaluate which MPBSs are enriched
                     ("bitscore", "fpr" or "boyle").
    tempLocation -- Location to write temporary PWM files in order to help
                    PWM creation and pseudocounting.
    pseudocounts -- Amount of pseudocounts to add in each PWM matrix's cell. (default 0.1)
    bitscore -- The cutoff bitscore value. (default 12.0)
    fpr -- False positive rate to determine the cutoff value. (default 0.01)
    precision -- Motif score distribution precision. (default 10**4)
    highCutoff -- High cutoff for Boyle's rule. (default 0.7)
    functionalDepth -- Functional depth for Boyle's rule. (default 0.9)

    Returns:
    0 -- Always. Results are appended in place to mpbsDict[chrName] as
         [start, end, pwmName, scaledScore, strand] lists.

    Exits the process with status 1 when scoringMethod is not recognised.
    """

    # Reading PWM
    pwm = readPwmFile(pwmFileName, tempLocation, pseudocounts)
    pwmName = pwmFileName.split("/")[-1].split(".")[0]
    pwmLen = len(pwm)

    # Evaluating threshold
    if scoringMethod == "bitscore":
        pwmThreshold = bitscore
    elif scoringMethod == "fpr":
        sd = Motif.ScoreDistribution(pwm, precision=precision)
        pwmThreshold = sd.threshold_fpr(fpr)
    elif scoringMethod == "boyle":
        maxScore = pwm.max_score()
        minScore = 0.0  # TODO Boyle's rule is not suited for negative values.
        pwmThreshold = min(highCutoff * maxScore,
                           functionalDepth * (maxScore - minScore))
    else:
        sys.stderr.write("Choose a valid scoring method.\n")
        # A rejected argument is a failure; report it with a non-zero status.
        sys.exit(1)

    # Creating aditional parameters
    chrList = constants.getChromList(reference=[mpbsDict])
    tempMpbsDict = dict((e, []) for e in chrList)
    # -inf (rather than an arbitrary low constant) so that the first hit
    # always becomes the running maximum.
    maxValue = float("-inf")

    # Iterating on chromosomes
    for chrName in chrList:

        # Reading genome
        sequence = genomeDict[chrName]
        hits = tempMpbsDict[chrName]

        # Performing biopython's motif matching
        for pos, score in pwm.search_pwm(sequence, threshold=pwmThreshold):
            if score > maxValue:
                maxValue = score
            # NOTE(review): a reverse-strand hit at offset 0 would arrive as
            # -0 == 0 and be recorded as "+"; confirm against search_pwm.
            if pos >= 0:
                hits.append([pos, pos + pwmLen, pwmName, score, "+"])
            else:
                hits.append([-pos, -pos + pwmLen, pwmName, score, "-"])

    # Update scores - new scores are within [0,1000]
    span = maxValue - pwmThreshold
    for chrName in chrList:
        for e in tempMpbsDict[chrName]:
            mpbsDict[chrName].append([
                e[0], e[1], e[2],
                int(1000 * (e[3] - pwmThreshold) / span),
                e[4]
            ])

    return 0
def reverse(sequence):
    """Map every character of sequence through revDict, last character first.

    revDict is defined outside this fragment (presumably a base-complement
    table, which would make the result a reverse complement -- confirm).
    """
    return "".join([revDict[c] for c in sequence[::-1]])


# Fetching sequences
# bedName is the bed file's base name minus its last four characters.
bedName = bedFileName.split("/")[-1][:-4]
seqFileName = outputLocation + bedName + ".txt"
os.system(" ".join(["fastaFromBed", "-fi", fastaFileName, "-fo", seqFileName,
                    "-bed", bedFileName, "-tab"]))

# Reading pwm
with open(pwmFileName, "r") as pwmFile:
    pwm = Motif.read(pwmFile, "jaspar-pfm")
    motif = str(pwm.consensus()).upper()

# Reading input vectors
misVec, misVecSpec = [], []
posVec, posVecSpec = [], []
scoreVec = []
bedFile = open(bedFileName, "r")
seqFile = open(seqFileName, "r")
for bedLine in bedFile:

    # Reading line
    seqLine = seqFile.readline()
mot = Motif.read(StringIO(tmp), 'jaspar-pfm') yield name, mot yield name+'-R', mot.reverse_complement() pwm_dict = {} for num, (name, mot) in enumerate(yield_motifs()): if num % 100 == 0: print num pwm_dict[name] = mot tmp = u"""A 0 0 6 1 0 0 0 4 2 2 0 0 3 C 1 1 1 0 5 6 4 1 0 0 0 3 5 5 4 0 G 0 6 0 1 1 0 0 0 0 7 1 1 0 0 1 0 T 6 0 0 0 1 1 3 5 7 0 0 0 0 2 2 4""" pwm_dict['coup2'] = Motif.read(StringIO(tmp), 'jaspar-pfm') pwm_dict['coup2-R'] = Motif.read(StringIO(tmp), 'jaspar-pfm').reverse_complement() # <codecell> from Bio.Alphabet import IUPAC def score_seq(seq, mot): bseq = Seq(seq, alphabet=IUPAC.unambiguous_dna) scores = mot.scanPWM(bseq) for pos, score in enumerate(scores.flatten(),1): if ~np.isnan(score): tseq = seq[pos:pos+len(mot)] yield pos, tseq, score
def test_sites_parsing(self):
    """Check that Motif.read parses self.SITESin in "jaspar-sites" format
    and that the resulting motif has length 6.
    """
    parsed = Motif.read(self.SITESin, "jaspar-sites")
    assert parsed.length == 6
def motifMatchingBiopython(combinationList,pwmList,coordDict,pwmLocation,genomeList,tempLocation,fpr=0.01,pseudocounts=0.0,precision=10**4,color="black"):
    """Performs Biopython based motif matching and returns a list containing the matches and
       writes the results on bed files.

    Keyword arguments:
    combinationList -- List of the number of cobinding combinations.
    pwmList -- List of PWMs where each entry represents the name of a PWM file.
    coordDict -- Dictionary of coordinates where the motif matching will be applied.
                 Each coordinate is indexed as [start, end, names], names being a
                 ':'-separated string.
    pwmLocation -- Path containing the motif pwm files.
    genomeList -- List of fasta files containing the sequences to perform the motif matching,
                  where the headers are the chromosomes.
    tempLocation -- Location to write temporary PWM files in order to help PWM creation and pseudocounting.
    fpr -- False positive rate to determine the cutoff value. (default 0.01)
    pseudocounts -- Amount of pseudocounts to add in each PWM matrix's cell. (default 0.0)
    precision -- Motif score distribution precision. (default 10**4)
    color -- Color of the bed entries. Can be 'green', 'red' or 'black'; any other
             value is stored unchanged. (default 'black')

    Returns:
    mpbsDict -- Dictionary (for each PWM) of dictionaries (for each chromosome) of motif
                predicted binding sites. Each site is
                [start, end, pwmName, score, strand, start, end, color] with the score
                rescaled to int(1000 * (score - threshold) / (best - threshold)).
    statDict -- Dictionary of statistics for Fisher test concerning the number of motifs
                inside enriched regions: [regions with evidence, regions without].
    geneDict -- Dictionary of genes (position NAME in bed file) that contains each motif.
    """

    # Reading PWM
    pwmTempLocation = "/".join(tempLocation.split("/")[:-1])+"/"
    pwmDict = dict()
    for pwmName in pwmList:
        pwmDict[pwmName] = readPwmFile(pwmLocation+pwmName+".pwm",pwmTempLocation,pseudocounts)

    # Evaluating thresholds
    pwmThresholdDict = dict()
    for pwmName in pwmList:
        sd = Motif.ScoreDistribution(pwmDict[pwmName],precision=precision)
        pwmThresholdDict[pwmName] = sd.threshold_fpr(fpr)

    # Reading genome
    genomeDict = genome.readFastaFiles(genomeList)

    # Creating chromosome list (no chromosome is filtered out here)
    chrList = constants.getChromList(reference=[coordDict])

    # Evaluating bed additionals: named colours become "R,G,B" strings
    color = {"green": "0,130,0", "red": "130,0,0", "black": "0,0,0"}.get(color, color)

    # Create combinations dictionary keys
    combKeys = []
    for c in combinationList:
        for comb in itertools.combinations(pwmList,c):
            combKeys.append(",".join(comb))

    mpbsDict = dict((e,dict()) for e in pwmDict)
    statDict = dict((e,[0,0]) for e in combKeys) # Left is evidence / Right is not evidence
    geneDict = dict((e,[]) for e in combKeys)
    # -inf (rather than an arbitrary low constant) so that the first hit of
    # each PWM always becomes its running maximum.
    maxDict = dict((e,float("-inf")) for e in pwmDict)

    # Iterating on chromosomes
    for chrName in chrList:

        # Reading genome
        if chrName not in genomeDict:
            continue
        sequence = genomeDict[chrName]

        # Iterating on coordinate dictionary
        for e in mpbsDict:
            mpbsDict[e][chrName] = []
        for coord in coordDict[chrName]:
            offset = coord[0]

            # Getting current sequence based on coordinates
            currSeq = sequence[coord[0]:coord[1]]

            # Keeping track of the factors found in this coordinate
            flagMotifs = dict((e,False) for e in pwmDict)

            # Iterating on PWMs
            for pwmName in pwmDict:
                pwmLen = len(pwmDict[pwmName])
                sites = mpbsDict[pwmName][chrName]
                for pos, score in pwmDict[pwmName].search_pwm(currSeq,threshold=pwmThresholdDict[pwmName]):
                    if score > maxDict[pwmName]:
                        maxDict[pwmName] = score
                    # Negative positions are recorded as "-" strand hits.
                    if pos >= 0:
                        start, strand = pos+offset, "+"
                    else:
                        start, strand = -pos+offset, "-"
                    sites.append([start,start+pwmLen,pwmName,score,strand,start,start+pwmLen,color])
                    flagMotifs[pwmName] = True

            # Updating statistic counts and genes
            motifsFoundList = [k for k in pwmList if flagMotifs[k]]
            motifsFoundKeys = set()
            for c in combinationList:
                for comb in itertools.combinations(motifsFoundList,c):
                    motifsFoundKeys.add(",".join(comb))
            genes = coord[2].split(":")
            for k in combKeys:
                if k in motifsFoundKeys:
                    statDict[k][0] += 1
                    geneDict[k].extend(genes)
                else:
                    statDict[k][1] += 1

    # Update scores - new scores are within [0,1000]
    for pwmName in pwmDict:
        pwmThreshold = pwmThresholdDict[pwmName]
        span = maxDict[pwmName]-pwmThreshold
        for chrName in mpbsDict[pwmName]:
            for e in mpbsDict[pwmName][chrName]:
                e[3] = int(1000*(e[3]-pwmThreshold)/span)

    # Remove repetitive genes from geneList
    for k in geneDict:
        geneDict[k] = list(set(geneDict[k]))

    return mpbsDict, statDict, geneDict
# 1. <inputFileName>.png: The logo graphic. ##################################################################################################################### import sys import os import math from Bio import Motif # Reading input nucsPerImage = int(sys.argv[1]) inputFileLocation = sys.argv[2] outputLocation = sys.argv[3] # Reading pfm file inputFile = open(inputFileLocation, "r") pwm = Motif.read(inputFile, "jaspar-pfm") # Writing whole or splited logo if (nucsPerImage <= 0): pwm.weblogo(outputLocation + (inputFileLocation.split("/")[-1].split(".")[0]) + ".png", res=300) else: tempPWM = [[], [], [], []] nucs = ["A", "C", "G", "T"] counter = 0 fileCount = 0 for i in range(0, len(pwm)): for j in range(0, len(nucs)): tempPWM[j].append(pwm.counts[nucs[j]][i]) counter += 1
def procurar(self,sequencia_string):
    '''
    Search sequencia_string for every motif stored under self.diretorio.

    Motifs are the "*.pfm" files whose names end in "<digits>_<digits>.pfm".
    Each one is looked up by that id in matrix_list.txt (tab-separated; the id
    is column 0 and the motif name is the second '_'-separated piece of
    column 2). Only the first file found for a given name is loaded.

    Every hit returned by search_pwm (threshold=10) is assigned a region:
    "1" when its start is within the first self.pontas percent of the
    sequence, "3" when it is within the last self.pontas percent, "2"
    otherwise.

    Returns a tuple:
    (motivos_finais, tamanho_seq, lista_nomes, saida_hmm, classificar)
      motivos_finais -- list of (pos, score, name, region); pos is passed
                        through exactly as search_pwm reported it
      tamanho_seq    -- length of sequencia_string
      lista_nomes    -- names of the loaded motifs
      saida_hmm      -- list of (region, name)
      classificar    -- list of sorted([region, name])
    '''
    # Index the name table by id once instead of rescanning it per file.
    with open(self.diretorio + "matrix_list.txt", 'r') as arquivo_nomes:
        nomes_vetor = arquivo_nomes.read().split('\n')
    linhas_por_id = {}
    for n_pesquisa in nomes_vetor:
        linhas_por_id.setdefault(n_pesquisa.split('\t')[0], []).append(n_pesquisa)

    lista_motivos = []
    lista_nomes = []
    motivos_checar_repetidos = {}
    for motivo in glob.glob(self.diretorio + "*.pfm"):
        isolar_nome = re.search(r'(\d*_\d*)\.pfm$', motivo)
        if isolar_nome is None:
            # File name does not carry an id; it cannot be matched to a name.
            continue
        for n_pesquisa in linhas_por_id.get(isolar_nome.group(1), ()):
            nome_recolocar = n_pesquisa.split('\t')[2].split('_')[1]
            if nome_recolocar in motivos_checar_repetidos:
                # Already loaded under this name: only count the repeat.
                motivos_checar_repetidos[nome_recolocar] += 1
            else:
                motivos_checar_repetidos[nome_recolocar] = 1
                with open(motivo) as arquivo_pfm:
                    lista_motivos.append(Motif.read(arquivo_pfm, "jaspar-pfm"))
                lista_nomes.append(nome_recolocar)

    sequencia = Seq(sequencia_string.upper())
    for converter in lista_motivos:
        converter.make_instances_from_counts()

    tamanho_seq = len(sequencia_string)
    tss = ((tamanho_seq*self.pontas)/100)
    three_end = tamanho_seq - tss

    motivos_finais = []
    saida_hmm = []
    classificar = []
    for procura, nome in zip(lista_motivos, lista_nomes):
        for pos, score in procura.search_pwm(sequencia,threshold=10):
            # Bio.Motif's search_pwm reports reverse-strand hits with a
            # negated position; classify on the magnitude so those hits are
            # not all assigned to region "1".
            inicio = abs(pos)
            if inicio >= three_end:
                posicao = "3"
            elif inicio <= tss:
                posicao = "1"
            else:
                posicao = "2"
            motivos_finais.append((pos, score, nome, posicao))
            saida_hmm.append((posicao, nome))
            classificar.append(sorted((posicao, nome)))
    return (motivos_finais,tamanho_seq,lista_nomes,saida_hmm,classificar)
def test_meme_parsing(self):
    """Check that Motif.MEMEParser parses self.MEMEin into a record
    holding exactly one motif.
    """
    record = Motif.MEMEParser().parse(self.MEMEin)
    assert len(record.motifs) == 1
def make_seq(seq, comp = False):
    """Wrap seq as an unambiguous-DNA Seq; return its reverse complement
    instead when comp is true."""
    dna = Seq(seq,Alphabet.IUPAC.unambiguous_dna)
    if comp:
        return dna.reverse_complement()
    return dna


def score_seq(mot, seq, comp = False):
    """Return the first element of mot.scanPWM applied to make_seq(seq, comp)."""
    target = make_seq(seq, comp = comp)
    return mot.scanPWM(target)[0]


tmp = u"""A 1 2 0 0 0 2 0 0 1 2
C 1 1 0 0 5 0 1 0 1 0
G 4 4 8 8 2 4 5 6 6 0
T 2 1 0 0 1 2 2 2 0 6"""

sp1_mot = Motif.read(StringIO(tmp), 'jaspar-pfm')

# <codecell>

test_seqs = [('sp3', 'GAGGCGTGGC'),
             ('sp2', 'TGGGCGGGAC'),
             ('sp1', 'GGGGAGTGGC')]

res = []
for name, base_seq in test_seqs:
    bs = list(base_seq)
    mat = np.zeros((6, len(bs)+2))
    for n in range(len(bs)):
        olet = bs[n]
        for ln, let in enumerate('ACTG'):
            bs[n] = let
def test_sites_parsing(self):
    """Check that Motif.from_jaspar_sites loads self.SITESin into a motif
    of length 6.
    """
    parsed = Motif.Motif()
    parsed.from_jaspar_sites(self.SITESin)
    assert parsed.length == 6
def fimoMM(pwmFileName, genomeFile, mpbsDict, scoringMethod, tempLocation,
           pseudocounts=0.1, bitscore=12.0, fpr=0.01, precision=10**4,
           highCutoff=0.7, functionalDepth=0.9, threshold=0.0001):
    """Performs FIMO motif matching algorithm and writes the results to a
    dictionary indexed by chromosome.

    The PWM is converted to MEME format, the external ``fimo`` program is run
    on genomeFile, and its tab-separated text output is parsed. Unless
    scoringMethod is "fimo", hits scoring below the computed score threshold
    are dropped. Kept scores are rescaled to
    int(1000 * (score - threshold) / (best - threshold)); when best equals
    the threshold, hits at the best score get 1000 and any other hit gets 0.

    Keyword arguments:
    pwmFileName -- PWM file name.
    genomeFile -- Fasta file containing the regions to be analyzed
    mpbsDict -- Dictionary of MPBSs to insert the results.
    scoringMethod -- Method to evaluate which MPBSs are enriched
                     ("bitscore", "fpr", "boyle" or "fimo").
    tempLocation -- Location to write temporary PWM files in order to help
                    PWM creation and pseudocounting.
    pseudocounts -- Amount of pseudocounts to add in each PWM matrix's cell. (default 0.1)
    bitscore -- The cutoff bitscore value. (default 12.0)
    fpr -- False positive rate to determine the cutoff value. (default 0.01)
    precision -- Motif score distribution precision. (default 10**4)
    highCutoff -- High cutoff for Boyle's rule. (default 0.7)
    functionalDepth -- Functional depth for Boyle's rule. (default 0.9)
    threshold -- The cutoff threshold value passed to FIMO's --output-pthresh;
                 only honoured for scoringMethod "fimo", every other method
                 overrides it with 0.1. (default 0.0001)

    Returns:
    0 -- Always. Results are appended in place to mpbsDict[chrName] as
         [start, end, name, scaledScore, strand] lists; missing chromosome
         keys are created.

    Exits the process with status 1 when scoringMethod is not recognised.
    The directory holding the temporary MEME file is removed at the end.
    """

    # Converting jaspar to MEME
    memeFileName = jasparToMeme(pwmFileName, tempLocation, pseudocounts)
    tempDir = "/".join(memeFileName.split("/")[:-1])
    tempPath = tempDir + "/"
    fimoFileName = tempPath + "results.txt"
    errorOutputName = tempPath + "error.txt"

    # Evaluating threshold
    pwmThreshold = 0.0
    if scoringMethod == "bitscore":
        pwmThreshold = bitscore
        threshold = 0.1
    elif scoringMethod == "fpr":
        bioPwm = biopythonMM.readPwmFile(pwmFileName, tempLocation, pseudocounts)
        sd = Motif.ScoreDistribution(bioPwm, precision=precision)
        pwmThreshold = sd.threshold_fpr(fpr)
        threshold = 0.1
    elif scoringMethod == "boyle":
        maxScore = 0.0
        minScore = 0.0  # TODO Boyle's rule is not suited for negative values.
        pwmBoyle = bitScoreMM.createPwmDict(pwmFileName, pseudocounts)
        pwmLen = len(pwmBoyle["A"])
        for i in range(0, pwmLen):
            maxScore += max(pwmBoyle["A"][i], pwmBoyle["C"][i],
                            pwmBoyle["G"][i], pwmBoyle["T"][i])
        background = math.log(0.25, 2) * pwmLen
        maxScore -= background
        pwmThreshold = min(highCutoff * maxScore,
                           functionalDepth * (maxScore - minScore))
        threshold = 0.1
    elif scoringMethod == "fimo":
        pass
    else:
        sys.stderr.write("Choose a valid scoring method.\n")
        # A rejected argument is a failure; report it with a non-zero status.
        sys.exit(1)

    # Performing FIMO
    os.system(
        "fimo --text --verbosity 1 --max-stored-scores 1000000 --output-pthresh "
        + str(threshold) + " " + memeFileName + " " + genomeFile + " > " +
        fimoFileName + " 2> " + errorOutputName)

    # Reading FIMO output
    tempMpbsDict = dict()
    maxValue = float("-inf")
    with open(fimoFileName, "r") as fimoFile:
        fimoFile.readline()  # the first line is skipped (header)
        for line in fimoFile:
            line = line.strip()
            if not line:
                continue
            ll = line.split("\t")
            # The first column carries the strand as its first character,
            # followed by the motif name.
            ll = [ll[0][0], ll[0][1:]] + ll[1:]
            strand, name, chrName = ll[0], ll[1], ll[2]
            score = float(ll[5])
            if scoringMethod != "fimo" and score < pwmThreshold:
                continue
            if score > maxValue:
                maxValue = score
            if strand == "+":
                start, end = int(ll[3]) - 1, int(ll[4])
            else:
                # Non-"+" rows take the start from column 4 and the end
                # from column 3.
                start, end = int(ll[4]) - 1, int(ll[3])
            tempMpbsDict.setdefault(chrName, []).append(
                [start, end, name, score, strand])

    # Update scores - new scores are within [0,1000] when a threshold is used
    span = maxValue - pwmThreshold
    for chrName in tempMpbsDict.keys():
        target = mpbsDict.setdefault(chrName, [])
        for start, end, name, score, strand in tempMpbsDict[chrName]:
            if span == 0:
                # Hits equal to the threshold are kept, so the best score can
                # coincide with it; avoid dividing by zero.
                scaled = 1000 if score >= maxValue else 0
            else:
                scaled = int(1000 * (score - pwmThreshold) / span)
            target.append([start, end, name, scaled, strand])

    # Removing temporary PWM folder
    os.system("rm -rf " + tempDir)

    return 0
#!/usr/bin/env python
# counts a motif (arg1) with overlaps in a fasta file (arg2)

import sys
from Bio.Seq import Seq
from Bio import SeqIO
from Bio import Motif
from Bio.Alphabet import IUPAC

if len(sys.argv) < 3:
    sys.stderr.write("usage: %s <motif> <fasta file>\n" % sys.argv[0])
    sys.exit(1)

theMotif = sys.argv[1]
fastafile = sys.argv[2]

# A motif made of the single instance given on the command line.
momo=Motif.Motif(alphabet=IUPAC.unambiguous_dna)
momo.add_instance(Seq(theMotif,momo.alphabet))
momoc=0


def countMotif(myseqrecord, mymotif):
    """Return how many hits mymotif.search_instances yields on the record's sequence."""
    return sum(1 for pos in mymotif.search_instances(myseqrecord.seq))


# The handle is opened only for the duration of the scan and is closed even
# if parsing fails part-way.
with open(fastafile) as handle:
    for seq_record in SeqIO.parse(handle, "fasta"):
        momoc=momoc + countMotif(seq_record,momo)

print("motif %s found %s times in the %s file" % (theMotif, momoc, fastafile))
from Bio import Motif
from scipy.stats import fisher_exact


def search_motif(mf,seq):
    """Return the position of the best-scoring PWM hit of mf in seq.

    Hits come from mf.search_pwm(seq). The position of the highest-scoring
    hit (the first one on ties) is returned when its score is strictly
    greater than 0.8 * mf.max_score(); otherwise, or when there is no hit at
    all, None is returned.
    """
    cutoff = 0.8*mf.max_score()
    hits = list(mf.search_pwm(seq))
    if not hits:
        return None
    best_pos, best_score = max(hits, key=lambda hit: hit[1])
    if best_score > cutoff:
        return best_pos
    return None


# Script: argv[1] is a jaspar-pfm motif file, argv[2] a fasta file. Sequences
# with a hit are written to <argv[2]>.motif.fa with the hit position appended
# to their id.
count = 0
total = 0
with open(sys.argv[1]) as motif_handle:
    mf = Motif.read(motif_handle,'jaspar-pfm')
with open(sys.argv[2]+'.motif.fa','w') as ofp:
    for record in SeqIO.parse(sys.argv[2],'fasta'):
        total += 1
        hit = search_motif(mf,record.seq)
        if hit is None:
            continue
        record.id = record.id+'_'+str(hit)
        ofp.write('>'+record.id+'\n')
        ofp.write(str(record.seq)+'\n')
        count += 1

# Counting explicitly keeps the summary valid for an empty fasta file.
percent = count/float(total)*100 if total else 0.0
print(str(total)+'\ttotal seq(s)')
print(str(count)+'\tcontains motifs ('+str(percent)+'%)')