def testUnique(seqs, Ws, prefixes=('ACA',)):
    """Check we count the unique W-mers correctly by counting them in an
    independent way."""
    import seqan
    import numpy as npy
    from jemima.wmers import countWmersMulti

    # Count unique W-mers below every node of the enhanced suffix array index.
    index = seqan.IndexStringDNA5SetESA(seqs)
    counts = npy.zeros((2 * len(index), len(Ws)), dtype=int)
    numunique = countWmersMulti(
        index.topdownhistory(), Ws, counts, countunique=True)

    # Compare the per-width totals at the root against an independent count.
    unique = _countUnique(seqs, Ws)
    if (npy.array([len(unique[W]) for W in Ws]) != numunique).any():
        raise ValueError('Counts did not match.')

    # Spot-check the per-node counts for a few prefixes.
    for prefix in prefixes:
        it = index.topdownhistory()
        if not it.goDown(prefix):
            raise ValueError('Prefix "%s" does not exist in text' % prefix)
        for Widx, W in enumerate(Ws):
            count1 = sum(1 for wmer in unique[W] if wmer.startswith(prefix))
            count2 = counts[it.value.id, Widx]
            if count1 != count2:
                raise ValueError(
                    'Counts for "%s" did not match: %s != %s' % (
                        prefix, count1, count2))
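# For reference, the "independent way" mentioned in the docstring can be as
# simple as collecting every length-W substring into a Python set. The helper
# below is an illustrative sketch, not the actual _countUnique used above: it
# assumes the sequences behave like plain Python strings and, mirroring
# findfirstparentunknown in countWmersMulti, skips W-mers that contain the
# unknown base 'N'. It returns a dict mapping each width W to the set of
# distinct W-mers, which is the shape testUnique expects.
def _countUniqueSketch(seqs, Ws):
    unique = {}
    for W in Ws:
        wmersofwidthW = set()
        for seq in seqs:
            seq = str(seq)
            for start in range(len(seq) - W + 1):
                wmer = seq[start:start + W]
                if 'N' not in wmer:
                    wmersofwidthW.add(wmer)
        unique[W] = wmersofwidthW
    return unique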
def _countWmers(index, Ws, countunique):
    counts = npy.zeros((2 * len(index), len(Ws)), dtype=npy.uint)
    rootcounts = wmers.countWmersMulti(
        index.topdownhistory(), Ws, counts, countunique=countunique)
    # Count how many W-mers are represented by the children
    # of each node
    childfreqs = npy.zeros((2 * len(index), len(Ws), jem.SIGMA))
    wmers.countChildren(index.topdownhistory(), Ws, counts, childfreqs)
    childfreqs = jem.normalisearray(childfreqs)
    return rootcounts, counts, childfreqs
def countWmersMulti(it, Ws, counts, countunique=True):
    """Count all the :math:`W`-mer occurrences (or unique W-mers) below the
    iterator for multiple widths, Ws.

    Arguments:

    - *it*: The iterator below which to count occurrences.
    - *Ws*: The widths to count for.
    - *counts*: The counts array of shape (2*len(index), len(Ws))
    - *countunique*: If true, count the number of unique W-mers below each
      vertex for each width. Otherwise count the number of occurrences.
    """
    nodecounts = counts[it.value.id]
    maxW = Ws[-1]
    firstunknown = findfirstparentunknown(it, maxW)
    # Do we have to descend any further? Is our representative as long as
    # largest W? Did we find an unknown base?
    if firstunknown == it.repLength and it.repLength < maxW:
        # Yes we should descend so go down and add up counts from child nodes
        if it.goDown():
            while True:
                nodecounts += countWmersMulti(it, Ws, counts, countunique)
                if not it.goRight():
                    break
            it.goUp()
    # Determine which Ws our representative is longer than
    longestWidx = bisect.bisect(Ws, firstunknown)
    # Determine which Ws our parent representative is longer than
    parentWidx = not it.isRoot and bisect.bisect(
        Ws[:longestWidx], it.repLength - it.parentEdgeLength) or 0
    # Set those counts to number of occurrences
    if countunique:
        nodecounts[parentWidx:longestWidx] = 1
    else:
        nodecounts[parentWidx:longestWidx] = it.numOccurrences
    return nodecounts
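# Minimal standalone usage sketch for countWmersMulti. The sequences and
# widths below are made-up illustrative values; the seqan calls and array
# shapes follow _countWmers and testUnique above. Note that Ws should be
# sorted in ascending order, since maxW = Ws[-1] and bisect.bisect(Ws, ...)
# rely on that.
import numpy as npy
import seqan

seqs = seqan.StringDNASet(('AAAAAAAA', 'ACGTACGT', 'TATATATA'))
index = seqan.IndexStringDNASetESA(seqs)
Ws = [4, 8]
counts = npy.zeros((2 * len(index), len(Ws)), dtype=npy.uint)
# Returns one total per width: rootcounts[i] is the number of unique W-mers
# (or of W-mer occurrences, if countunique=False) of width Ws[i].
rootcounts = countWmersMulti(
    index.topdownhistory(), Ws, counts, countunique=True)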
# jem.logo(runx1pwm, 'runx1')
# jem.logo(runx1withpc, 'runx1-pc')

logging.info('Loading sequences')
# seqs = seqan.StringDNASet(('AAAAAAAA', 'ACGTACGT', 'TATATATA'))
numbases, seqs, ids = seqan.readFastaDNA('T00759-small.fa')
logging.info('Loaded %d bases from %d sequences', numbases, len(seqs))
lambda_ = len(seqs) / float(numbases)

logging.info('Building index')
index = seqan.IndexStringDNASetESA(seqs)

logging.info('Counting W-mers')
Ws = [W]
Wmercounts = npy.zeros((2 * len(index), len(Ws)), dtype=npy.uint)
numWmers = wmers.countWmersMulti(index.topdownhistory(), Ws, Wmercounts)[0]
logging.info('Got %d %d-mers', numWmers, W)
childWmerfreqs = npy.zeros((2 * len(index), len(Ws), jem.SIGMA))
wmers.countWmerChildren(index.topdownhistory(), W, Wmercounts, childWmerfreqs)
childWmerfreqs = jem.normalisearray(childWmerfreqs)
sumestimator = jis.makesumestimator(numWmers)

logging.info('Importance sampling using background model to find one seed')
rdm.seed(2)
memocb = jis.importancesample(
    index, W, childWmerfreqs[:, 0], jis.UniformImportanceWeight(),
    numsamples=1, callback=jis.ISCbMemo())
pwm = jem.pwmfromWmer(memocb.Xns[0], numseedsites, 1.)
jem.logo(pwm, 'seed')

numsamples = 3000