def testIter(genomeLen, numReads, readLen, mutFreq, errors, errorFreq, prop=1):
    '''
    Simulate a mutated genome, sample noisy reads from it and hand them to
    iu.iterativeUpdateError together with an FM index of the unmutated
    reference.

    genomeLen -- number of random bases in the reference (a '$' is appended)
    numReads  -- number of reads to sample
    readLen   -- nominal length of each read
    mutFreq   -- round(mutFreq * genomeLen) random mutations are applied
    errors    -- forwarded unchanged to iu.iterativeUpdateError
    errorFreq -- per-base probability that a read base is redrawn at random
    prop      -- forwarded unchanged as the last argument

    Returns whatever iu.iterativeUpdateError returns.
    '''
    bases = ['A', 'C', 'T', 'G']

    # Reference genome. Reversing keeps the first base drawn right in front
    # of '$', i.e. the same layout as prepending one base per iteration,
    # without the quadratic list copying.
    t = [random.choice(bases) for _ in range(genomeLen)]
    t.reverse()
    t.append('$')

    # Construct the FM index of the reference
    alphabet = ['$', 'A', 'C', 'G', 'T']
    b = 5
    fm = bwt.constructFM(t, b, alphabet)

    # Read start positions in the reference; `starts` follows the same reads
    # through the mutated genome.
    startsOrig = [random.randint(0, genomeLen - readLen - 1)
                  for _ in range(numReads)]
    starts = startsOrig[:]

    # Mutate a copy of the reference to get the new genome
    t2 = t[:]
    for _ in range(int(round(mutFreq * genomeLen))):
        base = random.randint(0, len(t2) - 1)
        mutType = random.randint(0, 2)
        if mutType == 0:
            # substitution
            t2[base] = random.choice(bases)
        elif mutType == 1:
            # insertion: everything at or after `base` moves right
            t2.insert(base, random.choice(bases))
            starts = [s + 1 if s >= base else s for s in starts]
        else:
            # deletion: only positions strictly after `base` move left.
            # A read starting exactly at `base` now starts at the element
            # that slid into that index; decrementing it could push a start
            # of 0 to -1 and make the slice below wrap around.
            del t2[base]
            starts = [s - 1 if s > base else s for s in starts]

    # Generate reads from the new genome
    reads = []
    for start in starts:
        read = t2[start:start + readLen]
        # Introduce substitution errors with probability errorFreq per base.
        # Iterate over the actual read length: deletions near the end of the
        # genome can leave a read shorter than readLen.
        for j in range(len(read)):
            if random.random() < errorFreq:
                read[j] = random.choice(bases)
        reads.append(read)

    # The literal 5 and True are positional arguments whose meaning is
    # defined by iu.iterativeUpdateError.
    return iu.iterativeUpdateError(fm, b, alphabet, reads, startsOrig, errors,
                                   5, True, readLen, genomeLen, prop)
def testIter(genomeLen, numReads, readLen, mutFreq, errors, errorFreq):
    '''
    Build a random reference, mutate a copy of it away from both ends,
    sample noisy reads from the mutated copy and run
    iu.iterativeUpdateError on them.

    Mutation positions are drawn from [2*readLen, len(t2) - 2*readLen]
    (inclusive), so the first and last 2*readLen positions are never hit.
    round(mutFreq * genomeLen) mutations are applied, and every read base is
    redrawn at random with probability errorFreq.

    Returns whatever iu.iterativeUpdateError returns.
    '''
    nucleotides = ['A', 'C', 'T', 'G']

    # Reference genome: bases in reverse draw order, terminated by '$'
    reference = [random.choice(nucleotides) for _ in range(genomeLen)]
    reference.reverse()
    reference.append('$')

    # FM index over the reference
    alphabet = ['$', 'A', 'C', 'G', 'T']
    b = 5
    fm = bwt.constructFM(reference, b, alphabet)

    # Start position of every read in the reference
    startsOrig = []
    for _ in range(numReads):
        startsOrig.append(random.randint(0, genomeLen - readLen - 1))
    shifted = list(startsOrig)

    # Derive the mutated genome, shifting read starts along with indels
    mutated = list(reference)
    mutationCount = int(round(mutFreq * genomeLen))
    for _ in range(mutationCount):
        pos = random.randint(2 * readLen, len(mutated) - 2 * readLen)
        kind = random.randint(0, 2)
        if kind == 0:
            # substitution
            mutated[pos] = random.choice(nucleotides)
            continue
        if kind == 1:
            # insertion
            mutated.insert(pos, random.choice(nucleotides))
            delta = 1
        else:
            # deletion
            del mutated[pos]
            delta = -1
        for idx, begin in enumerate(shifted):
            if begin >= pos:
                shifted[idx] = begin + delta

    # Cut the reads out of the mutated genome and add sequencing errors
    reads = []
    for begin in shifted:
        read = mutated[begin:begin + readLen]
        reads.append(read)
        for offset in range(readLen):
            if random.random() < errorFreq:
                read[offset] = random.choice(nucleotides)

    return iu.iterativeUpdateError(fm, b, alphabet, reads, startsOrig, errors,
                                   5, True, readLen, genomeLen)
def iterEM(genomeLen, numReads, readLen, mutFreq, errors, errorFreq):
    '''
    Simulate a mutated genome with noisy reads and compare five iterative
    EM configurations on identical input.

    genomeLen -- number of random bases in the reference (a '$' is appended)
    numReads  -- number of reads to sample
    readLen   -- nominal length of each read
    mutFreq   -- round(mutFreq * genomeLen) random mutations are applied
    errors    -- forwarded unchanged to every EM call
    errorFreq -- per-base probability that a read base is redrawn at random

    Returns (accuracies, sizes): two 5-tuples holding the first and second
    value returned by each run, in the order
    iterativeEM(..., 1), iterativeEM(..., 0.1), iterativeEMDist(..., 1, 1),
    iterativeEMDist(..., 2, 1), iterativeEMDist(..., 2, 50).
    '''
    bases = ['A', 'C', 'T', 'G']

    # Reference genome. Reversing keeps the first base drawn right in front
    # of '$', matching a build that prepends one base per iteration.
    t = [random.choice(bases) for _ in range(genomeLen)]
    t.reverse()
    t.append('$')

    # Construct the FM index of the reference
    alphabet = ['$', 'A', 'C', 'G', 'T']
    b = 5
    fm = bwt.constructFM(t, b, alphabet)

    startsOrig = [random.randint(0, genomeLen - readLen - 1)
                  for _ in range(numReads)]
    starts = startsOrig[:]

    # Mutate a copy of the reference to get the new genome
    t2 = t[:]
    for _ in range(int(round(mutFreq * genomeLen))):
        base = random.randint(0, len(t2) - 1)
        mutType = random.randint(0, 2)
        if mutType == 0:
            # substitution
            t2[base] = random.choice(bases)
        elif mutType == 1:
            # insertion: everything at or after `base` moves right
            t2.insert(base, random.choice(bases))
            starts = [s + 1 if s >= base else s for s in starts]
        else:
            # deletion: only positions strictly after `base` move left, so a
            # start of 0 can never become -1 and wrap the slice below.
            del t2[base]
            starts = [s - 1 if s > base else s for s in starts]

    # Generate reads from the new genome
    reads = []
    for start in starts:
        read = t2[start:start + readLen]
        # Substitution errors with probability errorFreq per base; bounded by
        # the real read length because deletions can shorten trailing reads.
        for j in range(len(read)):
            if random.random() < errorFreq:
                read[j] = random.choice(bases)
        reads.append(read)

    def runFresh(method, *extra):
        # Each run gets its own deep copy of the index and its own
        # (shallow) copies of the read and start lists.
        return method(copy.deepcopy(fm), b, alphabet, reads[:], startsOrig[:],
                      errors, 5, readLen, genomeLen, *extra)

    accuracyOrig, sizeOrig = runFresh(iterativeEM.iterativeEM, 1)
    accuracyRed, sizeRed = runFresh(iterativeEM.iterativeEM, 0.1)
    accuracyDist1, sizeDist1 = runFresh(iterativeEMDist.iterativeEMDist, 1, 1)
    accuracyDist2, sizeDist2 = runFresh(iterativeEMDist.iterativeEMDist, 2, 1)
    accuracyDistChunk2, sizeDistChunk2 = runFresh(
        iterativeEMDist.iterativeEMDist, 2, 50)

    return ((accuracyOrig, accuracyRed, accuracyDist1, accuracyDist2,
             accuracyDistChunk2),
            (sizeOrig, sizeRed, sizeDist1, sizeDist2, sizeDistChunk2))
def countCorrect(genomeLen, numReads, readLen, mutFreq, errors):
    '''
    Measure how often reads taken from a mutated genome are located at
    their original position by approximate matching against the reference.

    genomeLen -- number of random bases in the reference (a '$' is appended)
    numReads  -- number of reads to sample
    readLen   -- nominal length of each read
    mutFreq   -- round(mutFreq * genomeLen) random mutations are applied
    errors    -- passed to bwt.findApproximate; also the +/- tolerance
                 around the original start that counts as a hit

    Prints the accuracy and returns it as a float in [0, 1].
    Raises ZeroDivisionError when numReads is 0.
    '''
    bases = ['A', 'C', 'T', 'G']

    # Reference genome. Reversing keeps the first base drawn right in front
    # of '$', matching a build that prepends one base per iteration.
    t = [random.choice(bases) for _ in range(genomeLen)]
    t.reverse()
    t.append('$')

    # Construct the FM index of the reference
    alphabet = ['$', 'A', 'C', 'G', 'T']
    b = 5
    fm = bwt.constructFM(t, b, alphabet)

    startsOrig = [random.randint(0, genomeLen - readLen - 1)
                  for _ in range(numReads)]
    starts = startsOrig[:]

    # Mutate a copy of the reference to get the new genome
    t2 = t[:]
    for _ in range(int(round(mutFreq * genomeLen))):
        base = random.randint(0, len(t2) - 1)
        mutType = random.randint(0, 2)
        if mutType == 0:
            # substitution
            t2[base] = random.choice(bases)
        elif mutType == 1:
            # insertion: everything at or after `base` moves right
            t2.insert(base, random.choice(bases))
            starts = [s + 1 if s >= base else s for s in starts]
        else:
            # deletion: only positions strictly after `base` move left, so a
            # start of 0 can never become -1 and wrap the slice below.
            del t2[base]
            starts = [s - 1 if s > base else s for s in starts]

    # Generate reads from the new genome
    reads = [t2[start:start + readLen] for start in starts]

    # Match the reads against the FM index of the original reference
    correct = 0
    incorrect = 0
    for i in range(numReads):
        m = bwt.findApproximate(fm, b, alphabet, ''.join(reads[i]), errors)
        positions = m.keys()
        # A read counts as found when any reported position lies within
        # `errors` of where it started in the reference.
        if any(startsOrig[i] + j in positions
               for j in range(-errors, errors + 1)):
            correct += 1
        else:
            incorrect += 1

    total = correct + incorrect
    accuracy = float(correct) / total
    print(' Accuracy: ' + str(correct) + ' / ' + str(total) + ' = ' + str(accuracy))
    return accuracy
#!/usr/bin/env python3
import bwt
import random

# Script: build an FM index over a random 10000-base text and call
# bwt.insert at increasing positions with timing=True, first with random
# bases and then with a fixed 'A' on a freshly built index.

alpha = ['A', 'C', 'G', 'T']
t = ['$'] + [random.choice(alpha) for _ in range(10000)]
alphabet = alpha + ['$']
b = 5

# Header line; presumably names the columns bwt.insert reports when
# timing=True (confirm against bwt.insert).
print('OldRow, First, Checkpt, NewRow, SA, Checkpt, Reord')

# Pass 1: insert a random base at each position
fm = bwt.constructFM(t, b, alphabet)
for position in (1, 5, 10, 50, 100, 500, 1000, 5000):
    bwt.insert(fm, b, alphabet, position, random.choice(alpha), timing=True)
print()

# Pass 2: rebuild the index and insert a fixed 'A', including position 10000
fm = bwt.constructFM(t, b, alphabet)
for position in (1, 5, 10, 50, 100, 500, 1000, 5000, 10000):
    bwt.insert(fm, b, alphabet, position, 'A', timing=True)
print()
def iterEM(genomeLen, numReads, readLen, mutFreq, errors, errorFreq):
    '''
    Simulate a mutated genome with noisy reads and compare five iterative
    EM configurations on identical input.

    genomeLen -- number of random bases in the reference (a '$' is appended)
    numReads  -- number of reads to sample
    readLen   -- nominal length of each read
    mutFreq   -- round(mutFreq * genomeLen) random mutations are applied
    errors    -- forwarded unchanged to every EM call
    errorFreq -- per-base probability that a read base is redrawn at random

    Returns (accuracies, sizes): two 5-tuples holding the first and second
    value returned by each run, in the order
    iterativeEM(..., 1), iterativeEM(..., 0.1), iterativeEMDist(..., 1, 1),
    iterativeEMDist(..., 2, 1), iterativeEMDist(..., 2, 50).
    '''
    bases = ['A', 'C', 'T', 'G']

    # Reference genome. Reversing keeps the first base drawn right in front
    # of '$', matching a build that prepends one base per iteration.
    t = [random.choice(bases) for _ in range(genomeLen)]
    t.reverse()
    t.append('$')

    # Construct the FM index of the reference
    alphabet = ['$', 'A', 'C', 'G', 'T']
    b = 5
    fm = bwt.constructFM(t, b, alphabet)

    startsOrig = [random.randint(0, genomeLen - readLen - 1)
                  for _ in range(numReads)]
    starts = startsOrig[:]

    # Mutate a copy of the reference to get the new genome
    t2 = t[:]
    for _ in range(int(round(mutFreq * genomeLen))):
        base = random.randint(0, len(t2) - 1)
        mutType = random.randint(0, 2)
        if mutType == 0:
            # substitution
            t2[base] = random.choice(bases)
        elif mutType == 1:
            # insertion: everything at or after `base` moves right
            t2.insert(base, random.choice(bases))
            starts = [s + 1 if s >= base else s for s in starts]
        else:
            # deletion: only positions strictly after `base` move left, so a
            # start of 0 can never become -1 and wrap the slice below.
            del t2[base]
            starts = [s - 1 if s > base else s for s in starts]

    # Generate reads from the new genome
    reads = []
    for start in starts:
        read = t2[start:start + readLen]
        # Substitution errors with probability errorFreq per base; bounded by
        # the real read length because deletions can shorten trailing reads.
        for j in range(len(read)):
            if random.random() < errorFreq:
                read[j] = random.choice(bases)
        reads.append(read)

    def runFresh(method, *extra):
        # Each run gets its own deep copy of the index and its own
        # (shallow) copies of the read and start lists.
        return method(copy.deepcopy(fm), b, alphabet, reads[:], startsOrig[:],
                      errors, 5, readLen, genomeLen, *extra)

    accuracyOrig, sizeOrig = runFresh(iterativeEM.iterativeEM, 1)
    accuracyRed, sizeRed = runFresh(iterativeEM.iterativeEM, 0.1)
    accuracyDist1, sizeDist1 = runFresh(iterativeEMDist.iterativeEMDist, 1, 1)
    accuracyDist2, sizeDist2 = runFresh(iterativeEMDist.iterativeEMDist, 2, 1)
    accuracyDistChunk2, sizeDistChunk2 = runFresh(
        iterativeEMDist.iterativeEMDist, 2, 50)

    return ((accuracyOrig, accuracyRed, accuracyDist1, accuracyDist2,
             accuracyDistChunk2),
            (sizeOrig, sizeRed, sizeDist1, sizeDist2, sizeDistChunk2))
def iterEM(genomeLen, numReads, readLen, mutFreq, errors, errorFreq):
    """
    Simulate a mutated genome with unevenly placed noisy reads and compare
    four iterative EM configurations on identical input.

    genomeLen -- number of random bases in the reference (a "$" is appended)
    numReads  -- number of reads to sample
    readLen   -- nominal length of each read
    mutFreq   -- round(mutFreq * genomeLen) random mutations are applied
    errors    -- forwarded unchanged to every EM call
    errorFreq -- per-base probability that a read base is redrawn at random

    Returns (accuracies, sizes): two 4-tuples holding the first and second
    value returned by each run, in the order
    iterativeEM(..., 1), iterativeEM(..., 0.1),
    iterativeEMDist(..., 2, 50), iterativeEMDist(..., 2, 100).
    """
    bases = ["A", "C", "T", "G"]

    # Reference genome. Reversing keeps the first base drawn right in front
    # of "$", matching a build that prepends one base per iteration.
    t = [random.choice(bases) for _ in range(genomeLen)]
    t.reverse()
    t.append("$")

    # Construct the FM index of the reference
    alphabet = ["$", "A", "C", "G", "T"]
    b = 5
    fm = bwt.constructFM(t, b, alphabet)

    # Initialize starting points for reads.
    # Reads are twice as likely to originate from the first half of the
    # genome: draws beyond the midpoint are folded back by half a genome.
    startsOrig = []
    for _ in range(numReads):
        start = random.randint(0, round(1.5 * (genomeLen - readLen - 1)))
        if start > genomeLen / 2:
            start -= genomeLen / 2
        startsOrig.append(int(start))
    starts = startsOrig[:]

    # Mutate a copy of the reference to get the new genome
    t2 = t[:]
    for _ in range(int(round(mutFreq * genomeLen))):
        base = random.randint(0, len(t2) - 1)
        mutType = random.randint(0, 2)
        if mutType == 0:
            # substitution
            t2[base] = random.choice(bases)
        elif mutType == 1:
            # insertion: everything at or after `base` moves right
            t2.insert(base, random.choice(bases))
            starts = [s + 1 if s >= base else s for s in starts]
        else:
            # deletion: only positions strictly after `base` move left, so a
            # start of 0 can never become -1 and wrap the slice below.
            del t2[base]
            starts = [s - 1 if s > base else s for s in starts]

    # Generate reads from the new genome
    reads = []
    for start in starts:
        read = t2[start : start + readLen]
        # Substitution errors with probability errorFreq per base; bounded by
        # the real read length because deletions can shorten trailing reads.
        for j in range(len(read)):
            if random.random() < errorFreq:
                read[j] = random.choice(bases)
        reads.append(read)

    def runFresh(method, *extra):
        # Each run gets its own deep copy of the index and its own
        # (shallow) copies of the read and start lists.
        return method(
            copy.deepcopy(fm),
            b,
            alphabet,
            reads[:],
            startsOrig[:],
            errors,
            5,
            readLen,
            genomeLen,
            *extra
        )

    accuracyOrig, sizeOrig = runFresh(iterativeEM.iterativeEM, 1)
    accuracyRed, sizeRed = runFresh(iterativeEM.iterativeEM, 0.1)
    accuracyChunk50, sizeChunk50 = runFresh(
        iterativeEMDist.iterativeEMDist, 2, 50
    )
    accuracyChunk100, sizeChunk100 = runFresh(
        iterativeEMDist.iterativeEMDist, 2, 100
    )

    return (
        (accuracyOrig, accuracyRed, accuracyChunk50, accuracyChunk100),
        (sizeOrig, sizeRed, sizeChunk50, sizeChunk100),
    )
length = lengths[i] numRuns = runLens[i] for n in xrange(numRuns): # Generate a long random string of random length t = ['$'] for i in range(length): t = [random.choice(['A', 'C', 'T', 'G'])] + t alphabet = ['$', 'A', 'C', 'G', 'T'] b = 50 # Construct the fm index startBuild = time.time() fm = bwt.constructFM(t, b, alphabet) buildTime += time.time() - startBuild letters = set(t) letters.remove('$') # Substitution of a character subId = random.randint(0,length-1) newChar = random.choice(list(letters)) t2 = t[:subId] + [newChar] + t[subId+1:] startUpdate = time.time() fm_new = bwt.substitute(fm, b, alphabet, subId, newChar) timeUpdate = time.time() - startUpdate updateTimes[0] += timeUpdate
def countCorrect(genomeLen, numReads, readLen, mutFreq, errors):
    '''
    Measure how often reads taken from a mutated genome are located at
    their original position by approximate matching against the reference.

    genomeLen -- number of random bases in the reference (a '$' is appended)
    numReads  -- number of reads to sample
    readLen   -- nominal length of each read
    mutFreq   -- round(mutFreq * genomeLen) random mutations are applied
    errors    -- passed to bwt.findApproximate; also the +/- tolerance
                 around the original start that counts as a hit

    Prints the accuracy and returns it as a float in [0, 1].
    Raises ZeroDivisionError when numReads is 0.
    '''
    bases = ['A', 'C', 'T', 'G']

    # Reference genome. Reversing keeps the first base drawn right in front
    # of '$', matching a build that prepends one base per iteration.
    t = [random.choice(bases) for _ in range(genomeLen)]
    t.reverse()
    t.append('$')

    # Construct the FM index of the reference
    alphabet = ['$', 'A', 'C', 'G', 'T']
    b = 5
    fm = bwt.constructFM(t, b, alphabet)

    startsOrig = [random.randint(0, genomeLen - readLen - 1)
                  for _ in range(numReads)]
    starts = startsOrig[:]

    # Mutate a copy of the reference to get the new genome
    t2 = t[:]
    for _ in range(int(round(mutFreq * genomeLen))):
        base = random.randint(0, len(t2) - 1)
        mutType = random.randint(0, 2)
        if mutType == 0:
            # substitution
            t2[base] = random.choice(bases)
        elif mutType == 1:
            # insertion: everything at or after `base` moves right
            t2.insert(base, random.choice(bases))
            starts = [s + 1 if s >= base else s for s in starts]
        else:
            # deletion: only positions strictly after `base` move left, so a
            # start of 0 can never become -1 and wrap the slice below.
            del t2[base]
            starts = [s - 1 if s > base else s for s in starts]

    # Generate reads from the new genome
    reads = [t2[start:start + readLen] for start in starts]

    # Match the reads against the FM index of the original reference
    correct = 0
    incorrect = 0
    for i in range(numReads):
        m = bwt.findApproximate(fm, b, alphabet, ''.join(reads[i]), errors)
        positions = m.keys()
        # A read counts as found when any reported position lies within
        # `errors` of where it started in the reference.
        if any(startsOrig[i] + j in positions
               for j in range(-errors, errors + 1)):
            correct += 1
        else:
            incorrect += 1

    total = correct + incorrect
    accuracy = float(correct) / total
    print(' Accuracy: ' + str(correct) + ' / ' + str(total) + ' = ' + str(accuracy))
    return accuracy