def toKmers(kmers, revcompkmers, sequence, name, kmerLength):
    """Register every k-mer of ``sequence`` under ``name``.

    Each window of ``kmerLength`` characters is added to ``kmers`` and the
    value ``revcomp`` returns for it is added to ``revcompkmers``.  Both
    mappings go from k-mer to the list of names that contain it and are
    updated in place.

    Every k-mer is also printed to stdout (presumably leftover debugging
    output -- confirm before removing).

    Returns:
        The tuple ``(kmers, revcompkmers)``, i.e. the same objects passed in.
    """
    from seqdata import revcomp, comp

    for start in range(len(sequence) - kmerLength + 1):
        kmer = sequence[start:start + kmerLength]
        print(kmer)
        # setdefault creates the list the first time a k-mer is seen
        kmers.setdefault(kmer, []).append(name)
        revcompkmers.setdefault(revcomp(kmer), []).append(name)
    return kmers, revcompkmers
def toKmers(kmers, revcompkmers, sequence, name, kmerLength):
    """Add all k-mers of ``sequence`` to the two k-mer indexes.

    ``kmers`` receives each ``kmerLength``-long window of ``sequence``;
    ``revcompkmers`` receives ``revcomp`` of that window.  In both mappings
    the value is the list of names in which the key occurs; ``name`` is
    appended once per window.  The mappings are modified in place.

    Each window is printed to stdout as it is processed.

    Returns:
        ``(kmers, revcompkmers)`` -- the same mapping objects that were given.
    """
    from seqdata import revcomp, comp

    def _register(index, key):
        # append to an existing hit list, or start a new one
        if key in index:
            index[key].append(name)
        else:
            index[key] = [name]

    window_count = len(sequence) - kmerLength + 1
    for offset in range(window_count):
        kmer = sequence[offset:offset + kmerLength]
        print(kmer)
        _register(kmers, kmer)
        _register(revcompkmers, revcomp(kmer))
    return kmers, revcompkmers
def addHandleToCollection(handle, handles, kmers, revcompkmers, activePrimers, kmerLength):
    """Add ``handle`` to the collection and index its k-mers.

    Side effects (all in place):
      * ``handle`` is appended to ``handles``;
      * ``activePrimers`` gains ``(sequence, id)`` followed by
        ``(revcomp(sequence), str(id) + '_rc')``;
      * ``kmers`` / ``revcompkmers`` are updated through ``toKmers``.

    Returns:
        ``(handle, handles, kmers, revcompkmers, activePrimers)``.
    """
    from seqdata import revcomp, comp

    handles.append(handle)

    # the handle itself and its reverse complement both act as primers
    activePrimers.append((handle.sequence, handle.id))
    activePrimers.append((revcomp(handle.sequence), str(handle.id) + '_rc'))

    kmers, revcompkmers = toKmers(
        kmers, revcompkmers, handle.sequence, handle.id, kmerLength)
    return handle, handles, kmers, revcompkmers, activePrimers
def addHandleToCollection(handle, handles, kmers, revcompkmers, activePrimers, kmerLength):
    """Register a new handle in every bookkeeping structure.

    The handle is appended to ``handles``, two entries (forward sequence and
    its ``revcomp``, the latter named ``<id>_rc``) are appended to
    ``activePrimers``, and the handle's k-mers are indexed into ``kmers`` and
    ``revcompkmers`` via ``toKmers``.  All containers are updated in place.

    Returns:
        The tuple ``(handle, handles, kmers, revcompkmers, activePrimers)``.
    """
    from seqdata import revcomp, comp

    handles.append(handle)

    forward_primer = (handle.sequence, handle.id)
    activePrimers.append(forward_primer)
    reverse_primer = (revcomp(handle.sequence), str(handle.id) + '_rc')
    activePrimers.append(reverse_primer)

    indexed = toKmers(kmers, revcompkmers, handle.sequence, handle.id, kmerLength)
    kmers, revcompkmers = indexed
    return handle, handles, kmers, revcompkmers, activePrimers
def sequence_layout(layout='HLA'):
    """Describe the expected insert layout for the chosen handle system.

    Args:
        layout: ``'HLA'`` (default) or ``'WFA'``; selects which module-level
            ``*_H1``, ``*_H2``, ``*_H3`` and ``*_DBS`` constants are used.

    Returns:
        A multi-line string in which every line starts with ``'# '``.  For
        any other ``layout`` an error message is printed and the integer
        ``1`` is returned instead.
    """
    if layout == 'HLA':
        H1, H2, H3, DBS = HLA_H1, HLA_H2, HLA_H3, HLA_DBS
    elif layout == 'WFA':
        H1, H2, H3, DBS = WFA_H1, WFA_H2, WFA_H3, WFA_DBS
    else:
        print('Error: No layout specified.')
        return 1

    import seqdata
    rc = seqdata.revcomp

    lines = [
        '# ',
        '# The expected layout of inserts should be:',
        '# ',
        '# H1-DBS-revcomp(H2)-someDNA-revcomp(H3)',
        '# ',
        '# Using the currently defined sequences this should be:',
        '# ' + '-'.join([H1, DBS, rc(H2), 'someDNA', rc(H3)]),
        '# ',
        '# With illumina handles this will be:',
        '# ' + '-'.join([ILLI5, H1, DBS, rc(H2), 'someDNA', rc(H3), ILLI7]),
        '# or if ligated the other direction might also occur:',
        '# ' + '-'.join([ILLI5, H3, 'someDNA', H2, rc(DBS), rc(H1), ILLI7]),
    ]
    return ''.join(line + '\n' for line in lines)
def main():
    """Print the expected insert layout to stdout.

    Uses the module-level ``H1``, ``H2``, ``H3``, ``DBS``, ``ILLI5`` and
    ``ILLI7`` sequences together with ``seqdata.revcomp``.  Every printed
    line starts with ``'# '``.
    """
    import seqdata

    rc = seqdata.revcomp

    def emit(text=''):
        # every output line carries the '# ' prefix
        print('# ' + text)

    emit()
    emit('The expected layout of inserts should be:')
    emit()
    emit('H1-DBS-revcomp(H2)-someDNA-revcomp(H3)')
    emit()
    emit('Using the currently defined sequences this should be:')
    emit('-'.join([H1, DBS, rc(H2), 'someDNA', rc(H3)]))
    emit()
    emit('With illumina handles this will be:')
    emit('-'.join([ILLI5, H1, DBS, rc(H2), 'someDNA', rc(H3), ILLI7]))
    emit('or if ligated the other direction might also occur:')
    emit('-'.join([ILLI5, H3, 'someDNA', H2, rc(DBS), rc(H1), ILLI7]))
def checkMatchedByPrimer(self, activePrimers):
    """Return the name of the first active primer that could prime on this handle.

    For every ``(sequence, name)`` pair in ``activePrimers`` the last
    ``self.kmerLength`` bases of the primer are compared with every k-mer of
    ``self.sequence`` and with ``revcomp`` of that k-mer.  A primer counts as
    matching when any of these holds:

      * the primer 3' end equals the k-mer;
      * their hamming distance is below ``self.minHD``;
      * the distance is exactly ``self.minHD`` (for integer distances) and the
        last five bases are identical.

    Args:
        activePrimers: iterable of ``(sequence, name)`` pairs.

    Returns:
        The ``name`` of the first matching primer, or ``False`` if none match.
    """
    from seqdata import revcomp
    from misc import hamming_distance

    kmer_length = self.kmerLength
    # "+ 1" so the final window (the handle's own 3' end) is scanned as well;
    # without it the last k-mer was silently skipped, unlike toKmers and
    # check3primEnd which both cover every window.
    window_starts = range(len(self.sequence) - kmer_length + 1)

    # go through all active primers in collection
    for seq, name in activePrimers:
        # the primer 3' end does not depend on the window, compute it once
        primer3prime = seq[-kmer_length:]

        # go through all the kmers in the sequence of the handle
        for i in window_starts:
            prekmer = self.sequence[i:i + kmer_length]

            # check if the primer 3' end matches the kmer or the revcomp kmer
            for kmer in (prekmer, revcomp(prekmer)):
                if primer3prime == kmer:
                    return name
                distance = hamming_distance(primer3prime, kmer)
                if distance < self.minHD:
                    return name
                if distance < self.minHD + 1:
                    # borderline distance: reject only when the last five
                    # bases are identical
                    if hamming_distance(primer3prime[-5:], kmer[-5:]) < 1:
                        return name
    return False
def checkMatchedByPrimer(self, activePrimers):
    """Find an active primer whose 3' end (nearly) matches this handle.

    The last ``self.kmerLength`` bases of each primer are compared against
    every k-mer window of ``self.sequence`` and against ``revcomp`` of each
    window.  The primer is reported when the two are equal, when their
    hamming distance is below ``self.minHD``, or when the distance is below
    ``self.minHD + 1`` and the last five bases do not differ at all.

    Args:
        activePrimers: iterable of ``(sequence, name)`` pairs.

    Returns:
        ``name`` of the first primer that matches, otherwise ``False``.
    """
    from seqdata import revcomp
    from misc import hamming_distance

    k = self.kmerLength
    # Include the last window: range(len - k) stopped one k-mer short, so the
    # 3'-most k-mer of the handle was never compared against the primers.
    n_windows = len(self.sequence) - k + 1

    # go through all active primers in collection
    for seq, name in activePrimers:
        primer3prime = seq[-k:]  # loop-invariant for this primer

        # go through all the kmers in the sequence of the handle
        for i in range(n_windows):
            prekmer = self.sequence[i:i + k]

            # compare against the window itself and its reverse complement
            for kmer in (prekmer, revcomp(prekmer)):
                if primer3prime == kmer:
                    return name

                distance = hamming_distance(primer3prime, kmer)
                if distance < self.minHD:
                    return name

                if distance < self.minHD + 1:
                    # almost too close: decide on the last five bases only
                    last_five = hamming_distance(primer3prime[-5:], kmer[-5:])
                    if last_five < 1:
                        return name
    return False
#H2 = 'GACAGTTCCAAGAGGTCATG' #H1691 #H3 = 'TAGGACCAGCGTCTCAGTAT' #H4328 #################### WFA system ############################################################################## #################### WFA2 system ############################################################################## WFA_H1 = 'CAGTTGATCATCAGCAGGTAATCTGG' #E WFA_DBS = 'BDHVBDHVBDHVBDHVBDHV' WFA_H2 = 'CTGTCTCTTATACACATCTCATGAGAACGTCGTTGACGATGGACAGTTCCAAGAGGTCATG' #H1691'+H5+TES WFA_H3 = 'TAGGACCAGCGTCTCAGTAGAGATGTGTATAAGAGACAG' #H43283'G+TES #################### WFA system ############################################################################## #################### HLA system ############################################################################## import seqdata HLA_H1 = 'ACCGAGTGGTGAGTCATAGT' HLA_DBS = 'BDVHBDVHBDVHBDVHBDVH' HLA_H2 = seqdata.revcomp('CTAGCTTCACGAGTTCATCG') HLA_H3 = 'AGATGGCCGTTATGATAGCG' #################### HLA system ############################################################################## #################### Universal ############################################################################## ILLI5 = 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT' ILLI7 = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG' IND_HANDLE_1 = 'TTAGTCTCCGACGGCAGGCTTCAAT' IND_HANDLE_2 = 'ACGCACCCACCGGGACTCAG' #################### Universal ############################################################################## def main(): import seqdata print '# '
def check3primEnd(self, kmers, revcompkmers, handles, fiveprime=False):
    """Check one end of the handle against itself, the k-mer indexes and other handles.

    Reads ``self.sequence``, ``self.kmerLength``, ``self.minHD`` and
    ``self.id``.  Always (re)sets ``self.output`` to a log string describing
    the checks performed; when the end is found to be too close to something,
    ``self.resonFordeath`` is set to a short reason and the method returns
    early.  ``self.resonFordeath`` is not touched when every check passes.

    Args:
        kmers: mapping k-mer -> list of names (forward k-mers in collection).
        revcompkmers: mapping k-mer -> list of names (reverse complements).
        handles: iterable of already accepted handles; each must provide
            ``sequence`` and ``id``.
        fiveprime: when true, the reverse complement of the first
            ``kmerLength`` bases is checked instead of the last
            ``kmerLength`` bases.

    Returns:
        None in every case; the outcome is reported through ``self.output``
        and ``self.resonFordeath``.
    """
    from seqdata import revcomp
    from misc import hamming_distance

    # create dictionary of the kmers in the handle sequence
    # (reverse complemented for the 3' check, as-is for the 5' check)
    revcompself = {}
    for i in range(len(self.sequence) - self.kmerLength + 1):
        if not fiveprime:
            kmer = revcomp(self.sequence[i:i + self.kmerLength])
        else:
            kmer = self.sequence[i:i + self.kmerLength]
        try:
            revcompself[kmer].append('revcomp self')
        except KeyError:
            revcompself[kmer] = ['revcomp self']

    # check if the active end matches some other kmers in collection
    toCLose = False  # initial value
    if fiveprime:  # set the sequence we are checking and whether we look at the 3' or 5' end of the sequence
        ENDSEQ = revcomp(self.sequence[:self.kmerLength])
        endName = 'first '
    else:
        ENDSEQ = self.sequence[-self.kmerLength:]
        endName = 'last '
    assert len(ENDSEQ) == self.kmerLength, 'Error: the script is trying to check wrong number of end bases\n'
    self.output = '\ngenereated handle#' + str(self.id) + ' ' + 'check ' + endName + str(self.kmerLength) + '=' + ENDSEQ + ':=> '  # give some info for the output

    #
    # check if 3'/5' of the handle match any sequence in kmers or revcomp-kmers
    #

    # Look for perfect matches of the end sequence to kmer dictionaries
    for dictionary, name in [(revcompself, 'self-rc '), (kmers, ''), (revcompkmers, 'rc ')]:
        if ENDSEQ in dictionary:
            self.output += endName + str(self.kmerLength) + ' (' + ENDSEQ + ') perfect ' + name + 'match to ' + ' ' + str(dictionary[ENDSEQ])
            self.resonFordeath = name + 'kmer match'
            return

    # Look for matches with mismatches
    # NOTE(review): revcompself appears twice in this list, so it is scanned twice.
    for dictionary, name in [(revcompself, 'self-rc'), (kmers, ''), (revcompkmers, 'rc '), (revcompself, 'self-rc')]:
        for kmer, hits in dictionary.iteritems():
            assert len(kmer) == self.kmerLength, 'Error: kmer of wrong length: ' + kmer + ' in ' + ', '.join(hits)
            if kmer.count('N'):
                continue  # k-mers containing N are not compared

            # check for distance of full kmer to kmer dictionaries
            distFull = hamming_distance(ENDSEQ, kmer)
            if distFull < self.minHD:
                toCLose = True
                self.output += str(distFull) + ' mm to ' + str(hits) + ' (' + kmer + ') too close,'
                self.resonFordeath = name + 'kmer match'
                break
            if distFull < self.minHD + 1:  # borderline: fewer than 3 mismatches within the last five bases counts as too close
                distLastFive = hamming_distance(ENDSEQ[-5:], kmer[-5:])
                if distLastFive < 3:
                    toCLose = True
                    self.output += str(distLastFive) + ' mm in last5 to ' + str(hits) + ' (' + kmer + ') too close,'
                    self.resonFordeath = name + 'kmer match in last5 '
                    break
            #if distFull < self.minHD+1 and ENDSEQ[-3] == kmer[-3]: # looks for unique three-mers; skipped, there are only 4**3 of them which is too few
            #    toCLose = True;
            #    self.output+= ' lastbase(s) identical, '+name+' '+kmer
            #    self.resonFordeath = name+'lastbase(s) identical '
            #    break
            #else:output+= str(dist)+' mm to '+str(hits)+' ok '
        # NOTE(review): checked once per dictionary so the first dictionary with
        # a hit ends the search -- confirm this nesting against the original layout.
        if toCLose:
            return

    #
    # check if 3' bases in handle matches any other 3' in other handles
    #
    for handle2 in handles:
        dist = hamming_distance(ENDSEQ, handle2.sequence[-self.kmerLength:])
        if dist < self.minHD:
            toCLose = True
            self.output += str(dist) + ' mm to ' + str(handle2.id) + '(' + handle2.sequence[-self.kmerLength:] + ')' + ' too close,'
            self.resonFordeath = '3 prime ends match'
            break
        else:
            # distance is large enough: log it and keep comparing
            self.output += str(dist) + ' mm to ' + str(handle2.id) + ' (' + handle2.sequence[-self.kmerLength:] + ') ' + 'ok |' + ' '
    if toCLose:
        return
def sequence_layout(layout='HLA'):
    """Return a text description of the expected insert layout.

    Args:
        layout: ``'HLA'`` (default), ``'WFA'`` or ``'ChIB'``; selects which
            module-level handle (H1-H3) and barcode (DBS) constants are used.

    Returns:
        A multi-line string whose lines all start with ``'# '`` and show the
        insert layout with and without the Illumina handles.  For an
        unrecognised ``layout`` an error message is printed and the integer
        ``1`` is returned instead (kept for backward compatibility).
    """
    # Must come before the layout dispatch: an ``import`` anywhere in the
    # body makes ``seqdata`` a local name, so the ChIB branch below raised
    # UnboundLocalError while the import sat after the if/elif chain.
    import seqdata
    revcomp = seqdata.revcomp

    if layout == 'HLA':
        H1 = HLA_H1
        H2 = HLA_H2
        H3 = HLA_H3
        DBS = HLA_DBS
    elif layout == 'WFA':
        H1 = WFA_H1
        H2 = WFA_H2
        H3 = WFA_H3
        DBS = WFA_DBS
    elif layout == 'ChIB':
        # Mapped onto the H1/H2/H3/DBS slots so it goes straight into the
        # HLA pipeline.
        # NOTE(review): an earlier comment said ChIB H4 is used as H2, but the
        # code assigns ChIB_H6 -- confirm which one is intended.
        H1 = ChIB_H1
        H2 = ChIB_H6
        H3 = revcomp(ChIB_H6)
        # NOTE(review): it is unclear whether DBS may be None / removed for
        # ChIB; something in the pipeline imports it.
        DBS = ChIB_DBS
    else:
        print('Error: No layout specified.')
        return 1

    forward_insert = '-'.join([H1, DBS, revcomp(H2), 'someDNA', revcomp(H3)])
    reverse_insert = '-'.join([H3, 'someDNA', H2, revcomp(DBS), revcomp(H1)])

    lines = [
        '# ',
        '# The expected layout of inserts should be:',
        '# ',
        '# H1-DBS-revcomp(H2)-someDNA-revcomp(H3)',
        '# ',
        '# Using the currently defined sequences this should be:',
        '# ' + forward_insert,
        '# ',
        '# With illumina handles this will be:',
        '# ' + '-'.join([ILLI5, forward_insert, ILLI7]),
        '# or if ligated the other direction might also occur:',
        '# ' + '-'.join([ILLI5, reverse_insert, ILLI7]),
    ]
    return '\n'.join(lines) + '\n'
ChIB_H3 = 'GATATTGCACGGTTGAACGG' #ChIB_H4_H5_H6 = seqdata.revcomp('ACGGTTCCTCAATGTCTGCCGTAACCTCGGCATTATCGCGGTATTGGACAGGACCT') ChIB_H4 = 'ACGGTTCCTCAATGTCTGCC' ChIB_H5 = 'GTAACCTCGGCATTATCGCG' ChIB_H6 = 'GTATTGGACAGGACC' #Change to real H6, some of H5 included in handle and 3' T. ChIB_DBS = 'NNNNNNNNAATTACCAGGCCAGTCGGTCNNNNNNNNGATATTGCACGGTTGAACGGNNNNNNNN' #ChIB_H6prim = seqdata.revcomp('GGTCCTGTCCAATAC') ChIB_H7 = 'CGGTCTTGGCTTGTCCTT' # REAL SEQ 'CGGTCTTGGCTTGTCCTTGC' shortend by two due to NN bases called for frist two bases in reverse read //PH #################### ChIB system ############################################################################# #################### HLA system ############################################################################## import seqdata HLA_H1 = 'ACCGAGTGGTGAGTCATAGT' HLA_DBS = 'BDVHBDVHBDVHBDVHBDVH' HLA_H2 = seqdata.revcomp('CTAGCTTCACGAGTTCATCG') HLA_H3 = 'AGATGGCCGTTATGATAGCG' #################### HLA system ############################################################################## #################### Universal ############################################################################## ILLI5 = 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT' ILLI7 = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG' IND_HANDLE_1 = 'TTAGTCTCCGACGGCAGGCTTCAAT' IND_HANDLE_2 = 'ACGCACCCACCGGGACTCAG' #################### Universal ############################################################################## def sequence_layout(layout='HLA'): if layout == 'HLA': H1 = HLA_H1
def check3primEnd(self, kmers,revcompkmers,handles,fiveprime=False):
    """Check one end of the handle for unwanted (near-)matches.

    The end sequence (last ``self.kmerLength`` bases, or the reverse
    complement of the first ``self.kmerLength`` bases when ``fiveprime`` is
    true) is compared, in this order, with:

      1. the handle's own k-mers, ``kmers`` and ``revcompkmers`` for a
         perfect match;
      2. the same dictionaries for near matches (hamming distance below
         ``self.minHD``, or below ``self.minHD + 1`` with fewer than three
         mismatches in the last five bases);
      3. the last ``self.kmerLength`` bases of every handle in ``handles``.

    ``self.output`` is always overwritten with a log of the checks.  On the
    first failure ``self.resonFordeath`` is set and the method returns; it is
    left untouched when all checks pass.

    Args:
        kmers: mapping k-mer -> list of names.
        revcompkmers: mapping reverse-complement k-mer -> list of names.
        handles: iterable of objects with ``sequence`` and ``id``.
        fiveprime: check the 5' end instead of the 3' end.

    Returns:
        None in every case.
    """

    from seqdata import revcomp
    from misc import hamming_distance

    # create dictionary of the kmers in the handle sequence
    # (reverse complemented unless the 5' end is being checked)
    revcompself = {}
    for i in range(len(self.sequence)-self.kmerLength+1):
        if not fiveprime: kmer = revcomp(self.sequence[i:i+self.kmerLength])
        else: kmer = self.sequence[i:i+self.kmerLength]
        try: revcompself[kmer].append('revcomp self')
        except KeyError:revcompself[kmer] = ['revcomp self']

    # check if the active end matches some other kmers in collection
    toCLose = False # initial value
    if fiveprime: # set the sequence we are checking and whether we look at the 3' or 5' end of the sequence
        ENDSEQ = revcomp(self.sequence[:self.kmerLength])
        endName = 'first '
    else:
        ENDSEQ = self.sequence[-self.kmerLength:]
        endName = 'last '
    assert len(ENDSEQ) == self.kmerLength, 'Error: the script is trying to check wrong number of end bases\n'
    self.output = '\ngenereated handle#'+str(self.id)+' '+'check '+endName+str(self.kmerLength)+'='+ENDSEQ+':=> ' # give some info for the output

    #
    # check if 3'/5' of the handle match any sequence in kmers or revcomp-kmers
    #

    # Look for perfect matches of the end sequence to kmer dictionaries
    for dictionary, name in [(revcompself,'self-rc '),(kmers,''),(revcompkmers,'rc ')]:
        if ENDSEQ in dictionary:
            self.output+= endName+str(self.kmerLength)+' ('+ENDSEQ+') perfect '+name+'match to '+' '+str(dictionary[ENDSEQ]);
            self.resonFordeath = name+'kmer match'
            return

    # Look for matches with mismatches
    # NOTE(review): revcompself is listed twice here and is therefore scanned twice.
    for dictionary, name in [(revcompself,'self-rc'),(kmers,''),(revcompkmers,'rc '),(revcompself,'self-rc')]:
        for kmer,hits in dictionary.iteritems():
            assert len(kmer) == self.kmerLength, 'Error: kmer of wrong length: '+kmer+' in '+', '.join(hits)
            if kmer.count('N'): continue # k-mers containing N are skipped

            # check for distance of full kmer to kmer dictionaries
            distFull = hamming_distance(ENDSEQ,kmer)
            if distFull < self.minHD:
                toCLose = True;
                self.output+= str(distFull)+' mm to '+str(hits)+' ('+kmer+') too close,'
                self.resonFordeath = name+'kmer match'
                break
            if distFull < self.minHD+1: # borderline: fewer than 3 mismatches within the last five bases counts as too close
                distLastFive = hamming_distance(ENDSEQ[-5:],kmer[-5:])
                if distLastFive < 3:
                    toCLose = True;
                    self.output+= str(distLastFive)+' mm in last5 to '+str(hits)+' ('+kmer+') too close,'
                    self.resonFordeath = name+'kmer match in last5 '
                    break
            #if distFull < self.minHD+1 and ENDSEQ[-3] == kmer[-3]: # looks for unique three-mers; skipped, there are only 4**3 of them which is too few
            #    toCLose = True;
            #    self.output+= ' lastbase(s) identical, '+name+' '+kmer
            #    self.resonFordeath = name+'lastbase(s) identical '
            #    break
            #else:output+= str(dist)+' mm to '+str(hits)+' ok '
        # NOTE(review): evaluated after each dictionary so the first hit stops
        # the search -- confirm this nesting against the original layout.
        if toCLose: return

    #
    # check if 3' bases in handle matches any other 3' in other handles
    #
    for handle2 in handles:
        dist = hamming_distance(ENDSEQ,handle2.sequence[-self.kmerLength:])
        if dist < self.minHD:
            toCLose = True;
            self.output+= str(dist)+' mm to '+str(handle2.id)+'('+handle2.sequence[-self.kmerLength:]+')'+' too close,'
            self.resonFordeath = '3 prime ends match'
            break
        # far enough from this handle: log the distance and continue
        else: self.output+= str(dist)+' mm to '+str(handle2.id)+' ('+handle2.sequence[-self.kmerLength:]+') '+'ok |'+' '
    if toCLose: return