示例#1
0
def toKmers(kmers, revcompkmers, sequence, name, kmerLength):
    """Index every k-mer of ``sequence`` under ``name``.

    Each window of ``kmerLength`` consecutive items of ``sequence`` is added
    to ``kmers``; the result of ``seqdata.revcomp`` on that window is added to
    ``revcompkmers``.  Both dictionaries are updated in place.

    Args:
        kmers: dict mapping a k-mer to the list of names it was seen in.
        revcompkmers: dict of the same shape, keyed by ``revcomp(kmer)``.
        sequence: sliceable sequence (a string in the visible callers).
        name: value appended to the hit list of every k-mer found.
        kmerLength: window length.  When it exceeds ``len(sequence)`` the
            loop does not run and nothing is added.

    Returns:
        The same ``(kmers, revcompkmers)`` objects that were passed in.
    """
    from seqdata import revcomp

    for start in range(len(sequence) - kmerLength + 1):
        kmer = sequence[start:start + kmerLength]
        # setdefault creates the hit list on first sight of a key, so the
        # reverse complement only has to be computed once per window.
        kmers.setdefault(kmer, []).append(name)
        revcompkmers.setdefault(revcomp(kmer), []).append(name)
    return kmers, revcompkmers
示例#2
0
def toKmers(kmers, revcompkmers, sequence, name, kmerLength):
    """Index every k-mer of ``sequence`` under ``name``.

    Each window of ``kmerLength`` consecutive items of ``sequence`` is added
    to ``kmers``; the result of ``seqdata.revcomp`` on that window is added to
    ``revcompkmers``.  Both dictionaries are updated in place.

    Args:
        kmers: dict mapping a k-mer to the list of names it was seen in.
        revcompkmers: dict of the same shape, keyed by ``revcomp(kmer)``.
        sequence: sliceable sequence (a string in the visible callers).
        name: value appended to the hit list of every k-mer found.
        kmerLength: window length.  When it exceeds ``len(sequence)`` the
            loop does not run and nothing is added.

    Returns:
        The same ``(kmers, revcompkmers)`` objects that were passed in.
    """
    from seqdata import revcomp

    for start in range(len(sequence) - kmerLength + 1):
        kmer = sequence[start:start + kmerLength]
        # setdefault creates the hit list on first sight of a key, so the
        # reverse complement only has to be computed once per window.
        kmers.setdefault(kmer, []).append(name)
        revcompkmers.setdefault(revcomp(kmer), []).append(name)
    return kmers, revcompkmers
示例#3
0
def addHandleToCollection(handle, handles, kmers, revcompkmers, activePrimers, kmerLength):
    """Register ``handle`` in the handle list, primer list and k-mer indexes.

    ``handle`` is appended to ``handles``; its sequence and the reverse
    complement of its sequence are appended to ``activePrimers`` as
    ``(sequence, id)`` and ``(revcomp(sequence), str(id) + '_rc')``; finally
    its k-mers are indexed through ``toKmers``.  All containers are modified
    in place.

    Returns:
        The tuple ``(handle, handles, kmers, revcompkmers, activePrimers)``.
    """
    from seqdata import revcomp, comp

    handles.append(handle)

    # Forward primer first, then its reverse complement tagged with '_rc'.
    forward_entry = (handle.sequence, handle.id)
    activePrimers.append(forward_entry)
    reverse_entry = (revcomp(handle.sequence), str(handle.id) + '_rc')
    activePrimers.append(reverse_entry)

    kmers, revcompkmers = toKmers(
        kmers, revcompkmers, handle.sequence, handle.id, kmerLength)
    return handle, handles, kmers, revcompkmers, activePrimers
示例#4
0
def addHandleToCollection(handle, handles, kmers, revcompkmers, activePrimers,
                          kmerLength):
    """Register ``handle`` in the handle list, primer list and k-mer indexes.

    ``handle`` is appended to ``handles``; its sequence and the reverse
    complement of its sequence are appended to ``activePrimers`` as
    ``(sequence, id)`` and ``(revcomp(sequence), str(id) + '_rc')``; finally
    its k-mers are indexed through ``toKmers``.  All containers are modified
    in place.

    Returns:
        The tuple ``(handle, handles, kmers, revcompkmers, activePrimers)``.
    """
    from seqdata import revcomp, comp

    handles.append(handle)

    # Forward primer first, then its reverse complement tagged with '_rc'.
    forward_entry = (handle.sequence, handle.id)
    activePrimers.append(forward_entry)
    reverse_entry = (revcomp(handle.sequence), str(handle.id) + '_rc')
    activePrimers.append(reverse_entry)

    kmers, revcompkmers = toKmers(
        kmers, revcompkmers, handle.sequence, handle.id, kmerLength)
    return handle, handles, kmers, revcompkmers, activePrimers
示例#5
0
def sequence_layout(layout='HLA'):
    """Describe the expected insert layout for the chosen handle system.

    Args:
        layout: ``'HLA'`` or ``'WFA'``; selects which module-level
            ``*_H1`` / ``*_H2`` / ``*_H3`` / ``*_DBS`` constants are used.

    Returns:
        A multi-line string in which every line starts with ``'#  '``, or the
        integer ``1`` (after printing an error) for any other ``layout``.
    """
    if layout == 'HLA':
        H1, H2, H3, DBS = HLA_H1, HLA_H2, HLA_H3, HLA_DBS
    elif layout == 'WFA':
        H1, H2, H3, DBS = WFA_H1, WFA_H2, WFA_H3, WFA_DBS
    else:
        print('Error: No layout specified.')
        return 1

    import seqdata

    # Insert as read in the expected orientation and in the flipped one.
    forward = '-'.join(
        [H1, DBS, seqdata.revcomp(H2), 'someDNA', seqdata.revcomp(H3)])
    flipped = '-'.join(
        [H3, 'someDNA', H2, seqdata.revcomp(DBS), seqdata.revcomp(H1)])

    report_lines = [
        '',
        'The expected layout of inserts should be:',
        '',
        'H1-DBS-revcomp(H2)-someDNA-revcomp(H3)',
        '',
        'Using the currently defined sequences this should be:',
        forward,
        '',
        'With illumina handles this will be:',
        ILLI5 + '-' + forward + '-' + ILLI7,
        'or if ligated the other direction might also occur:',
        ILLI5 + '-' + flipped + '-' + ILLI7,
    ]
    return ''.join('#  ' + line + '\n' for line in report_lines)
示例#6
0
def main():
    """Print the expected insert layout to stdout.

    Uses the module-level ``H1``, ``H2``, ``H3``, ``DBS``, ``ILLI5`` and
    ``ILLI7`` sequences together with ``seqdata.revcomp``.  Every printed line
    starts with ``'#  '``.
    """
    import seqdata

    intro = (
        '',
        'The expected layout of inserts should be:',
        '',
        'H1-DBS-revcomp(H2)-someDNA-revcomp(H3)',
        '',
        'Using the currently defined sequences this should be:',
    )
    for text in intro:
        print('#  ' + text)

    print('#  ' + '-'.join(
        [H1, DBS, seqdata.revcomp(H2), 'someDNA', seqdata.revcomp(H3)]))
    print('#  ')
    print('#  With illumina handles this will be:')
    print('#  ' + '-'.join(
        [ILLI5, H1, DBS, seqdata.revcomp(H2), 'someDNA', seqdata.revcomp(H3),
         ILLI7]))
    print('#  or if ligated the other direction might also occur:')
    print('#  ' + '-'.join(
        [ILLI5, H3, 'someDNA', H2, seqdata.revcomp(DBS), seqdata.revcomp(H1),
         ILLI7]))
示例#7
0
 def checkMatchedByPrimer(self, activePrimers):
     """Return the name of the first active primer that matches this handle.

     The last ``self.kmerLength`` bases of each primer are compared with
     every k-mer of ``self.sequence`` and with ``revcomp`` of that k-mer.
     A primer matches when its end
       * equals the k-mer, or
       * has ``hamming_distance`` below ``self.minHD``, or
       * has ``hamming_distance`` below ``self.minHD + 1`` and its last five
         bases have distance below 1 to the k-mer's last five.

     Args:
         activePrimers: iterable of ``(sequence, name)`` pairs.

     Returns:
         The ``name`` of the first matching primer, or ``False`` if none match.
     """
     from seqdata import revcomp
     from misc import hamming_distance

     k = self.kmerLength

     # Every window of the handle followed by its reverse complement, in the
     # order they are checked.  The "+ 1" includes the final window, which the
     # previous bound (len - k) silently skipped; the other k-mer loops in this
     # code base already use len - k + 1.
     candidates = []
     for start in range(len(self.sequence) - k + 1):
         window = self.sequence[start:start + k]
         candidates.append(window)
         candidates.append(revcomp(window))

     # go through all active primers in collection
     for seq, name in activePrimers:
         primer3prime = seq[-k:]
         for kmer in candidates:
             if primer3prime == kmer:
                 return name
             distance = hamming_distance(primer3prime, kmer)
             if distance < self.minHD:
                 return name
             # Borderline case: accept as a match only when the last five
             # bases have distance below 1.
             if (distance < self.minHD + 1 and
                     hamming_distance(primer3prime[-5:], kmer[-5:]) < 1):
                 return name
     return False
示例#8
0
    def checkMatchedByPrimer(self, activePrimers):
        """Return the name of the first active primer that matches this handle.

        The last ``self.kmerLength`` bases of each primer are compared with
        every k-mer of ``self.sequence`` and with ``revcomp`` of that k-mer.
        A primer matches when its end
          * equals the k-mer, or
          * has ``hamming_distance`` below ``self.minHD``, or
          * has ``hamming_distance`` below ``self.minHD + 1`` and its last
            five bases have distance below 1 to the k-mer's last five.

        Args:
            activePrimers: iterable of ``(sequence, name)`` pairs.

        Returns:
            The ``name`` of the first matching primer, or ``False`` if none
            match.
        """
        from seqdata import revcomp
        from misc import hamming_distance

        k = self.kmerLength

        # Every window of the handle followed by its reverse complement, in
        # the order they are checked.  The "+ 1" includes the final window,
        # which the previous bound (len - k) silently skipped; the other k-mer
        # loops in this code base already use len - k + 1.
        candidates = []
        for start in range(len(self.sequence) - k + 1):
            window = self.sequence[start:start + k]
            candidates.append(window)
            candidates.append(revcomp(window))

        # go through all active primers in collection
        for seq, name in activePrimers:
            primer3prime = seq[-k:]
            for kmer in candidates:
                if primer3prime == kmer:
                    return name
                distance = hamming_distance(primer3prime, kmer)
                if distance < self.minHD:
                    return name
                # Borderline case: accept as a match only when the last five
                # bases have distance below 1.
                if (distance < self.minHD + 1 and
                        hamming_distance(primer3prime[-5:], kmer[-5:]) < 1):
                    return name
        return False
示例#9
0
#H2 = 'GACAGTTCCAAGAGGTCATG' #H1691
#H3 = 'TAGGACCAGCGTCTCAGTAT' #H4328
#################### WFA system ##############################################################################

#################### WFA2 system ##############################################################################
# Handle (H1-H3) and barcode (DBS) sequences for the WFA layout.
WFA_H1  = 'CAGTTGATCATCAGCAGGTAATCTGG' #E
# 20-letter barcode template made of B/D/H/V -- presumably IUPAC ambiguity
# codes (each excluding one base); confirm.  Note the letter order differs
# from HLA_DBS below (BDHV here, BDVH there).
WFA_DBS = 'BDHVBDHVBDHVBDHVBDHV'
WFA_H2 = 'CTGTCTCTTATACACATCTCATGAGAACGTCGTTGACGATGGACAGTTCCAAGAGGTCATG' #H1691'+H5+TES
WFA_H3 = 'TAGGACCAGCGTCTCAGTAGAGATGTGTATAAGAGACAG' #H43283'G+TES
#################### WFA2 system ##############################################################################

#################### HLA system ##############################################################################
# Handle (H1-H3) and barcode (DBS) sequences for the HLA layout.
# NOTE: seqdata is imported here, mid-file, because HLA_H2 is computed with it.
import seqdata
HLA_H1  = 'ACCGAGTGGTGAGTCATAGT'
HLA_DBS = 'BDVHBDVHBDVHBDVHBDVH'
# Stored as the reverse complement of the literal below.
HLA_H2 = seqdata.revcomp('CTAGCTTCACGAGTTCATCG')
HLA_H3 = 'AGATGGCCGTTATGATAGCG'
#################### HLA system ##############################################################################

#################### Universal ##############################################################################
# Sequences shared by all layouts.  ILLI5/ILLI7 are presumably the Illumina
# adapter sequences; ILLI7 contains a run of six N (presumably the index
# position -- confirm).
ILLI5 = 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT'
ILLI7 = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG'
IND_HANDLE_1 = 'TTAGTCTCCGACGGCAGGCTTCAAT'
IND_HANDLE_2 = 'ACGCACCCACCGGGACTCAG'
#################### Universal ##############################################################################

def main():
    
    import seqdata
    
    print '#  '
示例#10
0
    def check3primEnd(self, kmers, revcompkmers, handles, fiveprime=False):
        """Check one end of this handle against known k-mers and other handles.

        By default the last ``self.kmerLength`` bases of ``self.sequence`` are
        examined; with ``fiveprime=True`` the reverse complement of the first
        ``self.kmerLength`` bases is examined instead.  That end is compared
        with, in order, the handle's own k-mers, ``kmers`` and
        ``revcompkmers`` -- first for exact hits, then for near hits -- and
        finally with the last ``self.kmerLength`` bases of every handle in
        ``handles``.

        Nothing is returned.  ``self.output`` is always overwritten with a log
        line; ``self.resonFordeath`` is set only when a conflict is found, and
        the method stops at the first conflict.

        Args:
            kmers: dict mapping a k-mer string to its list of hit names.
            revcompkmers: dict of the same shape (presumably keyed by
                reverse-complemented k-mers -- confirm against the caller).
            handles: iterable of objects exposing ``sequence`` and ``id``.
            fiveprime: examine the 5' end instead of the 3' end.

        Raises:
            AssertionError: if the examined end, or any key of the searched
                dictionaries, is not exactly ``self.kmerLength`` long.
        """
        from seqdata import revcomp
        from misc import hamming_distance

        k = self.kmerLength

        # k-mers of the handle itself: reverse-complemented when checking the
        # 3' end, taken as-is when checking the 5' end (whose end sequence is
        # reverse-complemented below instead).
        own_kmers = {}
        for start in range(len(self.sequence) - k + 1):
            window = self.sequence[start:start + k]
            own = window if fiveprime else revcomp(window)
            own_kmers.setdefault(own, []).append('revcomp self')

        if fiveprime:
            end_seq = revcomp(self.sequence[:k])
            end_name = 'first '
        else:
            end_seq = self.sequence[-k:]
            end_name = 'last '
        assert len(end_seq) == k, \
            'Error: the script is trying to check wrong number of end bases\n'
        self.output = ('\ngenereated handle#' + str(self.id) + ' check ' +
                       end_name + str(k) + '=' + end_seq + ':=> ')

        # Searched in this order by both passes below.  The labels carry a
        # trailing space so the reason reads e.g. 'self-rc kmer match'; the
        # mismatch pass used to pass 'self-rc' without it, and also listed the
        # handle's own k-mers a second time, which could never add a hit.
        collections = [(own_kmers, 'self-rc '), (kmers, ''),
                       (revcompkmers, 'rc ')]

        # Pass 1: perfect matches of the end sequence.
        for dictionary, label in collections:
            if end_seq in dictionary:
                self.output += (end_name + str(k) + ' (' + end_seq +
                                ') perfect ' + label + 'match to  ' +
                                str(dictionary[end_seq]))
                self.resonFordeath = label + 'kmer match'
                return

        # Pass 2: matches with mismatches.
        for dictionary, label in collections:
            for kmer, hits in dictionary.items():
                assert len(kmer) == k, \
                    'Error: kmer of wrong length: ' + kmer + ' in ' + ', '.join(hits)
                if 'N' in kmer:
                    continue

                # Distance over the full k-mer.
                dist_full = hamming_distance(end_seq, kmer)
                if dist_full < self.minHD:
                    self.output += (str(dist_full) + ' mm to ' + str(hits) +
                                    ' (' + kmer + ') too close,')
                    self.resonFordeath = label + 'kmer match'
                    return

                # Borderline full distance: additionally require at least
                # three differing bases among the last five.
                if dist_full < self.minHD + 1:
                    dist_last5 = hamming_distance(end_seq[-5:], kmer[-5:])
                    if dist_last5 < 3:
                        self.output += (str(dist_last5) + ' mm in last5 to ' +
                                        str(hits) + ' (' + kmer +
                                        ') too close,')
                        self.resonFordeath = label + 'kmer match  in last5 '
                        return

        # Pass 3: compare with the last k bases of every other handle.
        for other in handles:
            other_end = other.sequence[-k:]
            dist = hamming_distance(end_seq, other_end)
            if dist < self.minHD:
                self.output += (str(dist) + ' mm to ' + str(other.id) + '(' +
                                other_end + ') too close,')
                self.resonFordeath = '3 prime ends match'
                return
            self.output += (str(dist) + ' mm to ' + str(other.id) + ' (' +
                            other_end + ') ok | ')
示例#11
0
def sequence_layout(layout='HLA'):
    """Describe the expected insert layout for the chosen handle system.

    Args:
        layout: ``'HLA'``, ``'WFA'`` or ``'ChIB'``; selects which module-level
            handle/barcode constants are used.

    Returns:
        A multi-line string in which every line starts with ``'#  '``, or the
        integer ``1`` (after printing an error) for any other ``layout``.
    """
    if layout not in ('HLA', 'WFA', 'ChIB'):
        print('Error: No layout specified.')
        return 1

    # Must come before the branches: a function-level import makes `seqdata`
    # a local name, so the 'ChIB' branch, which used seqdata.revcomp ahead of
    # the import, raised UnboundLocalError.
    import seqdata
    revcomp = seqdata.revcomp

    if layout == 'HLA':
        H1, H2, H3, DBS = HLA_H1, HLA_H2, HLA_H3, HLA_DBS
    elif layout == 'WFA':
        H1, H2, H3, DBS = WFA_H1, WFA_H2, WFA_H3, WFA_DBS
    else:
        # ChIB sequences mapped onto the H1/H2/H3/DBS slots that the HLA
        # pipeline expects.
        # NOTE(review): the earlier comment said "ChIB H4 is imported as H2",
        # but the code has always assigned ChIB_H6 -- confirm which is meant.
        # TODO: check whether DBS can be None / removed for this layout.
        H1 = ChIB_H1
        H2 = ChIB_H6
        H3 = revcomp(ChIB_H6)
        DBS = ChIB_DBS

    # Insert as read in the expected orientation and in the flipped one.
    forward = '-'.join([H1, DBS, revcomp(H2), 'someDNA', revcomp(H3)])
    flipped = '-'.join([H3, 'someDNA', H2, revcomp(DBS), revcomp(H1)])

    report_lines = [
        '',
        'The expected layout of inserts should be:',
        '',
        'H1-DBS-revcomp(H2)-someDNA-revcomp(H3)',
        '',
        'Using the currently defined sequences this should be:',
        forward,
        '',
        'With illumina handles this will be:',
        ILLI5 + '-' + forward + '-' + ILLI7,
        'or if ligated the other direction might also occur:',
        ILLI5 + '-' + flipped + '-' + ILLI7,
    ]
    return ''.join('#  ' + line + '\n' for line in report_lines)
示例#12
0
ChIB_H3 = 'GATATTGCACGGTTGAACGG'
#ChIB_H4_H5_H6 = seqdata.revcomp('ACGGTTCCTCAATGTCTGCCGTAACCTCGGCATTATCGCGGTATTGGACAGGACCT')
ChIB_H4 = 'ACGGTTCCTCAATGTCTGCC'
ChIB_H5 = 'GTAACCTCGGCATTATCGCG'
ChIB_H6 = 'GTATTGGACAGGACC'  # TODO: change to the real H6; some of H5 is included in the handle, and the 3' T.
# Three runs of eight N separated by two fixed 20-base segments; the second
# fixed segment is identical to ChIB_H3.
ChIB_DBS = 'NNNNNNNNAATTACCAGGCCAGTCGGTCNNNNNNNNGATATTGCACGGTTGAACGGNNNNNNNN'
#ChIB_H6prim = seqdata.revcomp('GGTCCTGTCCAATAC')
ChIB_H7 = 'CGGTCTTGGCTTGTCCTT'  # Real sequence is 'CGGTCTTGGCTTGTCCTTGC'; shortened by two because the first two bases of the reverse read are called as N. //PH
#################### ChIB system #############################################################################

#################### HLA system ##############################################################################
# Handle (H1-H3) and barcode (DBS) sequences for the HLA layout.
# NOTE: seqdata is imported here, mid-file, because HLA_H2 is computed with it.
import seqdata

HLA_H1 = 'ACCGAGTGGTGAGTCATAGT'
# 20-letter barcode template made of B/D/V/H -- presumably IUPAC ambiguity
# codes (each excluding one base); confirm.
HLA_DBS = 'BDVHBDVHBDVHBDVHBDVH'
# Stored as the reverse complement of the literal below.
HLA_H2 = seqdata.revcomp('CTAGCTTCACGAGTTCATCG')
HLA_H3 = 'AGATGGCCGTTATGATAGCG'
#################### HLA system ##############################################################################

#################### Universal ##############################################################################
# Sequences shared by all layouts.  ILLI5/ILLI7 are presumably the Illumina
# adapter sequences; ILLI7 contains a run of six N (presumably the index
# position -- confirm).
ILLI5 = 'AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT'
ILLI7 = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG'
IND_HANDLE_1 = 'TTAGTCTCCGACGGCAGGCTTCAAT'
IND_HANDLE_2 = 'ACGCACCCACCGGGACTCAG'
#################### Universal ##############################################################################


def sequence_layout(layout='HLA'):

    if layout == 'HLA':
        H1 = HLA_H1
示例#13
0
    def check3primEnd(self, kmers, revcompkmers, handles, fiveprime=False):
        """Check one end of this handle against known k-mers and other handles.

        By default the last ``self.kmerLength`` bases of ``self.sequence`` are
        examined; with ``fiveprime=True`` the reverse complement of the first
        ``self.kmerLength`` bases is examined instead.  That end is compared
        with, in order, the handle's own k-mers, ``kmers`` and
        ``revcompkmers`` -- first for exact hits, then for near hits -- and
        finally with the last ``self.kmerLength`` bases of every handle in
        ``handles``.

        Nothing is returned.  ``self.output`` is always overwritten with a log
        line; ``self.resonFordeath`` is set only when a conflict is found, and
        the method stops at the first conflict.

        Args:
            kmers: dict mapping a k-mer string to its list of hit names.
            revcompkmers: dict of the same shape (presumably keyed by
                reverse-complemented k-mers -- confirm against the caller).
            handles: iterable of objects exposing ``sequence`` and ``id``.
            fiveprime: examine the 5' end instead of the 3' end.

        Raises:
            AssertionError: if the examined end, or any key of the searched
                dictionaries, is not exactly ``self.kmerLength`` long.
        """
        from seqdata import revcomp
        from misc import hamming_distance

        k = self.kmerLength

        # k-mers of the handle itself: reverse-complemented when checking the
        # 3' end, taken as-is when checking the 5' end (whose end sequence is
        # reverse-complemented below instead).
        own_kmers = {}
        for start in range(len(self.sequence) - k + 1):
            window = self.sequence[start:start + k]
            own = window if fiveprime else revcomp(window)
            own_kmers.setdefault(own, []).append('revcomp self')

        if fiveprime:
            end_seq = revcomp(self.sequence[:k])
            end_name = 'first '
        else:
            end_seq = self.sequence[-k:]
            end_name = 'last '
        assert len(end_seq) == k, \
            'Error: the script is trying to check wrong number of end bases\n'
        self.output = ('\ngenereated handle#' + str(self.id) + ' check ' +
                       end_name + str(k) + '=' + end_seq + ':=> ')

        # Searched in this order by both passes below.  The labels carry a
        # trailing space so the reason reads e.g. 'self-rc kmer match'; the
        # mismatch pass used to pass 'self-rc' without it, and also listed the
        # handle's own k-mers a second time, which could never add a hit.
        collections = [(own_kmers, 'self-rc '), (kmers, ''),
                       (revcompkmers, 'rc ')]

        # Pass 1: perfect matches of the end sequence.
        for dictionary, label in collections:
            if end_seq in dictionary:
                self.output += (end_name + str(k) + ' (' + end_seq +
                                ') perfect ' + label + 'match to  ' +
                                str(dictionary[end_seq]))
                self.resonFordeath = label + 'kmer match'
                return

        # Pass 2: matches with mismatches.
        for dictionary, label in collections:
            for kmer, hits in dictionary.items():
                assert len(kmer) == k, \
                    'Error: kmer of wrong length: ' + kmer + ' in ' + ', '.join(hits)
                if 'N' in kmer:
                    continue

                # Distance over the full k-mer.
                dist_full = hamming_distance(end_seq, kmer)
                if dist_full < self.minHD:
                    self.output += (str(dist_full) + ' mm to ' + str(hits) +
                                    ' (' + kmer + ') too close,')
                    self.resonFordeath = label + 'kmer match'
                    return

                # Borderline full distance: additionally require at least
                # three differing bases among the last five.
                if dist_full < self.minHD + 1:
                    dist_last5 = hamming_distance(end_seq[-5:], kmer[-5:])
                    if dist_last5 < 3:
                        self.output += (str(dist_last5) + ' mm in last5 to ' +
                                        str(hits) + ' (' + kmer +
                                        ') too close,')
                        self.resonFordeath = label + 'kmer match  in last5 '
                        return

        # Pass 3: compare with the last k bases of every other handle.
        for other in handles:
            other_end = other.sequence[-k:]
            dist = hamming_distance(end_seq, other_end)
            if dist < self.minHD:
                self.output += (str(dist) + ' mm to ' + str(other.id) + '(' +
                                other_end + ') too close,')
                self.resonFordeath = '3 prime ends match'
                return
            self.output += (str(dist) + ' mm to ' + str(other.id) + ' (' +
                            other_end + ') ok | ')