예제 #1
0
def plotPatternBeyondRepeat(genomeSource1, genomeSource2, startpt1, endpt1,
                            startpt2, endpt2, plotRange):
    """Plot windowed Hamming distances of the text following two offsets.

    Starting at offset ``endpt1`` in ``genomeSource1`` and ``endpt2`` in
    ``genomeSource2``, reads ``plotRange`` consecutive windows of 10
    characters from each file, computes one distance per window pair with
    ``distanceComputeLib.hammingDistance`` and draws the distances on
    subplot 211 against the window's offset from the starting point.

    Args:
        genomeSource1: Path of the first text file.
        genomeSource2: Path of the second text file.
        startpt1: Unused; kept so existing callers keep working.
        endpt1: Offset in ``genomeSource1`` where reading begins.
        startpt2: Unused; kept so existing callers keep working.
        endpt2: Offset in ``genomeSource2`` where reading begins.
        plotRange: Number of windows to compare.

    Returns:
        None.  This function does not call ``plt.show()``.
    """
    windowSize = 10
    distanceList = []

    pointerLocation1 = endpt1
    pointerLocation2 = endpt2

    # The with-blocks close both files even if a seek/read or the distance
    # computation raises.
    with open(genomeSource1, 'r') as f1:
        with open(genomeSource2, 'r') as f2:
            for index in range(plotRange):
                f1.seek(pointerLocation1)
                f2.seek(pointerLocation2)

                str1 = f1.read(windowSize)
                str2 = f2.read(windowSize)

                pointerLocation1 = pointerLocation1 + windowSize
                pointerLocation2 = pointerLocation2 + windowSize

                # Near the end of a file a read may return fewer than
                # windowSize characters, so compare only the common length.
                distance = distanceComputeLib.hammingDistance(
                    str1, str2, min(windowSize, len(str1), len(str2)))
                distanceList.append(distance)

    plt.subplot(211)
    plt.plot(range(0, len(distanceList) * windowSize, windowSize),
             distanceList)

    # NOTE: a mirrored plot on subplot 212 for the region before
    # startpt1/startpt2 used to be sketched here as commented-out code; it is
    # disabled, which is why the start points are currently unused.
def plotPatternBeyondRepeat(genomeSource1,genomeSource2, startpt1, endpt1, startpt2, endpt2,plotRange):
    """Plot windowed Hamming distances of the text following two offsets.

    Starting at offset ``endpt1`` in ``genomeSource1`` and ``endpt2`` in
    ``genomeSource2``, reads ``plotRange`` consecutive windows of 10
    characters from each file, computes one distance per window pair with
    ``distanceComputeLib.hammingDistance`` and draws the distances on
    subplot 211 against the window's offset from the starting point.

    Args:
        genomeSource1: Path of the first text file.
        genomeSource2: Path of the second text file.
        startpt1: Unused; kept so existing callers keep working.
        endpt1: Offset in ``genomeSource1`` where reading begins.
        startpt2: Unused; kept so existing callers keep working.
        endpt2: Offset in ``genomeSource2`` where reading begins.
        plotRange: Number of windows to compare.

    Returns:
        None.  This function does not call ``plt.show()``.
    """
    windowSize = 10
    distanceList = []

    pointerLocation1 = endpt1
    pointerLocation2 = endpt2

    # The with-blocks close both files even if a seek/read or the distance
    # computation raises.
    with open(genomeSource1, 'r') as f1:
        with open(genomeSource2, 'r') as f2:
            for index in range(plotRange):
                f1.seek(pointerLocation1)
                f2.seek(pointerLocation2)

                str1 = f1.read(windowSize)
                str2 = f2.read(windowSize)

                pointerLocation1 = pointerLocation1 + windowSize
                pointerLocation2 = pointerLocation2 + windowSize

                # Near the end of a file a read may return fewer than
                # windowSize characters, so compare only the common length.
                distance = distanceComputeLib.hammingDistance(
                    str1, str2, min(windowSize, len(str1), len(str2)))
                distanceList.append(distance)

    plt.subplot(211)
    plt.plot(range(0, len(distanceList) * windowSize, windowSize),
             distanceList)

    # NOTE: a mirrored plot on subplot 212 for the region before
    # startpt1/startpt2 used to be sketched here as commented-out code; it is
    # disabled, which is why the start points are currently unused.
예제 #3
0
def effectOfInterleaving(genomeSource1, Lrepeat, Liid):

    f1 = open(genomeSource1, 'r')

    G = len(f1.read())
    print G

    oldh = Liid

    totalNumberOfRounds = 100000

    for numberOfRounds in range(totalNumberOfRounds):
        i = random.randint(0, G - Lrepeat - Liid - 1)
        j = random.randint(i + 1, G - Lrepeat - Liid)

        f1.seek(i)
        substring1 = f1.read(Liid)
        f1.seek(j)
        substring2 = f1.read(Liid)

        h1 = distanceComputeLib.hammingDistance(substring1, substring2,
                                                len(substring1))

        f1.seek(i + Lrepeat)
        substring1 = f1.read(Liid)
        f1.seek(j + Lrepeat)
        substring2 = f1.read(Liid)

        h2 = distanceComputeLib.hammingDistance(substring1, substring2,
                                                len(substring1))

        h = max(h1, h2)
        print i, j, h

        if h < oldh:
            oldh = h

    print "Minimum hamming distance over ", totalNumberOfRounds, " is ", oldh, "\n"
    f1.close()
def effectOfInterleaving(genomeSource1,Lrepeat, Liid):
        
    f1 = open(genomeSource1, 'r')

    G = len(f1.read())
    print G
    
    oldh = Liid
    
    totalNumberOfRounds = 100000
    
    for numberOfRounds in range(totalNumberOfRounds):
        i = random.randint(0,G- Lrepeat - Liid -1)
        j = random.randint(i+1, G- Lrepeat - Liid)

        f1.seek(i)
        substring1 = f1.read(Liid)
        f1.seek(j)
        substring2= f1.read(Liid)
        
        h1= distanceComputeLib.hammingDistance(substring1,substring2, len(substring1))
        
        f1.seek(i + Lrepeat)
        substring1 = f1.read(Liid)
        f1.seek(j+ Lrepeat)
        substring2= f1.read(Liid)
        
        h2= distanceComputeLib.hammingDistance(substring1,substring2, len(substring1))            

        h = max(h1,h2)
        print i,j, h
        
        if h < oldh:
            oldh = h
                
    print "Minimum hamming distance over " ,totalNumberOfRounds, " is ", oldh, "\n"
    f1.close()
    
예제 #5
0
def checking(temp,genomeSource1,genomeSource2 ):
    print temp
    f1 = open(genomeSource1,'r')
    f2 = open(genomeSource2,'r')
    
    f1.seek(temp[0]-1)
    f2.seek(temp[1]-1)
    
    str1  = f1.read(temp[2])
    str2 = f2.read(temp[2])
    

    print "Hamming distance", distanceComputeLib.hammingDistance(str1,str2, len(str1))


    f1.seek(temp[0]-2)
    f2.seek(temp[1]-2)
    
    str1  = f1.read(temp[2]+2)
    str2 = f2.read(temp[2]+2)
    print "Hamming distance", distanceComputeLib.hammingDistance(str1,str2, len(str1))

    f1.close()
    f2.close()    
def reportPatternBeyondRepeat(genomeSource1, genomeSource2, startpt1, endpt1,
                              startpt2, endpt2, plotRange, outputResult):
    Gchecker = open(genomeSource1, 'r')
    G = len(Gchecker.read())
    print G
    Gchecker.close()

    f1 = open(genomeSource1, 'r')
    f2 = open(genomeSource2, 'r')

    pointerLocation1 = endpt1
    pointerLocation2 = endpt2

    windowSize = 10
    distanceList = []

    for index in range(plotRange):
        if pointerLocation1 < G and pointerLocation2 < G:
            f1.seek(pointerLocation1)
            f2.seek(pointerLocation2)

            str1 = f1.read(windowSize)
            str2 = f2.read(windowSize)

            pointerLocation1 = pointerLocation1 + windowSize
            pointerLocation2 = pointerLocation2 + windowSize

            distance = distanceComputeLib.hammingDistance(
                str1, str2, windowSize)
            distanceList.append(distance)

    f = open(outputResult, 'w')
    for eachitem in distanceList:
        f.write(str(eachitem) + "\n")
    f.close()

    f1.close()
    f2.close()
def reportPatternBeyondRepeat(genomeSource1,genomeSource2, startpt1, endpt1, startpt2, endpt2,plotRange,outputResult):
    Gchecker = open(genomeSource1,'r')
    G = len(Gchecker.read())
    print G
    Gchecker.close()    
    
    f1 = open(genomeSource1,'r')
    f2 = open(genomeSource2,'r')
    

    pointerLocation1 = endpt1
    pointerLocation2 = endpt2
    
    windowSize = 10
    distanceList = []
    
    for index in range(plotRange):
        if pointerLocation1 < G and pointerLocation2 < G:
            f1.seek(pointerLocation1)
            f2.seek(pointerLocation2)
        
            str1 = f1.read(windowSize)
            str2 = f2.read(windowSize)
            
            pointerLocation1 = pointerLocation1 + windowSize
            pointerLocation2 = pointerLocation2 + windowSize
            
            distance = distanceComputeLib.hammingDistance(str1, str2, windowSize)
            distanceList.append(distance)        
        
    f = open(outputResult,'w')
    for eachitem in distanceList:  
        f.write(str(eachitem) + "\n" )
    f.close()
    
    
    f1.close()
    f2.close()    
def findapproxrepeatLength(filename1, filename2, start1, start2,
                           lengthOfExactRepeat):
    f1 = open(filename1, 'r')
    f2 = open(filename2, 'r')

    totalNumberOfError = 0

    windowSize = 100
    threshold = 25
    ### Decision rule : if > 50 error in the latest window of 100, then stop counting

    numberOfError = 0

    ### Compute the RHS
    f1.seek(start1 + lengthOfExactRepeat - windowSize - 1)
    f2.seek(start2 + lengthOfExactRepeat - windowSize - 1)

    temp1 = f1.read(windowSize)
    temp2 = f2.read(windowSize)

    lastPosition1 = start1 + lengthOfExactRepeat - windowSize - 1
    lastPosition2 = start2 + lengthOfExactRepeat - windowSize - 1

    print "CheckPoint 1 : ", distanceComputeLib.hammingDistance(
        temp1, temp2, len(temp1))
    numberOfError = distanceComputeLib.hammingDistance(temp1, temp2,
                                                       len(temp1))
    totalNumberOfError = totalNumberOfError + numberOfError

    while (numberOfError < threshold):

        f1.seek(lastPosition1)
        char1 = f1.read(1)
        f2.seek(lastPosition2)
        char2 = f2.read(1)

        if char1 != char2:
            numberOfError = numberOfError - 1

        f1.seek(lastPosition1 + windowSize)
        f2.seek(lastPosition2 + windowSize)

        char1 = f1.read(1)
        char2 = f2.read(1)

        if char1 != char2:
            numberOfError = numberOfError + 1
            totalNumberOfError = totalNumberOfError + 1

        lastPosition1 = lastPosition1 + 1
        lastPosition2 = lastPosition2 + 1

    endIndex1 = lastPosition1 + windowSize - int(threshold * 1.3333)
    endIndex2 = lastPosition2 + windowSize - int(threshold * 1.3333)

    numberOfError = 0

    ### Compute the LHS
    f1.seek(start1)
    f2.seek(start2)

    temp1 = f1.read(windowSize)
    temp2 = f2.read(windowSize)

    lastPosition1 = start1 + windowSize - 1
    lastPosition2 = start2 + windowSize - 1

    numberOfError = distanceComputeLib.hammingDistance(temp1, temp2,
                                                       len(temp1))
    print "checkPoint2 :", distanceComputeLib.hammingDistance(
        temp1, temp2, len(temp1))
    totalNumberOfError = totalNumberOfError + numberOfError

    while (numberOfError < threshold):

        f1.seek(lastPosition1)
        char1 = f1.read(1)
        f2.seek(lastPosition2)
        char2 = f2.read(1)

        if char1 != char2:
            numberOfError = numberOfError - 1

        f1.seek(lastPosition1 - windowSize)
        f2.seek(lastPosition2 - windowSize)

        char1 = f1.read(1)
        char2 = f2.read(1)

        if char1 != char2:
            numberOfError = numberOfError + 1
            totalNumberOfError = totalNumberOfError + 1

        lastPosition1 = lastPosition1 - 1
        lastPosition2 = lastPosition2 - 1

    startIndex1 = lastPosition1 - windowSize + int(threshold * 1.3333)
    startIndex2 = lastPosition2 - windowSize + int(threshold * 1.3333)

    lapprox = endIndex1 - startIndex1
    print lapprox, totalNumberOfError, threshold, lengthOfExactRepeat
    mutationRate = (totalNumberOfError - 2 * threshold) / float(lapprox)
    if mutationRate == 0:
        mutationRate = 1 / float(lapprox)

    print "mutationRate", mutationRate, (totalNumberOfError - 2 * threshold)

    if startIndex1 > startIndex2:
        dummy = startIndex1
        startIndex1 = startIndex2
        startIndex2 = dummy

    if lapprox <= lengthOfExactRepeat:
        return start1, start2, lengthOfExactRepeat + 1, 1 / float(
            lengthOfExactRepeat)
    else:
        return startIndex1, startIndex2, lapprox, mutationRate
def findapproxrepeatLength(filename1, filename2, start1, start2, lengthOfExactRepeat):
    f1 = open(filename1, 'r')
    f2 = open(filename2, 'r')
    
    totalNumberOfError = 0
    
    windowSize = 100
    threshold = 25 
    ### Decision rule : if > 50 error in the latest window of 100, then stop counting 
    
    numberOfError= 0     
    
    ### Compute the RHS
    f1.seek(start1+ lengthOfExactRepeat- windowSize-1)
    f2.seek(start2+ lengthOfExactRepeat- windowSize-1)
    
    temp1  = f1.read(windowSize)
    temp2 = f2.read(windowSize)

    
    lastPosition1 = start1+ lengthOfExactRepeat- windowSize-1
    lastPosition2 = start2+ lengthOfExactRepeat- windowSize-1
    
    print "CheckPoint 1 : ", distanceComputeLib.hammingDistance(temp1, temp2, len(temp1))
    numberOfError = distanceComputeLib.hammingDistance(temp1, temp2, len(temp1))
    totalNumberOfError = totalNumberOfError + numberOfError
    
    while (numberOfError < threshold):
        
        f1.seek(lastPosition1)
        char1 = f1.read(1)
        f2.seek(lastPosition2)
        char2 = f2.read(1)

        if char1 != char2 : 
           numberOfError = numberOfError - 1 
        
        f1.seek(lastPosition1  + windowSize)
        f2.seek(lastPosition2 + windowSize)
        
        char1 = f1.read(1)
        char2 = f2.read(1)
        
        if char1 != char2:
            numberOfError = numberOfError + 1
            totalNumberOfError= totalNumberOfError + 1

        
        lastPosition1 = lastPosition1 + 1
        lastPosition2 = lastPosition2 + 1
    
    
    endIndex1 = lastPosition1 + windowSize - int(threshold* 1.3333)
    endIndex2 = lastPosition2 + windowSize - int(threshold* 1.3333)


    numberOfError= 0     
    
    ### Compute the LHS
    f1.seek(start1)
    f2.seek(start2)
    
    temp1  = f1.read(windowSize)
    temp2 = f2.read(windowSize)

    
    lastPosition1 = start1 + windowSize -1
    lastPosition2 = start2  + windowSize -1
    
    numberOfError = distanceComputeLib.hammingDistance(temp1, temp2, len(temp1))
    print "checkPoint2 :",  distanceComputeLib.hammingDistance(temp1, temp2, len(temp1))
    totalNumberOfError = totalNumberOfError + numberOfError
    
    while (numberOfError < threshold):
        
        f1.seek(lastPosition1)
        char1 = f1.read(1)
        f2.seek(lastPosition2)
        char2 = f2.read(1)
        

        if char1 != char2 : 
           numberOfError = numberOfError - 1 
        
        f1.seek(lastPosition1  - windowSize)
        f2.seek(lastPosition2 - windowSize)
        
        char1 = f1.read(1)
        char2 = f2.read(1)
        
        if char1 != char2:
            numberOfError = numberOfError + 1
            totalNumberOfError = totalNumberOfError +1
        
        
        lastPosition1 = lastPosition1 - 1
        lastPosition2 = lastPosition2 - 1
    
    
    startIndex1 = lastPosition1 - windowSize + int(threshold* 1.3333)
    startIndex2 = lastPosition2 - windowSize + int(threshold* 1.3333)
    
    lapprox = endIndex1-startIndex1 
    print lapprox, totalNumberOfError ,threshold,lengthOfExactRepeat
    mutationRate = (totalNumberOfError - 2* threshold )/float(lapprox)
    if mutationRate == 0:
        mutationRate = 1/float(lapprox)
    
    print "mutationRate", mutationRate, (totalNumberOfError - 2* threshold )
    
    
    if startIndex1 > startIndex2 :
        dummy = startIndex1
        startIndex1 = startIndex2 
        startIndex2 = dummy
    
    
    
    
    if lapprox <= lengthOfExactRepeat:
        return start1, start2, lengthOfExactRepeat+1, 1/ float(lengthOfExactRepeat)
    else:
        return  startIndex1, startIndex2, lapprox, mutationRate