Пример #1
0
def removeRedundantRefvsQuery(folderName, mummerLink, fileR, fileQ,
                              outputFileName):

    thres = 10

    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName,
                                         [["redundantRvsQ", fileR, fileQ, ""]],
                                         houseKeeper.globalParallel)

    dataList = alignerRobot.extractMumData(folderName, "redundantRvsQOut")
    lenDicR = IORobot.obtainLength(folderName, fileR)
    lenDicQ = IORobot.obtainLength(folderName, fileQ)

    isRedundantList = []

    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[
            7], eachitem[8]
        l1, l2 = lenDicR[name1], lenDicQ[name2]

        if abs(l2 - match2) < thres:
            isRedundantList.append(name2)

    #print lenDicQ

    nonRedundantList = obtainComplement(lenDicQ, isRedundantList)

    print nonRedundantList
    IORobot.putListToFileO(folderName, fileQ, outputFileName, nonRedundantList)

    os.system("cp " + folderName + "SC_n_tmp.fasta " + folderName +
              "SC_n.fasta")
def removeRedundantRefvsQuery(folderName, mummerLink, fileR, fileQ, outputFileName):

    thres = 10

    if True:
        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName, [["redundantRvsQ", fileR, fileQ, ""]], houseKeeper.globalParallel
        )

    dataList = alignerRobot.extractMumData(folderName, "redundantRvsQOut")
    lenDicR = IORobot.obtainLength(folderName, fileR)
    lenDicQ = IORobot.obtainLength(folderName, fileQ)

    isRedundantList = []

    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]
        l1, l2 = lenDicR[name1], lenDicQ[name2]

        if abs(l2 - match2) < thres:
            isRedundantList.append(name2)

    # print lenDicQ

    nonRedundantList = obtainComplement(lenDicQ, isRedundantList)

    print nonRedundantList
    IORobot.putListToFileO(folderName, fileQ, outputFileName, nonRedundantList)

    os.system("cp " + folderName + "SC_n_tmp.fasta " + folderName + "SC_n.fasta")
Пример #3
0
def removeRedundantWithFile(folderName, mummerLink, inputFilename,
                            mummerTmpName, outputFileName):
    thres = 10
    os.system("sed -e 's/|//g' " + folderName + inputFilename + ".fasta  > " +
              folderName + inputFilename + "2.fasta")

    os.system("cp " + folderName + inputFilename + "2.fasta " + folderName +
              inputFilename + ".fasta")

    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [[
            mummerTmpName, inputFilename + ".fasta", inputFilename + ".fasta",
            ""
        ]], houseKeeper.globalParallel)
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName

    dataList = alignerRobot.extractMumData(folderName, mummerTmpName + "Out")

    dataList = alignerRobot.transformCoor(dataList)

    lenDic = IORobot.obtainLength(folderName, inputFilename + '.fasta')

    removeList = []

    shortEmbedClusterDic = {}

    for eachitem in lenDic:
        shortEmbedClusterDic[eachitem] = clusterElem(eachitem)

    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[
            7], eachitem[8]

        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]

            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem
                union(shortEmbedClusterDic[name1], shortEmbedClusterDic[name2])

    nameList = obtainComplement(lenDic, removeList)

    returnList = []

    for eachitem in nameList:
        if find(shortEmbedClusterDic[eachitem]).id == eachitem:
            returnList.append(eachitem)

    print "len(nameList), len(returnList)", len(nameList), len(returnList)

    IORobot.putListToFileO(folderName, inputFilename + ".fasta",
                           outputFileName, returnList)
def removeRedundantWithFile(folderName, mummerLink, inputFilename, mummerTmpName, outputFileName):
    thres = 10
    os.system("sed -e 's/|//g' " + folderName + inputFilename + ".fasta  > " + folderName + inputFilename + "2.fasta")

    os.system("cp " + folderName + inputFilename + "2.fasta " + folderName + inputFilename + ".fasta")

    if True:
        alignerRobot.useMummerAlignBatch(
            mummerLink,
            folderName,
            [[mummerTmpName, inputFilename + ".fasta", inputFilename + ".fasta", ""]],
            houseKeeper.globalParallel,
        )
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName

    dataList = alignerRobot.extractMumData(folderName, mummerTmpName + "Out")

    dataList = alignerRobot.transformCoor(dataList)

    lenDic = IORobot.obtainLength(folderName, inputFilename + ".fasta")

    removeList = []

    shortEmbedClusterDic = {}

    for eachitem in lenDic:
        shortEmbedClusterDic[eachitem] = clusterElem(eachitem)

    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]

        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]

            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem
                union(shortEmbedClusterDic[name1], shortEmbedClusterDic[name2])

    nameList = obtainComplement(lenDic, removeList)

    returnList = []

    for eachitem in nameList:
        if find(shortEmbedClusterDic[eachitem]).id == eachitem:
            returnList.append(eachitem)

    print "len(nameList), len(returnList)", len(nameList), len(returnList)

    IORobot.putListToFileO(folderName, inputFilename + ".fasta", outputFileName, returnList)
Пример #5
0
def alignWithName(leftSeg, rightSeg, folderName, mummerLink, nameOfOut):
    """Align the end of leftSeg with the start of rightSeg and pick one hit.

    At most the last 50000 characters of leftSeg and the first 50000
    characters of rightSeg are written to
    folderName + nameOfOut + "leftSeg.fasta" / "rightSeg.fasta" (headers
    ">SegL" and ">SegR").  The two files are aligned with
    alignerRobot.useMummerAlign under the job name nameOfOut and the records
    are read back from nameOfOut + "Out".

    Among the records with field 1 greater than (written left length - 10)
    and field 2 less than 10, the one with the largest field 5 is selected.
    Presumably field 1 is the hit's end on the left segment and field 2 its
    start on the right segment -- confirm against extractMumData.

    Returns:
        A two-element list [field 4, field 5] of the selected record, or
        [0, 0] when there are no records or none qualifies.
    """
    window = 50000
    thres = 10

    # Slicing already returns the whole sequence when it is shorter than the
    # window, so no explicit length test is needed.
    leftTail = leftSeg[-window:]
    rightHead = rightSeg[:window]
    lLen = len(leftTail)

    # Context managers make sure the handles are closed even if a write fails.
    with open(folderName + nameOfOut + "leftSeg.fasta", 'w') as f:
        f.write(">SegL\n")
        f.write(leftTail)

    with open(folderName + nameOfOut + "rightSeg.fasta", 'w') as f:
        f.write(">SegR\n")
        f.write(rightHead)

    alignerRobot.useMummerAlign(mummerLink,
                                folderName,
                                nameOfOut,
                                nameOfOut + "leftSeg.fasta",
                                nameOfOut + "rightSeg.fasta",
                                specialForRaw=False,
                                specialName="",
                                refinedVersion=True)
    dataList = alignerRobot.extractMumData(folderName, nameOfOut + "Out")

    best = [0, 0]
    for eachitem in dataList:
        touchesEnds = eachitem[1] > lLen - thres and eachitem[2] < thres
        if touchesEnds and eachitem[5] > best[1]:
            best = [eachitem[4], eachitem[5]]

    return best
def removeEmbedded(folderName, mummerLink):
    print "removeEmbedded"
    thres = 10
    os.system("sed -e 's/|//g' " + folderName + "contigs.fasta  > " +
              folderName + "contigs2.fasta")

    os.system("cp " + folderName + "contigs2.fasta " + folderName +
              "contigs.fasta")

    if True:
        print "removeEmbedded: Aligning contigs.fasta to contigs.fasta, outputs are self*.delta"
        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName,
            [["self", "contigs.fasta", "contigs.fasta", ""]],
            houseKeeper.globalParallel)
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName

    print "removeEmbedded: Extracting MUMmer data from delta files to selfOut"
    dataList = alignerRobot.extractMumData(folderName, "selfOut")

    dataList = alignerRobot.transformCoor(dataList)

    lenDic = IORobot.obtainLength(folderName, 'contigs.fasta')

    removeList = []
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[
            7], eachitem[8]

        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]

            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem

    nameList = []
    for eachitem in lenDic:
        nameList.append(eachitem)

    print len(nameList)

    for eachitem in removeList:
        if eachitem in nameList:
            nameList.remove(eachitem)
    print len(nameList)

    print "removeEmbedded: Outputting non-contained contigs to noEmbed.fasta"
    IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)
Пример #7
0
def removeEmbedded(folderName, mummerLink):
    print "removeEmbedded"

    thres = 10

    command = r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "raw_reads.fasta > " + folderName + houseKeeper.globalReadName
    os.system(command)

    command = r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "contigs.fasta > " + folderName + houseKeeper.globalContigName
    os.system(command)

    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [[
            "self", houseKeeper.globalContigName, houseKeeper.globalContigName,
            ""
        ]], houseKeeper.globalParallel)
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName

    dataList = alignerRobot.extractMumData(folderName, "selfOut")

    dataList = alignerRobot.transformCoor(dataList)

    lenDic = IORobot.obtainLength(folderName, houseKeeper.globalContigName)

    removeList = []
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[
            7], eachitem[8]

        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]

            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem

    nameList = []
    for eachitem in lenDic:
        nameList.append(eachitem)

    print len(nameList)

    for eachitem in removeList:
        if eachitem in nameList:
            nameList.remove(eachitem)
    print len(nameList)

    IORobot.putListToFileO(folderName, houseKeeper.globalContigName, "noEmbed",
                           nameList)
def removeEmbedded(folderName , mummerLink):
    print "removeEmbedded"
    
    thres = 10
    
    command= r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "raw_reads.fasta > " + folderName  + houseKeeper.globalReadName
    os.system(command)

    command= r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "contigs.fasta > " + folderName  + houseKeeper.globalContigName
    os.system(command)


    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [["self", houseKeeper.globalContigName, houseKeeper.globalContigName, ""]], houseKeeper.globalParallel)
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName
    
    dataList = alignerRobot.extractMumData(folderName, "selfOut")
    
    dataList = alignerRobot.transformCoor(dataList)
    
    lenDic = IORobot.obtainLength(folderName, houseKeeper.globalContigName)
    
    removeList = []
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]
        
        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]
            
            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem
                
    
    
    nameList = []
    for eachitem in lenDic:
        nameList.append(eachitem)

    print len(nameList)
    
    for eachitem in removeList:
        if eachitem in nameList:
            nameList.remove(eachitem)
    print len(nameList)
    
    IORobot.putListToFileO(folderName, houseKeeper.globalContigName, "noEmbed", nameList)
    
def removeEmbedded(folderName, mummerLink):
    print "removeEmbedded"
    thres = 10
    os.system("sed -e 's/|//g' " + folderName + "contigs.fasta  > " + folderName + "contigs2.fasta")

    os.system("cp " + folderName + "contigs2.fasta " + folderName + "contigs.fasta")

    if True:
        print "removeEmbedded: Aligning contigs.fasta to contigs.fasta, outputs are self*.delta"
        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName, [["self", "contigs.fasta", "contigs.fasta", ""]], houseKeeper.globalParallel
        )
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName

    print "removeEmbedded: Extracting MUMmer data from delta files to selfOut"
    dataList = alignerRobot.extractMumData(folderName, "selfOut")

    dataList = alignerRobot.transformCoor(dataList)

    lenDic = IORobot.obtainLength(folderName, "contigs.fasta")

    removeList = []
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]

        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]

            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem

    nameList = []
    for eachitem in lenDic:
        nameList.append(eachitem)

    print len(nameList)

    for eachitem in removeList:
        if eachitem in nameList:
            nameList.remove(eachitem)
    print len(nameList)

    print "removeEmbedded: Outputting non-contained contigs to noEmbed.fasta"
    IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)
Пример #10
0
def alignWithName(leftSeg, rightSeg, folderName, mummerLink, nameOfOut):
    """Align the tail of leftSeg with the head of rightSeg and pick one hit.

    The last (at most) 50000 characters of leftSeg and the first (at most)
    50000 characters of rightSeg are written, under the headers ">SegL" and
    ">SegR", to folderName + nameOfOut + "leftSeg.fasta" / "rightSeg.fasta".
    alignerRobot.useMummerAlign is run on the two files with job name
    nameOfOut, and the records are read from nameOfOut + "Out".

    A record qualifies when field 1 exceeds (written left length - 10) and
    field 2 is below 10; the qualifying record with the largest field 5 wins.
    Fields 1 and 2 are presumably the hit's end on the left segment and its
    start on the right segment -- confirm against extractMumData.

    Returns:
        [field 4, field 5] of the winning record as a list, or [0, 0] if
        there is no qualifying record.
    """
    maxLen = 50000
    thres = 10

    # A slice never exceeds the sequence, so short inputs are written whole.
    leftPart = leftSeg[-maxLen:]
    rightPart = rightSeg[0:maxLen]
    lLen = len(leftPart)

    # "with" guarantees the handles are released even when a write raises.
    with open(folderName + nameOfOut + "leftSeg.fasta", 'w') as f:
        f.write(">SegL\n")
        f.write(leftPart)

    with open(folderName + nameOfOut + "rightSeg.fasta", 'w') as f:
        f.write(">SegR\n")
        f.write(rightPart)

    alignerRobot.useMummerAlign(mummerLink, folderName, nameOfOut,
                                nameOfOut + "leftSeg.fasta",
                                nameOfOut + "rightSeg.fasta",
                                specialForRaw=False, specialName="",
                                refinedVersion=True)
    dataList = alignerRobot.extractMumData(folderName, nameOfOut + "Out")

    overlap = [0, 0]
    for eachitem in dataList:
        if eachitem[1] > lLen - thres and eachitem[2] < thres:
            if eachitem[5] > overlap[1]:
                overlap = [eachitem[4], eachitem[5]]

    return overlap
Пример #11
0
def align(leftSeg, rightSeg, folderName, mummerLink):
    """Align the tail of leftSeg with the head of rightSeg and pick one hit.

    The last (at most) 50000 characters of leftSeg and the first (at most)
    50000 characters of rightSeg are written to folderName + "leftSeg.fasta"
    and folderName + "rightSeg.fasta" with headers ">SegL" / ">SegR".  The
    files are aligned through alignerRobot.useMummerAlign under the job name
    "overlap" and the records are read back from "overlapOut".

    A record qualifies when field 1 exceeds (written left length - 10) and
    field 2 is below 10; the qualifying record with the largest field 5 is
    chosen.  Fields 1 and 2 are presumably the hit's end on the left segment
    and its start on the right segment -- confirm against extractMumData.

    Returns:
        [field 4, field 5] of the chosen record as a list, or [0, 0] if no
        record qualifies.
    """
    window = 50000
    thres = 10

    # Slices are clamped to the sequence length, so short inputs pass through
    # unchanged.
    leftTail = leftSeg[-window:]
    rightHead = rightSeg[:window]
    lLen = len(leftTail)

    # Context managers close the files even if writing fails part-way.
    with open(folderName + "leftSeg.fasta", "w") as f:
        f.write(">SegL\n")
        f.write(leftTail)

    with open(folderName + "rightSeg.fasta", "w") as f:
        f.write(">SegR\n")
        f.write(rightHead)

    alignerRobot.useMummerAlign(mummerLink, folderName, "overlap", "leftSeg.fasta", "rightSeg.fasta", False)

    dataList = alignerRobot.extractMumData(folderName, "overlapOut")

    overlap = [0, 0]
    for eachitem in dataList:
        reachesBothEnds = eachitem[1] > lLen - thres and eachitem[2] < thres
        if reachesBothEnds and eachitem[5] > overlap[1]:
            overlap = [eachitem[4], eachitem[5]]

    return overlap
Пример #12
0
def observeOverlap(folderName):

    dataList = alignerRobot.extractMumData(folderName, "selfOut")
    dataList = alignerRobot.transformCoor(dataList)
    lenDic = IORobot.obtainLength(folderName, 'contigs.fasta')
    matchThres = 10000
    nonMatchThres = 500
    count = 0

    newDataList = []
    for eachitem in dataList:
        name1, name2 = eachitem[-2], eachitem[-1]
        matchLen1, matchLen2 = eachitem[4], eachitem[5]
        start1, end1, start2, end2 = eachitem[0], eachitem[1], eachitem[
            2], eachitem[3]
        #        if name1!= name2 and min(lenDic[name1] - end1, lenDic[name2] - end2 ) > nonMatchThres \
        #        and min(start1, start2) > nonMatchThres \
        if name1!= name2 and ( min(lenDic[name1] - end1, lenDic[name2] - end2 ) > nonMatchThres \
        or min(start1, start2) > nonMatchThres ) \
        and matchLen1> matchThres:
            print "eachitem ", eachitem, lenDic[name1], lenDic[name2]
            count = count + 1
            newDataList.append(eachitem)

    print "Count: " + str(count)

    blkDic = getBreakPointFromDataList(folderName, newDataList)

    LCList = IORobot.loadContigsFromFile(folderName, "contigs.fasta")

    contigList = []

    for eachcontig in LCList:
        #print eachcontig
        if not eachcontig in blkDic:
            contigList = contigList + [LCList[eachcontig]]
        else:
            contigList = contigList + tmpBreakAcBkPts(LCList[eachcontig],
                                                      blkDic[eachcontig])

    print "len(contigList)", len(contigList)
    IORobot.writeSegOut(contigList, folderName, "breakChains.fasta")
Пример #13
0
def fillInMissed(folderName, mummerLink, filerefname, filequeryname, fileoutname):
    
    os.system("mv " + folderName + fileoutname + " " + folderName + filequeryname )
    alignerRobot.useMummerAlignBatch(mummerLink, folderName, [[fileoutname+"fillmiss", filerefname, filequeryname, ""]], houseKeeper.globalParallel)
    
    dataList = alignerRobot.extractMumData(folderName, fileoutname+"fillmissOut")

    lenDic = obtainLength(folderName, filerefname)

    ### Check if there is any missing parts 

    # Format of the dataList :  1      765  |    11596    10822  |      765      775  |    84.25  | ref_NC_001133_       scf7180000000702"
    
    dataList.sort(key = itemgetter(-2))
    thres = 100
    extraList = []

    for key, items in groupby(dataList, itemgetter(-2)):
        isFound = False
        for eachitem in items:
            if abs(int(eachitem[4])  - lenDic[key]) < thres:
                isFound = True
                break

        if not isFound:
            extraList.append(key)

    ### Fill in any missing items
    
    referenceDic = loadContigsFromFile(folderName, filerefname)
    queryDic = loadContigsFromFile(folderName, filequeryname)
    
    ctgList = [referenceDic[eachitem] for eachitem in extraList] + [queryDic[eachitem] for eachitem in queryDic]
    writeSegOut(ctgList, folderName, fileoutname)

    print "fileoutname: len(extraList)",fileoutname,  len(extraList), len(ctgList)
Пример #14
0
def obtainLinkInfo(folderName, mummerLink, inputFile, mummerFile):
    thres = 5
    minLen = 400
    # thres = 10
    # minLen = 200

    writeToFile_Double1(folderName, inputFile + ".fasta",
                        inputFile + "_Double.fasta", "contig")

    fmyFile = open(folderName + inputFile + "_Double.fasta", 'r')
    fSmaller = open(folderName + inputFile + "_contigs_Double.fasta", 'w')

    tmp = fmyFile.readline().rstrip()
    maxSize = 50000

    myName = ""
    while len(tmp) > 0:
        if tmp[0] == '>':
            fSmaller.write(tmp + '\n')
            myName = tmp[1:]
        else:
            component = tmp[0:min(len(tmp), maxSize)]
            countComp = len(component)
            fSmaller.write(component)

            component = tmp[max(0, len(tmp) - maxSize):len(tmp)]
            fSmaller.write(component)
            countComp = countComp + len(component)

            print "DebugName", myName, countComp
            fSmaller.write('\n')

        tmp = fmyFile.readline().rstrip()

    fSmaller.close()
    fmyFile.close()

    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [[
            mummerFile, inputFile + "_contigs_Double.fasta",
            inputFile + "_contigs_Double.fasta", ""
        ]], houseKeeper.globalParallel)

        # alignerRobot.useMummerAlign(mummerLink, folderName, mummerFile, inputFile + "_contigs_Double.fasta", inputFile + "_contigs_Double.fasta")

    lengthDic = obtainLength(folderName, inputFile + "_contigs_Double.fasta")

    dataSetRaw = alignerRobot.extractMumData(folderName, mummerFile + "Out")

    # ## Format [ helperStart, helperEnd , readStart, readEnd,matchLen1,matchLen2,percentMatch,helperName,readName]

    dataSet = []

    for eachitem in dataSetRaw:
        helperStart, helperEnd, readStart, readEnd, matchLen1, matchLen2, percentMatch, helperName, readName = eachitem

        detailHelper = helperName.split('_')
        detailRead = readName.split('_')

        if detailHelper[0] != detailRead[0] and helperName != readName and max(
                matchLen1, matchLen2) > minLen and readStart < readEnd and min(
                    helperStart, readStart) < thres and min(
                        lengthDic[helperName] - helperEnd,
                        lengthDic[readName] - readEnd) + 1 < thres:
            conditionForMatch = True
        else:
            conditionForMatch = False

        if conditionForMatch:
            if helperStart < thres:

                dataSet.append((max(matchLen1,
                                    matchLen2), readName, helperName))

    dataSet.sort(reverse=True)

    numberOfContig = len(lengthDic)

    return numberOfContig, dataSet
Пример #15
0
def obtainLinkInfo(folderName, mummerLink, inputFile, mummerFile):
    # minLen = 200
    minLen = 400
    
    if houseKeeper.globalRelaxThres == False:
        thres = 5
    elif houseKeeper.globalRelaxThres == True:
        thres = 10
    
    
    writeToFile_Double1(folderName, inputFile + ".fasta", inputFile + "_Double.fasta", "contig")
    
    fmyFile = open(folderName + inputFile + "_Double.fasta", 'r')
    fSmaller = open(folderName + inputFile + "_contigs_Double.fasta", 'w')

    tmp = fmyFile.readline().rstrip()
    maxSize = 50000

    myName = ""
    while len(tmp) > 0:
        if tmp[0] == '>':
            fSmaller.write(tmp + '\n')
            myName = tmp[1:]
        else:
            component = tmp[0:min(len(tmp), maxSize)] 
            countComp = len(component)
            fSmaller.write(component)
            
            component = tmp[max(0, len(tmp) - maxSize):len(tmp)]
            fSmaller.write(component)
            countComp = countComp + len(component)
            

            print "DebugName", myName, countComp
            fSmaller.write('\n')

        tmp = fmyFile.readline().rstrip()

    fSmaller.close()
    fmyFile.close()
    
    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [[mummerFile, inputFile + "_contigs_Double.fasta", inputFile + "_contigs_Double.fasta", ""]], houseKeeper.globalParallel )
        
        # alignerRobot.useMummerAlign(mummerLink, folderName, mummerFile, inputFile + "_contigs_Double.fasta", inputFile + "_contigs_Double.fasta")
        
        
    lengthDic = obtainLength(folderName, inputFile + "_contigs_Double.fasta") 
    
    dataSetRaw = alignerRobot.extractMumData(folderName, mummerFile + "Out")
    
    # ## Format [ helperStart, helperEnd , readStart, readEnd,matchLen1,matchLen2,percentMatch,helperName,readName]
    
    
    dataSet = []
    
    for eachitem in dataSetRaw: 
        helperStart, helperEnd , readStart, readEnd, matchLen1, matchLen2, percentMatch, helperName, readName = eachitem 
        
        detailHelper = helperName.split('_')
        detailRead = readName.split('_')
        

        if detailHelper[0] != detailRead[0] and  helperName != readName and max(matchLen1, matchLen2) > minLen and readStart < readEnd  and min(helperStart, readStart) < thres and min(lengthDic[helperName] - helperEnd, lengthDic[readName] - readEnd) + 1 < thres:
            conditionForMatch = True
        else:
            conditionForMatch = False

        if conditionForMatch :
            if helperStart < thres:
                
                dataSet.append((max(matchLen1, matchLen2), readName, helperName))
    
    dataSet.sort(reverse=True)
    
    numberOfContig = len(lengthDic)
    
    return numberOfContig, dataSet