Example #1
def alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink,
                      header):
    #alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    numberOfFiles = houseKeeper.globalParallelFileNum
    bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
    command = bindir + "/finisherSCCoreLib/fasta-splitter.pl --n-parts " + str(
        numberOfFiles) + " " + folderName + queryFile
    os.system(command)

    os.system("cp *.fasta " + folderName)
    os.system("rm *.fasta ")

    workerList = []

    for dummyI in range(1, numberOfFiles + 1):
        indexOfMum = ""
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)

        outputName, referenceName, queryName, specialName = header + indexOfMum, referenceFile, queryFile[
            0:-6] + ".part-" + indexOfMum + ".fasta", header + indexOfMum
        workerList.append([outputName, referenceName, queryName, specialName])

    alignerRobot.useMummerAlignBatch(mummerLink,
                                     folderName,
                                     workerList,
                                     houseKeeper.globalParallel,
                                     specialForRaw=False,
                                     refinedVersion=False)
    alignerRobot.combineMultipleCoorMum(True, mummerLink, folderName, header,
                                        header + "Out", numberOfFiles)
Example #2

def alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header):
    #alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    numberOfFiles = 20
    bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
    command = bindir + "/finisherSCCoreLib/fasta-splitter.pl --n-parts " + str(numberOfFiles) + " " + folderName + queryFile
    os.system(command)

    os.system("cp *.fasta " + folderName)
    os.system("rm *.fasta ")

    workerList = []

    for dummyI in range(1, numberOfFiles + 1):
        indexOfMum = ""
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)

        outputName, referenceName, queryName, specialName = header + indexOfMum, referenceFile, queryFile[0:-6] + ".part-" + indexOfMum + ".fasta", header + indexOfMum
        workerList.append([outputName, referenceName, queryName, specialName])

    alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, specialForRaw=False, refinedVersion=False)
    alignerRobot.combineMultipleCoorMum(True, mummerLink, folderName, header, header + "Out", numberOfFiles)
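Both versions above pad the chunk index to two digits so that the worker names line up with the .part-NN.fasta files produced by fasta-splitter.pl. The same worker-list construction can be written more compactly with string formatting; a minimal sketch (the helper name is ours, and the two-digit padding and file-naming convention are taken from the code above, not from fasta-splitter's documentation):

# Sketch: build the same [output, reference, query, special] entries with a
# zero-padded part index ("%02d" gives "01".."09", "10", ... like the if/else above).
def buildWorkerList(header, referenceFile, queryFile, numberOfFiles):
    workerList = []
    for i in range(1, numberOfFiles + 1):
        indexOfMum = "%02d" % i
        queryName = queryFile[0:-6] + ".part-" + indexOfMum + ".fasta"
        workerList.append([header + indexOfMum, referenceFile, queryName, header + indexOfMum])
    return workerList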
Example #3
def generateAbundanceGraph(folderName, mummerLink):

    print "generateAbundanceGraph"

    """
    1. Find your favorite mappers to map read back
        a. MUMmer, Bowtie, bbmap, any that works V 
        b. And then write a short parser to parse the results V 
    """
    numberOfFiles = 20
    workerList = []
    for dummyI in range(1, numberOfFiles + 1):
        indexOfMum = ""
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)

        """
        "outGapFillRefine"+indexOfMum , "smaller_improvedContig.fasta",  "relatedReads_Double.part-" + indexOfMum + ".fasta",  "fromMumRefine" + indexOfMum
        """
        outputName, referenceName, queryName, specialName = (
            "outAbun" + indexOfMum,
            "improved3.fasta",
            "raw_reads.part-" + indexOfMum + ".fasta",
            "outAbun" + indexOfMum,
        )
        workerList.append([outputName, referenceName, queryName, specialName])

    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, False)
        """
        command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + "out " + folderName + "improved3.fasta "+folderName+"raw_reads.part-" + indexOfMum + ".fasta"
        os.system(command)
    
        command = mummerLink + "show-coords -r " + folderName + "out.delta > " + folderName + "fromMumAbun" + indexOfMum
        os.system(command)
        """

    dataList = []

    for i in range(1, 1 + numberOfFiles):
        if i < 10:
            indexOfMum = "0" + str(i)
        else:
            indexOfMum = str(i)
        dataList = dataList + alignerRobot.extractMumData(folderName, "outAbun" + str(indexOfMum) + "Out")

    """
    2. Calculate count on the abundances 
        a. Aggregate by taking average [put weights on bin along contigs]
        b. Inheritance and a subclass 
    """

    lenDic = IORobot.obtainLength(folderName, "improved3.fasta")
    readLenDic = IORobot.obtainLength(folderName, "raw_reads.fasta")

    myCountDic = {}
    for eachitem in lenDic:
        myCountDic[eachitem] = [0 for i in range(lenDic[eachitem])]

    thres = 30
    lenSum = 0
    extraDataList = []

    print "len(dataList)", len(dataList)

    if not abunHouseKeeper.abunGlobalAvoidrefine:
        myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, True)
        extraDataList = alignerRobot.extractMumData(folderName, "abunMissOut")
    else:
        extraDataList = []

    dataList = dataList + extraDataList
    myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, False)

    with open(folderName + "myCountDic.json", "w") as f:
        json.dump(myCountDic, f)

    return myCountDic
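The counts are persisted as myCountDic.json inside folderName, so a later stage (or a quick manual check) can reload them without re-running the alignments. A minimal sketch, with the folder path as a placeholder:

import json

# Sketch: reload the per-contig abundance values written by generateAbundanceGraph.
folderName = "assemblyFolder/"  # placeholder path, not taken from the original code
with open(folderName + "myCountDic.json") as f:
    myCountDic = json.load(f)
for name in sorted(myCountDic):
    print name, myCountDic[name]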
Example #4
def evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, continueFilter):

    myCountDic = {}
    for eachitem in lenDic:
        myCountDic[eachitem] = 0

    dataList.sort(key=itemgetter(-1))

    ctkk, ctbase = 0, 0
    toAddBackDic = copy.deepcopy(readLenDic)

    for key, items in groupby(dataList, itemgetter(-1)):
        maxMatch = -1
        bestname = ""

        for eachitem in items:
            ct = eachitem[6] / 100.0 * eachitem[4]
            if ct > maxMatch:
                maxMatch = ct
                bestname = eachitem[-2]
        myCountDic[bestname] += readLenDic[key]

        ctkk = ctkk + 1
        ctbase = ctbase + readLenDic[key]
        toAddBackDic[key] = -1

    cttot = 0
    for eachitem in readLenDic:
        cttot = cttot + readLenDic[eachitem]

    print "Missed coverage  ", (cttot - ctbase) / (4.7 * pow(10, 6))
    print "percentage miss read", (len(readLenDic) - ctkk) / (1.0 * len(readLenDic))

    toAddReadList = []
    for eachitem in toAddBackDic:
        if toAddBackDic[eachitem] >= 0:
            toAddReadList.append(eachitem)

    """
    This part need the most parallelism because it is most intense with -l 10 
    split V, workerList V , combine 
    """

    if continueFilter:
        numberOfFiles = 20

        IORobot.putListToFileO(folderName, "raw_reads.fasta", "selected_raw", toAddReadList)

        bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
        command = (
            bindir
            + "/finisherSCCoreLib/fasta-splitter.pl --n-parts "
            + str(numberOfFiles)
            + " "
            + folderName
            + "selected_raw.fasta"
        )
        os.system(command)

        workerList = []

        for dummyI in range(1, numberOfFiles + 1):
            indexOfMum = ""
            if dummyI < 10:
                indexOfMum = "0" + str(dummyI)
            else:
                indexOfMum = str(dummyI)

            outputName, referenceName, queryName, specialName = (
                "outAbunRefine" + indexOfMum,
                "improved3.fasta",
                "selected_raw.part-" + indexOfMum + ".fasta",
                "abunMissOut" + indexOfMum,
            )
            workerList.append([outputName, referenceName, queryName, specialName])

        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName, workerList, houseKeeper.globalParallel, specialForRaw=True, refinedVersion=True
        )
        alignerRobot.combineMultipleCoorMum(True, mummerLink, folderName, "outAbunRefine", "abunMissOut", numberOfFiles)

    for i in range(len(myCountDic)):
        eachitem = "Segkk" + str(i)
        print eachitem, myCountDic[eachitem] / (1.0 * lenDic[eachitem])
        myCountDic[eachitem] = myCountDic[eachitem] / (1.0 * lenDic[eachitem])

    return myCountDic
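The per-read best-hit selection above depends on the column layout of the parsed show-coords rows: match length at index 4, percent identity at index 6, reference (contig) name at index -2 and query (read) name at index -1, which agrees with the sample rows quoted in Example #6 below. A self-contained sketch of just that step:

from itertools import groupby
from operator import itemgetter

# Sketch: for each read, keep the alignment with the largest identity-weighted
# match length and credit the read's length to the winning contig.
# Assumed row layout: [S1, E1, S2, E2, LEN1, LEN2, %IDY, contigName, readName].
def bestHitCounts(dataList, readLenDic):
    counts = {}
    dataList.sort(key=itemgetter(-1))            # group rows by read name
    for readName, rows in groupby(dataList, itemgetter(-1)):
        best = max(rows, key=lambda r: r[6] / 100.0 * r[4])
        counts[best[-2]] = counts.get(best[-2], 0) + readLenDic[readName]
    return counts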
Example #5

def getAllAssociatedReads(folderName, mummerLink, forFastaName):
    '''
    Input : relatedReads.fasta, raw_reads.fasta 
    Output : all_associated_reads.fasta
    
     Algorithm : 
        a) Get all the associated reads
        b) Loop for N=1 times : ==> this correspond 4 reads to link between the bridge in total
            i) Align the raws and tmp_seedReads
            ii) Put the new reads into the SeedReads
    '''
    
    header, referenceFile, queryFile = "seedReads", forFastaName + ".fasta" , "raw_reads.fasta"
    
    gapFiller.formRelatedReadsFile(folderName, mummerLink, "improved3")
    
    command = "cp " + folderName + "relatedReads.fasta " + folderName + referenceFile
    os.system(command)
    
    N = abunHouseKeeper.abunGlobalReadSearchDepth
    
    print "N: ", N
    if N > 0:
        for trial in range(N):
            print "trial", trial
            numberOfFiles = houseKeeper.globalParallelFileNum
            
            if True: 
                workerList = []
                
                for dummyI in range(1, numberOfFiles + 1):
                    indexOfMum = ""
                    if dummyI < 10:
                        indexOfMum = "0" + str(dummyI)
                    else:
                        indexOfMum = str(dummyI)
                    
                    outputName, referenceName, queryName, specialName = header + indexOfMum, referenceFile, "raw_reads.part-" + indexOfMum + ".fasta", header + indexOfMum
                    workerList.append([outputName, referenceName, queryName, specialName])
    
                alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, False)
            
            dataList = []
            
            for i in range(1, 1+numberOfFiles): 
                if i < 10:
                    indexOfMum = "0" + str(i)
                else:
                    indexOfMum = str(i)
                dataList = dataList + alignerRobot.extractMumData(folderName, header + str(indexOfMum) + "Out")
            
            
            filterList = []
            
            lenDicRR = IORobot.obtainLength(folderName, queryFile)
            
            print "len(dataList)", len(dataList)
            for eachitem in dataList:
                if checkSatisfy(eachitem, lenDicRR):
                    filterList.append(eachitem)
                
            filterList.sort(key=itemgetter(-1))
            newReads = []
            
            for key, items in groupby(filterList, itemgetter(-1)):
                newReads.append(key)
                                        
            
            f = open(folderName + forFastaName + ".txt", 'w')
            
            for eachitem in newReads:
                f.write(eachitem + "\n")
            f.close()
                
            command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + forFastaName + ".txt " + folderName + "raw_reads.fasta > " + folderName + forFastaName + ".fasta"
            os.system(command)
    else:
        os.system("cp " + folderName + "relatedReads.fasta " + folderName + forFastaName + ".fasta")
Example #6
def formReadContigStringGraph(folderName,
                              mummerLink,
                              contigFilename,
                              readsetFilename,
                              optTypeFileHeader,
                              graphName,
                              needAlignment=True):
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm: 
        a) Form double reads and contigs                            V
        b) Mummer the data and extract dataList three times         V
        c) Use the subroutine to output a graph                     V
        d) Output the graph to a file phasing_String_graph.graph    V
    '''

    G = []

    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta",
                                contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta",
                                readsetFilename + "_Double.fasta", "reads")

    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta", contigFilename + "_Double.fasta"

    #if needAlignment:
    #    alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    if needAlignment:
        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName, [[header, referenceFile, queryFile, ""]],
            houseKeeper.globalParallel)

    lenDicCC = IORobot.obtainLength(folderName,
                                    contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)

    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta", readsetFilename + "_Double.fasta"

    lenDicRR = IORobot.obtainLength(folderName,
                                    readsetFilename + "_Double.fasta")

    if not abunHouseKeeper.abunGlobalRRDisable:
        if needAlignment:
            alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink,
                              header)

        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)

    else:
        dataListRR = []

    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    if needAlignment:
        alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink,
                          header)

    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)

    numberOfNodes = len(lenDicCR)
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    
    '''

    addDataToList(dataListCC, G, 0, 0, 'C', 'C')

    addDataToList(dataListRR, G, N1, N1, 'R', 'R')

    addDataToList(dataListCR, G, 0, N1, 'C', 'R')

    Gnew = formExtraEdges(folderName, optTypeFileHeader, contigFilename, G, N1)

    Gnew.saveToFile(folderName, graphName)

    print "len(Gnew.graphNodesList)", len(Gnew.graphNodesList)
Example #7
def getAllAssociatedReads(folderName, mummerLink, forFastaName):
    '''
    Input : relatedReads.fasta, raw_reads.fasta 
    Output : all_associated_reads.fasta
    
     Algorithm : 
        a) Get all the associated reads
        b) Loop for N=1 times : ==> this correspond 4 reads to link between the bridge in total
            i) Align the raws and tmp_seedReads
            ii) Put the new reads into the SeedReads
    '''
    
    header, referenceFile, queryFile = "seedReads", forFastaName + ".fasta" , "raw_reads.fasta"
    command = "cp " + folderName + "relatedReads.fasta " + folderName + referenceFile
    os.system(command)
    
    N = abunHouseKeeper.abunGlobalReadSearchDepth
    
    print "N: ", N
    if N > 0:
        for trial in range(N):
            print "trial", trial
            numberOfFiles = 20
            
            if True: 
                workerList = []
                
                for dummyI in range(1, numberOfFiles + 1):
                    indexOfMum = ""
                    if dummyI < 10:
                        indexOfMum = "0" + str(dummyI)
                    else:
                        indexOfMum = str(dummyI)
                    
                    outputName, referenceName, queryName, specialName = header + indexOfMum, referenceFile, "raw_reads.part-" + indexOfMum + ".fasta", header + indexOfMum
                    workerList.append([outputName, referenceName, queryName, specialName])
    
                alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, False)
            
            dataList = []
            
            for i in range(1, 1+numberOfFiles): 
                if i < 10:
                    indexOfMum = "0" + str(i)
                else:
                    indexOfMum = str(i)
                dataList = dataList + alignerRobot.extractMumData(folderName, header + str(indexOfMum) + "Out")
            
            
            filterList = []
            
            lenDicRR = IORobot.obtainLength(folderName, queryFile)
            
            print "len(dataList)", len(dataList)
            for eachitem in dataList:
                if checkSatisfy(eachitem, lenDicRR):
                    filterList.append(eachitem)
                
            filterList.sort(key=itemgetter(-1))
            newReads = []
            
            for key, items in groupby(filterList, itemgetter(-1)):
                newReads.append(key)
                                        
            
            f = open(folderName + forFastaName + ".txt", 'w')
            
            for eachitem in newReads:
                f.write(eachitem + "\n")
            f.close()
                
            command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + forFastaName + ".txt " + folderName + "raw_reads.fasta > " + folderName + forFastaName + ".fasta"
            os.system(command)
    else:
        os.system("cp " + folderName + "relatedReads.fasta " + folderName + forFastaName + ".fasta")
Example #8
def generateAbundanceGraph(folderName, mummerLink):
    
    
    print "generateAbundanceGraph"
    
    '''
    1. Find your favorite mappers to map read back
        a. MUMmer, Bowtie, bbmap, any that works V 
        b. And then write a short parser to parse the results V 
    '''
    numberOfFiles = 20
    workerList = []
    for dummyI in range(1, numberOfFiles + 1):
        indexOfMum = ""
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)
        
        '''
        "outGapFillRefine"+indexOfMum , "smaller_improvedContig.fasta",  "relatedReads_Double.part-" + indexOfMum + ".fasta",  "fromMumRefine" + indexOfMum
        '''
        outputName, referenceName, queryName, specialName= "outAbun"+indexOfMum, "improved3.fasta", "raw_reads.part-"+ indexOfMum + ".fasta",  "outAbun" + indexOfMum
        workerList.append([outputName, referenceName, queryName, specialName])
    
    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, False)
        '''
        command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + "out " + folderName + "improved3.fasta "+folderName+"raw_reads.part-" + indexOfMum + ".fasta"
        os.system(command)
    
        command = mummerLink + "show-coords -r " + folderName + "out.delta > " + folderName + "fromMumAbun" + indexOfMum
        os.system(command)
        '''
        
    dataList = []
    
    for i in range(1, 1+numberOfFiles): 
        if i < 10:
            indexOfMum = "0" + str(i)
        else:
            indexOfMum = str(i)
        dataList = dataList+ alignerRobot.extractMumData(folderName, "outAbun"+ str(indexOfMum)+"Out")
    

    '''
    2. Calculate count on the abundances 
        a. Aggregate by taking average [put weights on bin along contigs]
        b. Inheritance and a subclass 
    '''
         
    lenDic = IORobot.obtainLength(folderName, "improved3.fasta")
    readLenDic = IORobot.obtainLength(folderName , "raw_reads.fasta")
    

    myCountDic = {}
    for eachitem in lenDic:
        myCountDic[eachitem] = [0 for i in range(lenDic[eachitem])]

    thres = 30
    lenSum = 0
    extraDataList= []
    
    
    print "len(dataList)", len(dataList)
    
    if not abunHouseKeeper.abunGlobalAvoidrefine: 
        myCountDic =  evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink,  True)
        extraDataList = alignerRobot.extractMumData(folderName, "abunMissOut" )
    else:
        extraDataList = []
        
    dataList = dataList + extraDataList
    myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, False)
    
    with open(folderName + 'myCountDic.json', 'w') as f:
        json.dump(myCountDic, f)

    
    return myCountDic
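Several of these examples rely on IORobot.obtainLength to turn a FASTA file into a {name: length} dictionary. For readers without the finisherSC code at hand, a rough equivalent of the presumed behaviour (a sketch, not the library's implementation; it takes the first whitespace-delimited token of each header as the name):

# Sketch: build {sequence name: sequence length} from a FASTA file, approximating
# how IORobot.obtainLength is used in the examples above.
def obtainLengthSketch(folderName, fastaName):
    lenDic = {}
    name = None
    with open(folderName + fastaName) as f:
        for line in f:
            if line.startswith(">"):
                name = line[1:].split()[0]
                lenDic[name] = 0
            elif name is not None:
                lenDic[name] += len(line.strip())
    return lenDic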
Example #9
def evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, continueFilter):
    
    myCountDic = {}
    for eachitem in lenDic:
        myCountDic[eachitem] = 0
            
    dataList.sort(key=itemgetter(-1))
    
    ctkk, ctbase = 0, 0
    toAddBackDic = copy.deepcopy(readLenDic)
    
    for key, items in groupby(dataList, itemgetter(-1)):
        maxMatch = -1
        bestname = ""
        
        for eachitem in items:
            ct = eachitem[6]/100.0 * eachitem[4]
            if ct > maxMatch:
                maxMatch = ct 
                bestname = eachitem[-2]
        myCountDic[bestname] += readLenDic[key] 
        
        ctkk = ctkk + 1 
        ctbase = ctbase + readLenDic[key]
        toAddBackDic[key] = -1
    
    cttot = 0
    for eachitem in readLenDic:
        cttot = cttot + readLenDic[eachitem]
        
    print "Missed coverage  ", (cttot - ctbase)/(4.7*pow(10, 6))
    print "percentage miss read", (len(readLenDic) - ctkk)/(1.0*len(readLenDic)) 
    
    toAddReadList = []
    for eachitem in toAddBackDic:
        if toAddBackDic[eachitem] >= 0 :
            toAddReadList.append(eachitem)
    
    '''
    This part need the most parallelism because it is most intense with -l 10 
    split V, workerList V , combine 
    '''
    
    if continueFilter:
        numberOfFiles = 20
        
        IORobot.putListToFileO(folderName, "raw_reads.fasta" , "selected_raw", toAddReadList)
        
        bindir =  os.path.abspath(os.path.dirname(sys.argv[0]))   
        command = bindir + "/finisherSCCoreLib/fasta-splitter.pl --n-parts " + str(numberOfFiles) + " " + folderName + "selected_raw.fasta"
        os.system(command)
        
        workerList = []
        
        for dummyI in range(1, numberOfFiles + 1):
            indexOfMum = ""
            if dummyI < 10:
                indexOfMum = "0" + str(dummyI)
            else:
                indexOfMum = str(dummyI)
           
            outputName, referenceName, queryName, specialName= "outAbunRefine"+indexOfMum, "improved3.fasta", "selected_raw.part-"+ indexOfMum + ".fasta",  "abunMissOut" + indexOfMum
            workerList.append([outputName, referenceName, queryName, specialName])
            
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, specialForRaw=True, refinedVersion=True)
        alignerRobot.combineMultipleCoorMum(True, mummerLink, folderName, "outAbunRefine", "abunMissOut", numberOfFiles)
        

        
    for i in range(len(myCountDic)):
        eachitem = "Segkk"+str(i)
        print eachitem , myCountDic[eachitem]/(1.0*lenDic[eachitem])
        myCountDic[eachitem] = myCountDic[eachitem]/(1.0*lenDic[eachitem])
        
    return myCountDic
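The hard-coded 4.7 * pow(10, 6) in the "Missed coverage" print is presumably the approximate genome size in bp (roughly an E. coli-sized target), so the metric is simply unaligned read bases divided by genome size. A parameterised sketch of the same two diagnostics, with the genome size made explicit:

# Sketch: the two diagnostics printed above, with genomeSizeBp as a parameter
# instead of the hard-coded 4.7 Mbp value; alignedReadNames is the set of reads
# that received a best hit.
def coverageDiagnostics(readLenDic, alignedReadNames, genomeSizeBp):
    cttot = sum(readLenDic.values())
    ctbase = sum(readLenDic[name] for name in alignedReadNames)
    missedCoverage = (cttot - ctbase) / float(genomeSizeBp)
    missedReadFraction = (len(readLenDic) - len(alignedReadNames)) / float(len(readLenDic))
    return missedCoverage, missedReadFraction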
Example #10
def evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink,
                     continueFilter, contigFilename):
    '''
    not sure if that is the right documentation... 

    Input : string_graph_3, improved3.fasta, raw_reads.fasta
    Output : string_graph_4 with weights [need a data structure to store the weight on node]

    Algorithm : 
    1. Find your favorite mappers to map read back
        a. MUMmer, Bowtie, bbmap, any that works V 
        b. And then write a short parser to parse the results V 
    2. Calculate count on the abundances 
        a. Aggregate by taking average [put weights on bin along contigs]
        b. Inheritance and a subclass 
    3. Find your favorite graphical tool to display 
        a. Use a javascript library [halfviz should just work ! put weight on edge ]

    '''
    myCountDic = {}
    for eachitem in lenDic:
        myCountDic[eachitem] = 0

    dataList.sort(key=itemgetter(-1))

    ctkk, ctbase = 0, 0
    toAddBackDic = copy.deepcopy(readLenDic)

    for key, items in groupby(dataList, itemgetter(-1)):
        maxMatch = -1
        bestname = ""

        for eachitem in items:
            ct = eachitem[6] / 100.0 * eachitem[4]
            if ct > maxMatch:
                maxMatch = ct
                bestname = eachitem[-2]
        myCountDic[bestname] += readLenDic[key]

        ctkk = ctkk + 1
        ctbase = ctbase + readLenDic[key]
        toAddBackDic[key] = -1

    cttot = 0
    for eachitem in readLenDic:
        cttot = cttot + readLenDic[eachitem]

    print "Missed coverage  ", (cttot - ctbase) / (4.7 * pow(10, 6))
    print "percentage miss read", (len(readLenDic) - ctkk) / (1.0 *
                                                              len(readLenDic))

    toAddReadList = []
    for eachitem in toAddBackDic:
        if toAddBackDic[eachitem] >= 0:
            toAddReadList.append(eachitem)
    '''
    This part need the most parallelism because it is most intense with -l 10 
    split V, workerList V , combine 
    '''

    if continueFilter:
        numberOfFiles = houseKeeper.globalParallelFileNum

        IORobot.putListToFileO(folderName, "raw_reads.fasta", "selected_raw",
                               toAddReadList)

        bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
        command = bindir + "/finisherSCCoreLib/fasta-splitter.pl --n-parts " + str(
            numberOfFiles) + " " + folderName + "selected_raw.fasta"
        os.system(command)

        workerList = []

        for dummyI in range(1, numberOfFiles + 1):
            indexOfMum = ""
            if dummyI < 10:
                indexOfMum = "0" + str(dummyI)
            else:
                indexOfMum = str(dummyI)

            outputName, referenceName, queryName, specialName = "outAbunRefine" + indexOfMum, contigFilename + ".fasta", "selected_raw.part-" + indexOfMum + ".fasta", "abunMissOut" + indexOfMum
            workerList.append(
                [outputName, referenceName, queryName, specialName])

        alignerRobot.useMummerAlignBatch(mummerLink,
                                         folderName,
                                         workerList,
                                         houseKeeper.globalParallel,
                                         specialForRaw=True,
                                         refinedVersion=True)
        alignerRobot.combineMultipleCoorMum(True, mummerLink, folderName,
                                            "outAbunRefine", "abunMissOut",
                                            numberOfFiles)

    for eachitem in lenDic:
        #eachitem = "Segkk"+str(i)
        print eachitem, myCountDic[eachitem] / (1.0 * lenDic[eachitem])
        myCountDic[eachitem] = myCountDic[eachitem] / (1.0 * lenDic[eachitem])

    return myCountDic
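Because the alignment refinement only runs when continueFilter is True, this last variant can be exercised on toy data without MUMmer installed. A hypothetical usage sketch (all names and numbers invented for illustration; it assumes the example's module-level imports of itemgetter, groupby and copy):

# Hypothetical toy input: one contig and two reads, with coords rows in the
# assumed layout [S1, E1, S2, E2, LEN1, LEN2, %IDY, contigName, readName].
lenDic = {"Contig0": 1000}
readLenDic = {"Read0": 400, "Read1": 600}
dataList = [
    [1, 400, 1, 400, 400, 400, 99.0, "Contig0", "Read0"],
    [100, 700, 1, 600, 600, 600, 98.0, "Contig0", "Read1"],
]
myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, "dummyFolder/", "",
                              False, "improved3")
# -> {"Contig0": 1.0}: (400 + 600) aligned read bases over a 1000 bp contig.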