def xPhased(folderName , mummerLink):
    """Repeat resolution on the contig graph, using read-supported edges (proxy for MB).

    Outline:
      1. Re-form the contig string graph with ALL connections from contigs only.
      2. Load the edges logged from reads and their associated blocked contigs.
      3. Use reads to connect: a contig-to-contig link is inserted only when a
         read-derived edge joins the same two nodes with enough support.
      4. Transform / condense the graph (``G.MBResolve()``).
      5. Read out contigs.

    Args:
        folderName: working folder, forwarded to the IORobot helpers,
            ``loadEdgeFromBlockedReads`` and ``G.saveToFile``.
        mummerLink: forwarded unchanged to ``IORobot.obtainLinkInfo`` and
            ``IORobot.readContigOut`` (presumably the MUMmer location -- confirm
            against those helpers).

    Returns:
        None. Results are produced through the helpers, which are handed the file
        names "condensedGraphMB.txt", "improved3.fasta" and "openZoneMB.txt".
    """
    numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "improved2", "mb")
    lenDic = IORobot.obtainLength(folderName, "improved2_Double.fasta")

    # Minimum contig length required on both ends of a link; 0 disables the filter.
    confidenLenThres = 0

    G = graphLib.seqGraph(numberOfContig)
    extraEdges = loadEdgeFromBlockedReads(folderName)

    # Index the read-derived edges once: (start, end) -> largest min(len1, len2).
    # A link of weight wt is supported iff some edge between the same nodes has
    # min(len1, len2) >= wt, i.e. iff the best such value is >= wt.  This replaces
    # a full rescan of extraEdges for every link.
    readSupport = {}
    for eachedge in extraEdges:
        edgeKey = (eachedge[0], eachedge[1])
        support = min(eachedge[2], eachedge[3])
        if edgeKey not in readSupport or support > readSupport[edgeKey]:
            readSupport[edgeKey] = support

    for wt, myin, myout in dataSet:
        # Names are sliced past their first 6 characters and split on '_'
        # (presumably "Contig<N>_<p|d>" -- confirm with IORobot.obtainLinkInfo).
        myInData = myin[6:].split('_')
        myOutData = myout[6:].split('_')

        # Node index is 2 * N for the 'p' orientation and 2 * N + 1 otherwise.
        offsetin = 0 if myInData[1] == 'p' else 1
        offsetout = 0 if myOutData[1] == 'p' else 1

        i = int(myInData[0]) * 2 + offsetin
        j = int(myOutData[0]) * 2 + offsetout

        # The length lookups stay behind the support test, so lenDic is only
        # consulted for links that have a supporting read edge.
        if ((i, j) in readSupport and readSupport[(i, j)] >= wt
                and lenDic[myin] >= confidenLenThres
                and lenDic[myout] >= confidenLenThres):
            G.insertEdge(i, j, wt)

    # G.reportEdge()
    G.MBResolve()
    G.reportEdge()

    G.saveToFile(folderName, "condensedGraphMB.txt")
    graphFileName = "condensedGraphMB.txt"
    contigFile = "improved2_Double.fasta"
    outContigFile = "improved3.fasta"
    outOpenList = "openZoneMB.txt"

    IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile, outContigFile, outOpenList)

    # ## Repeat resolution [Proxy for phasing step]
    # 6. Find out the repeat region by MSA
    # 7. Find out the location of SNPs and extend across repeat
    # [short cut : use contig creator : your job here is to get data into the correct formats]
    # (No code for steps 6-7 follows in this function.)

    # Single-argument print(...) emits the same text under Python 2 and Python 3.
    print("xPhased")
def xPhased(folderName , mummerLink):
    """Repeat resolution on the contig graph, using read-supported edges (proxy for MB).

    Outline:
      1. Re-form the contig string graph with ALL connections from contigs only.
      2. Load the edges logged from reads and their associated blocked contigs.
      3. Use reads to connect: a contig-to-contig link is inserted only when a
         read-derived edge joins the same two nodes with enough support.
      4. Transform / condense the graph (``G.MBResolve()``).
      5. Read out contigs.

    Progress messages are printed to stdout before each major stage.

    Args:
        folderName: working folder, forwarded to the IORobot helpers,
            ``loadEdgeFromBlockedReads`` and ``G.saveToFile``.
        mummerLink: forwarded unchanged to ``IORobot.obtainLinkInfo`` and
            ``IORobot.readContigOut`` (presumably the MUMmer location -- confirm
            against those helpers).

    Returns:
        None. Results are produced through the helpers, which are handed the file
        names "condensedGraphMB.txt", "improved3.fasta" and "openZoneMB.txt".
    """
    # Single-argument print(...) emits the same text under Python 2 and Python 3.
    print("xPhased: Aligning improved2.fasta against itself, outputting to mb*.delta")
    numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "improved2", "mb")
    lenDic = IORobot.obtainLength(folderName, "improved2_Double.fasta")

    # Minimum contig length required on both ends of a link; 0 disables the filter.
    confidenLenThres = 0

    print("xPhased: Building seqGraph")
    G = graphLib.seqGraph(numberOfContig)
    extraEdges = loadEdgeFromBlockedReads(folderName)

    # Index the read-derived edges once: (start, end) -> largest min(len1, len2).
    # A link of weight wt is supported iff some edge between the same nodes has
    # min(len1, len2) >= wt, i.e. iff the best such value is >= wt.  This replaces
    # a full rescan of extraEdges for every link.
    readSupport = {}
    for eachedge in extraEdges:
        edgeKey = (eachedge[0], eachedge[1])
        support = min(eachedge[2], eachedge[3])
        if edgeKey not in readSupport or support > readSupport[edgeKey]:
            readSupport[edgeKey] = support

    for wt, myin, myout in dataSet:
        # Names are sliced past their first 6 characters and split on '_'
        # (presumably "Contig<N>_<p|d>" -- confirm with IORobot.obtainLinkInfo).
        myInData = myin[6:].split('_')
        myOutData = myout[6:].split('_')

        # Node index is 2 * N for the 'p' orientation and 2 * N + 1 otherwise.
        offsetin = 0 if myInData[1] == 'p' else 1
        offsetout = 0 if myOutData[1] == 'p' else 1

        i = int(myInData[0]) * 2 + offsetin
        j = int(myOutData[0]) * 2 + offsetout

        # The length lookups stay behind the support test, so lenDic is only
        # consulted for links that have a supporting read edge.
        if ((i, j) in readSupport and readSupport[(i, j)] >= wt
                and lenDic[myin] >= confidenLenThres
                and lenDic[myout] >= confidenLenThres):
            G.insertEdge(i, j, wt)

    # G.reportEdge()
    G.MBResolve()
    G.reportEdge()

    print("xPhased: Saving condensed seqGraph to condensedGraphMB.txt")
    G.saveToFile(folderName, "condensedGraphMB.txt")
    graphFileName = "condensedGraphMB.txt"
    contigFile = "improved2_Double.fasta"
    outContigFile = "improved3.fasta"
    outOpenList = "openZoneMB.txt"

    print("xPhased: Outputting improved contigs from condensed seqGraph to improved3.fasta")
    IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile, outContigFile, outOpenList)

    # ## Repeat resolution [Proxy for phasing step]
    # 6. Find out the repeat region by MSA
    # 7. Find out the location of SNPs and extend across repeat
    # [short cut : use contig creator : your job here is to get data into the correct formats]
    # (No code for steps 6-7 follows in this function.)

    print("xPhased")
def fetchSuccessor(folderName , mummerLink):
    """Pick the best successor and predecessor of every contig end and save them.

    Links come from ``IORobot.obtainLinkInfo(..., "noEmbed", "greedy")`` as
    records whose field 0 is the overlap length and whose fields 1 and 2 are the
    names of the left and right contig of the link.  For each distinct left name
    the partner with the largest overlap becomes its successor; for each distinct
    right name the partner with the largest overlap becomes its predecessor.

    Two files are written under ``folderName`` (joined by plain string
    concatenation, so ``folderName`` must already end with a path separator):
    "rightConnect.txt" and "leftConnect.txt".  Each has one line per node of the
    form ``index,partner,overlap``; nodes without a partner keep ``-1,-1``.

    Progress messages are printed to stdout before each stage.

    Args:
        folderName: working folder prefix for the helper and the output files.
        mummerLink: forwarded unchanged to ``IORobot.obtainLinkInfo``.

    Raises:
        AssertionError: if two different names map to the same node index.
    """

    def nodeIndex(contigName):
        # "<6-char prefix><N>_<p|d>" -> 2 * N for 'p', 2 * N + 1 otherwise.
        # The orientation field is read first so a malformed name fails the
        # same way it always did (IndexError before ValueError).
        parts = contigName.split('_')
        strandOffset = 0 if parts[1] == 'p' else 1
        return int(parts[0][6:]) * 2 + strandOffset

    def bestPartner(group, partnerField):
        # First record with the strictly largest overlap wins; (-1, "") when no
        # record has an overlap above -1.
        maxVal, connectorName = -1, ""
        for eachsubitem in group:
            if eachsubitem[0] > maxVal:
                maxVal = eachsubitem[0]
                connectorName = eachsubitem[partnerField]
        return maxVal, connectorName

    def writeConnect(fileName, connectTable):
        # 'with' guarantees the handle is closed even if a write fails.
        with open(folderName + fileName, 'w') as f:
            for dummyIndex, eachitem in enumerate(connectTable):
                f.write(str(dummyIndex) + ',' + str(eachitem[0]) + ',' + str(eachitem[1]) + '\n')

    # Single-argument print(...) emits the same text under Python 2 and Python 3.
    print("fetchSuccessor")
    print("Direct greedy")
    print("fetchSuccessor: Aligning non-contained contigs to themselves, output files are greedy*.delta")
    numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "noEmbed", "greedy")

    # One [next_item, overlap_length] slot per node.
    leftConnect = [[-1, -1] for i in range(numberOfContig)]
    rightConnect = [[-1, -1] for i in range(numberOfContig)]

    # groupby only merges adjacent records, hence the sort on the same key first.
    dataSet.sort(reverse=True, key=itemgetter(1))
    print("fetchSuccessor: Finding best successors")
    for myName, items in groupby(dataSet, itemgetter(1)):
        lengthOfOverlap, connectorName = bestPartner(items, 2)
        prefixContig = nodeIndex(myName)
        suffixContig = nodeIndex(connectorName)

        # Each left name is visited once, so its slot must still be empty.
        assert(rightConnect[prefixContig][0] == -1)
        rightConnect[prefixContig][0] = suffixContig
        rightConnect[prefixContig][1] = lengthOfOverlap

    dataSet.sort(reverse=True, key=itemgetter(2))
    print("fetchSuccessor: Finding best predecessors")
    for myName, items in groupby(dataSet, itemgetter(2)):
        lengthOfOverlap, connectorName = bestPartner(items, 1)
        prefixContig = nodeIndex(connectorName)
        suffixContig = nodeIndex(myName)

        # Each right name is visited once, so its slot must still be empty.
        assert(leftConnect[suffixContig][0] == -1)
        leftConnect[suffixContig][0] = prefixContig
        leftConnect[suffixContig][1] = lengthOfOverlap

    # ## Write to file:
    print("fetchSuccessor: Outputting best successors to rightConnect.txt")
    writeConnect('rightConnect.txt', rightConnect)

    print("fetchSuccessor: Outputting best predecessors to leftConnect.txt")
    writeConnect('leftConnect.txt', leftConnect)
def fetchSuccessor(folderName , mummerLink):
    """Pick the best successor and predecessor of every contig end and save them.

    Links come from ``IORobot.obtainLinkInfo(..., "noEmbed", "greedy")`` as
    records whose field 0 is the overlap length and whose fields 1 and 2 are the
    names of the left and right contig of the link.  For each distinct left name
    the partner with the largest overlap becomes its successor; for each distinct
    right name the partner with the largest overlap becomes its predecessor.

    Two files are written under ``folderName`` (joined by plain string
    concatenation, so ``folderName`` must already end with a path separator):
    "rightConnect.txt" and "leftConnect.txt".  Each has one line per node of the
    form ``index,partner,overlap``; nodes without a partner keep ``-1,-1``.

    Args:
        folderName: working folder prefix for the helper and the output files.
        mummerLink: forwarded unchanged to ``IORobot.obtainLinkInfo``.

    Raises:
        AssertionError: if two different names map to the same node index.
    """

    def nodeIndex(contigName):
        # "<6-char prefix><N>_<p|d>" -> 2 * N for 'p', 2 * N + 1 otherwise.
        # The orientation field is read first so a malformed name fails the
        # same way it always did (IndexError before ValueError).
        parts = contigName.split('_')
        strandOffset = 0 if parts[1] == 'p' else 1
        return int(parts[0][6:]) * 2 + strandOffset

    def bestPartner(group, partnerField):
        # First record with the strictly largest overlap wins; (-1, "") when no
        # record has an overlap above -1.
        maxVal, connectorName = -1, ""
        for eachsubitem in group:
            if eachsubitem[0] > maxVal:
                maxVal = eachsubitem[0]
                connectorName = eachsubitem[partnerField]
        return maxVal, connectorName

    def writeConnect(fileName, connectTable):
        # 'with' guarantees the handle is closed even if a write fails.
        with open(folderName + fileName, 'w') as f:
            for dummyIndex, eachitem in enumerate(connectTable):
                f.write(str(dummyIndex) + ',' + str(eachitem[0]) + ',' + str(eachitem[1]) + '\n')

    # Single-argument print(...) emits the same text under Python 2 and Python 3.
    print("fetchSuccessor")
    print("Direct greedy")
    numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "noEmbed", "greedy")

    # One [next_item, overlap_length] slot per node.
    leftConnect = [[-1, -1] for i in range(numberOfContig)]
    rightConnect = [[-1, -1] for i in range(numberOfContig)]

    # groupby only merges adjacent records, hence the sort on the same key first.
    dataSet.sort(reverse=True, key=itemgetter(1))
    for myName, items in groupby(dataSet, itemgetter(1)):
        lengthOfOverlap, connectorName = bestPartner(items, 2)
        prefixContig = nodeIndex(myName)
        suffixContig = nodeIndex(connectorName)

        # Each left name is visited once, so its slot must still be empty.
        assert(rightConnect[prefixContig][0] == -1)
        rightConnect[prefixContig][0] = suffixContig
        rightConnect[prefixContig][1] = lengthOfOverlap

    dataSet.sort(reverse=True, key=itemgetter(2))
    for myName, items in groupby(dataSet, itemgetter(2)):
        lengthOfOverlap, connectorName = bestPartner(items, 1)
        prefixContig = nodeIndex(connectorName)
        suffixContig = nodeIndex(myName)

        # Each right name is visited once, so its slot must still be empty.
        assert(leftConnect[suffixContig][0] == -1)
        leftConnect[suffixContig][0] = prefixContig
        leftConnect[suffixContig][1] = lengthOfOverlap

    # ## Write to file:
    writeConnect('rightConnect.txt', rightConnect)
    writeConnect('leftConnect.txt', leftConnect)