Пример #1
0
def rewrite_align(U, NU, aliDfile, pScoreCutoff, aliFormat, outdir):
    """Write a copy of an alignment file with re-computed scores.

    The input is read line by line. Header lines (starting with '@' or
    '#') are copied verbatim. Each remaining line is split on tabs and
    handled as follows:

    * unmapped SAM records (FLAG bit 0x4) and records whose reference
      is '*' are dropped;
    * records flagged by ``find_entry_score`` are dropped;
    * reads are numbered in order of first appearance among the
      retained records; the first retained record of a read whose
      number is in ``U`` is copied unchanged;
    * records of reads whose number is in ``NU`` get their score column
      replaced using ``find_updated_score`` and are written if they
      pass the format-specific threshold.

    Args:
        U: container of read indices supporting ``in``.
        NU: container of read indices supporting ``in``; also handed
            to ``find_updated_score``.
        aliDfile: path of the alignment file to read.
        pScoreCutoff: numeric threshold; gnu-sam records whose updated
            score and sam records whose updated probability fall below
            it are dropped.
        aliFormat: 0 for gnu-sam, 1 for sam, 2 for bl8.
        outdir: directory for the output file (passed to
            ``pathoUtilsA.ensure_dir`` first).

    Returns:
        Path of the written file: ``outdir`` + separator + 'updated_' +
        the base name of ``aliDfile``.

    Raises:
        ValueError: if a data line is reached and ``aliFormat`` is not
            0, 1 or 2.
    """
    pathoUtilsA.ensure_dir(outdir)
    reAlignfile = outdir + os.sep + 'updated_' + os.path.basename(aliDfile)

    # Compiled once; both are applied to every data line.
    org_pattern = re.compile(r'ti\|(\d+)\|org\|([^|]+)\|')
    ti_pattern = re.compile(r'ti\|(\d+)\|')

    mxBitSc = 700  # upper bound applied to log(score) in the bl8 branch
    sigma2 = 3  # factor the (capped) log score is multiplied by for bl8

    h_readId = {}  # read id -> index, in order of first appearance
    h_refId = {}  # normalized reference id -> index, same scheme

    with open(reAlignfile, 'w') as of, open(aliDfile, 'r') as in1:
        for ln in in1:
            if ln[0] == '@' or ln[0] == '#':
                of.write(ln)
                continue

            fields = ln.split('\t')
            readId = fields[0]
            if aliFormat == 0 or aliFormat == 1:  # gnu-sam or sam
                refId = fields[2]
                if int(fields[1]) & 0x4:  # FLAG 0x4: segment unmapped
                    continue
            elif aliFormat == 2:  # bl8
                refId = fields[1]
            else:
                # Previously surfaced as an UnboundLocalError on refId.
                raise ValueError(
                    'unsupported aliFormat: %r (expected 0, 1 or 2)'
                    % (aliFormat,))

            if refId == '*':
                continue

            # Reduce the reference name to "ti|<id>|org|<name>" or
            # "ti|<id>" when it carries those tags; otherwise keep it.
            mObj = org_pattern.search(refId)
            if mObj:
                refId = "ti|" + mObj.group(1) + "|org|" + mObj.group(2)
            else:
                mObj = ti_pattern.search(refId)
                if mObj and mObj.group(1) != "-1":
                    refId = "ti|" + mObj.group(1)

            (_, skipFlag) = find_entry_score(ln, fields, aliFormat,
                                             pScoreCutoff)
            if skipFlag:
                continue

            # New ids receive the next free index (current dict size).
            gIdx = h_refId.setdefault(refId, len(h_refId))

            rIdx = h_readId.get(readId)
            if rIdx is None:
                rIdx = len(h_readId)
                h_readId[readId] = rIdx
                # Only the first retained record of a read is checked
                # against U.
                if rIdx in U:
                    of.write(ln)
                    continue

            if rIdx not in NU:
                continue

            (upPscore, pscoreSum) = find_updated_score(NU, rIdx, gIdx)

            if aliFormat == 0:  # gnu-sam
                scoreComponents = fields[12].split(':')
                score = upPscore * pscoreSum
                # Compare the number itself: the original compared the
                # formatted string with the cutoff, which raises
                # TypeError on Python 3 and never filtered on Python 2.
                if score < pScoreCutoff:
                    continue
                # If column 13 is the last column, its final component
                # carries the line terminator; keep it so that records
                # are not merged in the output.
                old = scoreComponents[2]
                line_end = old[len(old.rstrip('\r\n')):]
                scoreComponents[2] = str(score) + line_end
                fields[12] = ':'.join(scoreComponents)
            elif aliFormat == 1:  # sam
                if upPscore < pScoreCutoff:
                    continue
                if upPscore >= 1.0:
                    upPscore = 0.999999  # keeps log10(1 - p) finite
                mapq2 = math.log10(1 - upPscore)
                fields[4] = str(int(round(-10.0 * mapq2)))
            else:  # bl8 (aliFormat == 2, validated above)
                score = upPscore * pscoreSum
                if score <= 0.0:  # log is undefined for such scores
                    continue
                bitSc = math.log(score)
                if bitSc > mxBitSc:
                    bitSc = mxBitSc
                fields[10] = str(bitSc * sigma2)

            of.write('\t'.join(fields))

    return reAlignfile
Пример #2
0
def rewrite_align(U, NU, aliDfile, pScoreCutoff, aliFormat, outdir):
    """Write a copy of an alignment file with re-computed scores.

    The input is read line by line. Header lines (starting with '@' or
    '#') are copied verbatim. Each remaining line is split on tabs and
    handled as follows:

    * unmapped SAM records (FLAG bit 0x4) and records whose reference
      is '*' are dropped;
    * records flagged by ``find_entry_score`` are dropped;
    * reads are numbered in order of first appearance among the
      retained records; the first retained record of a read whose
      number is in ``U`` is copied unchanged;
    * records of reads whose number is in ``NU`` get their score column
      replaced using ``find_updated_score`` and are written if they
      pass the format-specific threshold.

    Args:
        U: container of read indices supporting ``in``.
        NU: container of read indices supporting ``in``; also handed
            to ``find_updated_score``.
        aliDfile: path of the alignment file to read.
        pScoreCutoff: numeric threshold; gnu-sam records whose updated
            score and sam records whose updated probability fall below
            it are dropped.
        aliFormat: 0 for gnu-sam, 1 for sam, 2 for bl8.
        outdir: directory for the output file (passed to
            ``pathoUtilsA.ensure_dir`` first).

    Returns:
        Path of the written file: ``outdir`` + separator + 'updated_' +
        the base name of ``aliDfile``.

    Raises:
        ValueError: if a data line is reached and ``aliFormat`` is not
            0, 1 or 2.
    """
    pathoUtilsA.ensure_dir(outdir)
    reAlignfile = outdir + os.sep + 'updated_' + os.path.basename(aliDfile)

    # Compiled once; both are applied to every data line. This variant
    # only recognizes the tags when they are followed by "|gi".
    org_pattern = re.compile(r'ti\|(\d+)\|org\|([^|]+)\|gi')
    ti_pattern = re.compile(r'ti\|(\d+)\|gi')

    mxBitSc = 700  # upper bound applied to log(score) in the bl8 branch
    sigma2 = 3  # factor the (capped) log score is multiplied by for bl8

    h_readId = {}  # read id -> index, in order of first appearance
    h_refId = {}  # normalized reference id -> index, same scheme

    with open(reAlignfile, 'w') as of, open(aliDfile, 'r') as in1:
        for ln in in1:
            if ln[0] == '@' or ln[0] == '#':
                of.write(ln)
                continue

            fields = ln.split('\t')
            readId = fields[0]
            if aliFormat == 0 or aliFormat == 1:  # gnu-sam or sam
                refId = fields[2]
                if int(fields[1]) & 0x4:  # FLAG 0x4: segment unmapped
                    continue
            elif aliFormat == 2:  # bl8
                refId = fields[1]
            else:
                # Previously surfaced as an UnboundLocalError on refId.
                raise ValueError(
                    'unsupported aliFormat: %r (expected 0, 1 or 2)'
                    % (aliFormat,))

            if refId == '*':
                continue

            # Reduce the reference name to "ti|<id>|org|<name>" or
            # "ti|<id>" when it carries those tags; otherwise keep it.
            mObj = org_pattern.search(refId)
            if mObj:
                refId = "ti|" + mObj.group(1) + "|org|" + mObj.group(2)
            else:
                mObj = ti_pattern.search(refId)
                if mObj and mObj.group(1) != "-1":
                    refId = "ti|" + mObj.group(1)

            (_, skipFlag) = find_entry_score(ln, fields, aliFormat,
                                             pScoreCutoff)
            if skipFlag:
                continue

            # New ids receive the next free index (current dict size).
            gIdx = h_refId.setdefault(refId, len(h_refId))

            rIdx = h_readId.get(readId)
            if rIdx is None:
                rIdx = len(h_readId)
                h_readId[readId] = rIdx
                # Only the first retained record of a read is checked
                # against U.
                if rIdx in U:
                    of.write(ln)
                    continue

            if rIdx not in NU:
                continue

            (upPscore, pscoreSum) = find_updated_score(NU, rIdx, gIdx)

            if aliFormat == 0:  # gnu-sam
                scoreComponents = fields[12].split(':')
                score = upPscore * pscoreSum
                # Compare the number itself: the original compared the
                # formatted string with the cutoff, which raises
                # TypeError on Python 3 and never filtered on Python 2.
                if score < pScoreCutoff:
                    continue
                # If column 13 is the last column, its final component
                # carries the line terminator; keep it so that records
                # are not merged in the output.
                old = scoreComponents[2]
                line_end = old[len(old.rstrip('\r\n')):]
                scoreComponents[2] = str(score) + line_end
                fields[12] = ':'.join(scoreComponents)
            elif aliFormat == 1:  # sam
                if upPscore < pScoreCutoff:
                    continue
                if upPscore >= 1.0:
                    upPscore = 0.999999  # keeps log10(1 - p) finite
                mapq2 = math.log10(1 - upPscore)
                fields[4] = str(int(round(-10.0 * mapq2)))
            else:  # bl8 (aliFormat == 2, validated above)
                score = upPscore * pscoreSum
                if score <= 0.0:  # log is undefined for such scores
                    continue
                bitSc = math.log(score)
                if bitSc > mxBitSc:
                    bitSc = mxBitSc
                fields[10] = str(bitSc * sigma2)

            of.write('\t'.join(fields))

    return reAlignfile