Exemplo n.º 1
0
def transcriptSetOverlapTargets(aDir):
    '''Mark every alignment stored under aDir with a transcriptOverlap flag.

    Loads the Ensembl transcript set and all alignments, derives a target
    tcc for each alignment from its tTcc/tStart/sLength, converts it with
    cg.convertToAS, and sets obj.transcriptOverlap to True when that
    converted tcc is among the tccs reported by compare.compareTwoTcc.
    The alignments are then committed back through the data controller.

    aDir -- directory handed to cgDB.dataController.
    '''

    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    #get degradome TCCS
    #note that you need to test the AS peaks, this is the location of the targetted transcript
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    def targetTcc(alignment):
        '''Return the tcc derived from one alignment's own coordinates.'''
        chrom, strand, start, end = cg.tccSplit(alignment.tTcc)
        offset = alignment.tStart
        sLen = alignment.sLength

        # NOTE(review): 19 is the fixed shift used by the original code; its
        # meaning is not visible from this function.
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        return cg.makeTcc(chrom, strand, start, end)

    #create list of unique tccs (first-seen order, set used for O(1) lookup)
    uniqTccs = []
    seenTccs = set()
    for alignment in id_alignment.values():
        tcc = targetTcc(alignment)
        if tcc not in seenTccs:
            seenTccs.add(tcc)
            uniqTccs.append(tcc)

    degTccs = [cg.convertToAS(x) for x in uniqTccs]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = set(compare.compareTwoTcc(degTccs, overlappingExonTccs, 1))

    #update: each object is tested with ITS OWN coordinates (the previous
    #code reused the last alignment of the loop above for every object)
    for obj in id_alignment.values():
        degTcc = cg.convertToAS(targetTcc(obj))
        obj.transcriptOverlap = degTcc in overlappingDegTccs

    aDC.commit(id_alignment)
Exemplo n.º 2
0
def transcriptSetOverlapTargets(aDir):
    '''Set transcriptOverlap on every alignment loaded from aDir.

    For each alignment a target tcc is computed from tTcc, tStart and
    sLength, converted with cg.convertToAS and looked up among the tccs
    returned by compare.compareTwoTcc for the Ensembl transcript set.
    The flag is True on a hit and False otherwise; results are committed
    through the data controller.

    aDir -- directory handed to cgDB.dataController.
    '''

    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    #get degradome TCCS
    #note that you need to test the AS peaks, this is the location of the targetted transcript
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    def targetTcc(alignment):
        '''Return the tcc derived from one alignment's own coordinates.'''
        chrom, strand, start, end = cg.tccSplit(alignment.tTcc)
        offset = alignment.tStart
        sLen = alignment.sLength

        # NOTE(review): the constant 19 comes from the original code; what it
        # represents cannot be confirmed from here.
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        return cg.makeTcc(chrom, strand, start, end)

    #create list of unique tccs, keeping first-seen order
    uniqTccs = []
    seenTccs = set()
    for alignment in id_alignment.values():
        tcc = targetTcc(alignment)
        if tcc in seenTccs:
            continue
        seenTccs.add(tcc)
        uniqTccs.append(tcc)

    degTccs = [cg.convertToAS(x) for x in uniqTccs]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = set(compare.compareTwoTcc(degTccs, overlappingExonTccs, 1))

    #update: use the coordinates of the object being updated, not the
    #variable left over from the loop above
    for obj in id_alignment.values():
        degTcc = cg.convertToAS(targetTcc(obj))
        obj.transcriptOverlap = degTcc in overlappingDegTccs

    aDC.commit(id_alignment)
def transcriptSetOverlapDegFileHitmap(degFile, runningChrom, runningStrand):
    '''Annotate degFile lines on one chromosome/strand with a transcript-hit flag.

    Builds the set of all coordinates covered by Ensembl transcripts on
    runningChrom/runningStrand, then, for every degFile line whose converted
    tcc (column 1 passed through cg.convertToAS) lies on that chromosome and
    strand, appends '1' or '0' via cg.appendToLine depending on whether any
    coordinate of the tcc is covered. The resulting lines are written to
    degFile + '.<runningChrom>.<runningStrand>'.
    '''

    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)
    transcriptTccs = [transcript.tcc
                      for gene in allExons.set.values()
                      for transcript in gene.transcripts]

    #create hitmap
    coordSet = set()
    for tcc in transcriptTccs:
        chrom, strand, start, end = cg.tccSplit(tcc)
        if chrom == runningChrom and strand == runningStrand:
            coordSet.update(range(start, end + 1))

    #find overlapping degTccs
    print('done creating hitmap')

    newLines = []
    f = open(degFile, 'r')
    for line in f:
        fields = line.strip().split('\t')
        degTcc = cg.convertToAS(fields[1])
        chrom, strand, start, end = cg.tccSplit(degTcc)
        if chrom != runningChrom or strand != runningStrand:
            continue

        if any(i in coordSet for i in xrange(start, end + 1)):
            inTran = '1'
        else:
            inTran = '0'

        #update newLines
        newLines.append(cg.appendToLine(line, inTran, 3))
    f.close()

    outFN = degFile + '.%s.%s' % (runningChrom, runningStrand)
    f = open(outFN, 'w')
    f.writelines(newLines)
    f.close()
Exemplo n.º 4
0
def transcriptSetOverlapDegFileHitmap(degFile, runningChrom, runningStrand):
    '''Write a copy of degFile's matching lines tagged with transcript overlap.

    Only lines whose converted tcc (cg.convertToAS of column 1) is on
    runningChrom/runningStrand are kept. Each kept line receives '1' when at
    least one of its coordinates is covered by an Ensembl transcript on the
    same chromosome and strand, else '0' (added through cg.appendToLine).
    Output goes to degFile + '.<runningChrom>.<runningStrand>'.
    '''

    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    transcriptTccs = []
    for gene in allExons.set.values():
        transcriptTccs.extend(transcript.tcc for transcript in gene.transcripts)

    #create hitmap: every coordinate covered on the running chrom/strand
    coordSet = set()
    for tcc in transcriptTccs:
        chrom, strand, start, end = cg.tccSplit(tcc)
        onRunningStrand = (chrom == runningChrom) and (strand == runningStrand)
        if not onRunningStrand:
            continue
        for coord in range(start, end + 1):
            coordSet.add(coord)

    #find overlapping degTccs
    print('done creating hitmap')

    inFile = open(degFile, 'r')
    newLines = []
    for line in inFile:
        columns = line.strip().split('\t')
        chrom, strand, start, end = cg.tccSplit(cg.convertToAS(columns[1]))
        if chrom != runningChrom:
            continue
        if strand != runningStrand:
            continue

        covered = any(coord in coordSet for coord in xrange(start, end + 1))
        inTran = '1' if covered else '0'

        #update newLines
        newLines.append(cg.appendToLine(line, inTran, 3))
    inFile.close()

    outFile = open(degFile + '.%s.%s' % (runningChrom, runningStrand), 'w')
    outFile.writelines(newLines)
    outFile.close()
Exemplo n.º 5
0
def countWithBinsSet(dFN, binDir, type = 'INTRON'):
    '''Print, for each bin, how many peak coordinates fall inside it.

    dFN    -- Nexus file of peaks; only the 'tcc' attribute is loaded.
    binDir -- directory holding '<type>.<chrom>.<strand>.bins' files, one per
              chromosome in bioLibCG.humanChromosomes and strand '1'/'-1'.
              Columns 1..numBins of each line are read as tccs.
    type   -- prefix of the bins files (name kept for caller compatibility
              although it shadows the builtin).

    Each peak is moved to the opposite strand and collapsed to a single
    coordinate (its start when on strand '1', its end otherwise) before
    being counted. Output is one 'bin<TAB>count' line per bin.
    '''

    dNX = cgNexusFlat.Nexus(dFN, cgDegPeak.Peak)
    dNX.load(['tcc'])

    numBins = 1

    #c_s_bin_set[chrom][strand][bin index] -> set of covered coordinates
    c_s_bin_set = {}

    for chrom in bioLibCG.humanChromosomes:
        for strand in ('1', '-1'):
            #initialize data structure
            binSets = dict((i, set()) for i in range(numBins))
            c_s_bin_set.setdefault(chrom, {})[strand] = binSets

            binFN = binDir + '/%s.%s.%s.bins' % (type, chrom, strand)
            #with-block so the handle is released for every chrom/strand
            #(the previous code left all of them open)
            with open(binFN, 'r') as f:
                for line in f:
                    ls = line.strip().split('\t')
                    tccs = ls[1:numBins + 1]
                    for i in range(numBins):
                        ch, st, sta, end = bioLibCG.tccSplit(tccs[i])
                        binSets[i].update(range(sta, end + 1))

    #collect dTccs in list
    dTccs = []
    for dID in dNX.tcc:
        c, s, st, en = bioLibCG.tccSplit(dNX.tcc[dID])
        if s == '1':
            s = '-1'
            en = st
        else:
            s = '1'
            st = en
        dTccs.append(bioLibCG.makeTcc(c, s, st, en))

    #make bCounts
    binCounts = [0] * numBins

    #count for each bin
    for i in range(numBins):
        for dTcc in dTccs:
            c, s, st, en = bioLibCG.tccSplit(dTcc)
            for j in range(st, en + 1):
                if j in c_s_bin_set[c][s][i]:
                    binCounts[i] += 1
        print('%s\t%s' % (i, binCounts[i]))
Exemplo n.º 6
0
def countWithBinsSet(dFN, binDir, type='INTRON'):
    '''Count peak coordinates per bin and print 'bin<TAB>count' lines.

    dFN    -- Nexus file of peaks; only 'tcc' is loaded.
    binDir -- directory with one '<type>.<chrom>.<strand>.bins' file for every
              chromosome in bioLibCG.humanChromosomes and strand '1'/'-1';
              columns 1..numBins of each line are parsed as tccs.
    type   -- bins-file prefix (parameter name kept for compatibility even
              though it shadows the builtin).

    Every peak is switched to the opposite strand and reduced to one
    coordinate (start for strand '1', end otherwise) before counting.
    '''

    dNX = cgNexusFlat.Nexus(dFN, cgDegPeak.Peak)
    dNX.load(['tcc'])

    numBins = 1

    #chrom -> strand -> bin index -> set of coordinates covered by that bin
    c_s_bin_set = {}

    for chrom in bioLibCG.humanChromosomes:
        for strand in ('1', '-1'):
            #initialize data structure
            strandBins = c_s_bin_set.setdefault(chrom, {}).setdefault(strand, {})
            for i in range(numBins):
                strandBins[i] = set()

            #close each bins file as soon as it is consumed; the previous
            #code never closed any of them
            with open(binDir + '/%s.%s.%s.bins' % (type, chrom, strand), 'r') as f:
                for line in f:
                    ls = line.strip().split('\t')
                    tccs = ls[1:numBins + 1]
                    for i in range(numBins):
                        ch, st, sta, end = bioLibCG.tccSplit(tccs[i])
                        strandBins[i].update(range(sta, end + 1))

    #collect dTccs in list
    dTccs = []
    for dID in dNX.tcc:
        c, s, st, en = bioLibCG.tccSplit(dNX.tcc[dID])
        if s == '1':
            s = '-1'
            en = st
        else:
            s = '1'
            st = en
        dTccs.append(bioLibCG.makeTcc(c, s, st, en))

    #make bCounts
    binCounts = [0] * numBins

    #count for each bin
    for i in range(numBins):
        for dTcc in dTccs:
            c, s, st, en = bioLibCG.tccSplit(dTcc)
            for j in range(st, en + 1):
                if j in c_s_bin_set[c][s][i]:
                    binCounts[i] += 1
        print('%s\t%s' % (i, binCounts[i]))
Exemplo n.º 7
0
def testOverlaps(dataFN, oFF):
    '''Print the ids of records whose tcc overlaps an earlier, accepted record.

    Records are visited in Nexus iteration order. A record that shares at
    least one coordinate (same chromosome and strand) with the coordinates
    accumulated so far is reported; otherwise its coordinates are added to
    the accumulated set.
    '''

    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['tcc'])

    overlappingIDs = set()
    chrom_strand_range = {}
    while dataNX.nextID():
        chrom, strand, start, end = bioLibCG.tccSplit(dataNX.tcc)

        #coordinates already claimed on this chromosome/strand
        occupied = chrom_strand_range.setdefault(chrom, {}).setdefault(strand, set())
        span = range(start, end + 1)

        #tag or add these coordinates
        if any(coord in occupied for coord in span):
            overlappingIDs.add(dataNX.id)
        else:
            occupied.update(span)

    print("THESE OVERLAP %s" % (overlappingIDs,))
Exemplo n.º 8
0
def updateContext(oFN, wigDir, chrom, strand, switchStrand = False):
    '''Fill the 'context' attribute of peaks on one chromosome/strand.

    oFN          -- Nexus file of peaks ('tcc' and 'context' are loaded).
    wigDir       -- directory passed to cgWig.loadSingleWigContext.
    chrom        -- chromosome to process.
    strand       -- strand to process (int or str, e.g. 1 or '-1').
    switchStrand -- when True, the wig of the opposite strand is loaded and
                    each peak's strand is negated before it is compared.

    For every matching peak the comma-separated contexts stored at its start
    coordinate (default 'INTER') are passed to ds.spotItem and the result is
    stored in oNX.context; the Nexus is then saved.
    '''

    oNX = cgNexusFlat.Nexus(oFN, cgDegPeak.Peak)
    oNX.load(['tcc', 'context'])

    if switchStrand:
        strand = str(-int(strand))
    else:
        strand = str(strand)

    print('loading wig')
    coord_contexts = cgWig.loadSingleWigContext(wigDir, chrom, strand, 'context')
    print('done loading')

    ds = bioLibCG.dominantSpotter(['C_EXON', 'C_3UTR', 'C_5UTR', 'NC_EXON', 'NC_3UTR', 'NC_5UTR', 'C_INTRON', 'NC_INTRON', 'INTER'])

    for oID in oNX.tcc:

        oChrom, oStrand, start, end = bioLibCG.tccSplit(oNX.tcc[oID])

        #deg wigs is AS to actual clipping site
        #Negate the PEAK's strand. The previous code negated the already
        #flipped `strand`, so no peak could ever match with switchStrand.
        if switchStrand:
            oStrand = str(-int(oStrand))
        else:
            oStrand = str(oStrand)

        if oChrom != chrom or oStrand != strand:
            continue

        contexts = coord_contexts.get(start, 'INTER').split(',')
        oNX.context[oID] = ds.spotItem(contexts)

    oNX.save()
Exemplo n.º 9
0
def updateContext(oFN, wigDir, chrom, strand, rn=None, tn=None):
    '''Store a context label on every peak that maps to chrom/strand.

    A peak is processed when its chromosome equals chrom and the opposite of
    its own strand equals strand. The contexts found in the wig at the peak's
    start coordinate (default 'INTER') are split on ',' and reduced with
    ds.spotItem; the result is written to oNX.context before saving.
    rn and tn are forwarded to oNX.load unchanged.
    '''

    oNX = cgNexusFlat.Nexus(oFN, degPeak.degPeak)
    oNX.load(['context', 'tcc'], [rn, tn])

    print('loading wig')
    coord_contexts = cgWig.loadSingleWigContext(wigDir, chrom, strand, 'context')
    print('done loading')

    contextNames = ['C_EXON', 'C_3UTR', 'C_5UTR',
                    'NC_EXON', 'NC_3UTR', 'NC_5UTR',
                    'C_INTRON', 'NC_INTRON', 'INTER']
    ds = bioLibCG.dominantSpotter(contextNames)

    for oID in oNX.tcc:
        oChrom, oStrand, start, end = bioLibCG.tccSplit(oNX.tcc[oID])

        #deg wigs is AS to actual clipping site
        asStrand = '-1' if oStrand == '1' else '1'

        if oChrom != chrom or asStrand != strand:
            continue

        rawContexts = coord_contexts.get(start, 'INTER')
        oNX.context[oID] = ds.spotItem(rawContexts.split(','))

    oNX.save()
Exemplo n.º 10
0
def newCoords(fN):
    '''Print the tcc in column 0 of every line of fN as 'chrom:start-end'.

    fN -- path of a tab-separated file whose first column is a tcc.
    '''

    #with-block: the previous code never closed the file
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, strand, start, end = bioLibCG.tccSplit(ls[0])
            print('%s:%s-%s' % (chrom, start, end))
Exemplo n.º 11
0
def updateIContext(oFN, wigDir, chrom, strand, rn = None, tn = None):
    '''Store the integer iContext values found at each matching peak's start.

    The wig of the strand opposite to `strand` is loaded. A peak is processed
    when its chromosome equals chrom and the opposite of its own strand
    equals that wig strand. The wig entry at the peak's start (default '-1')
    is split on ',' and converted to a list of ints, which is written to
    oNX.iContexts. rn and tn are forwarded to oNX.load unchanged.
    '''

    def opposite(s):
        #'1' -> '-1'; anything else -> '1' (same rule as the inline if/else)
        if s == '1':
            return '-1'
        return '1'

    oNX = cgNexusFlat.Nexus(oFN, cgDegPeak.Peak)
    oNX.load(['tcc', 'iContexts'], [rn, tn])

    strand = opposite(strand)

    print('loading wig')
    coord_contexts = cgWig.loadSingleWigContext(wigDir, chrom, strand, 'iContext')
    print('done loading')

    for oID in oNX.tcc:
        oChrom, oStrand, start, end = bioLibCG.tccSplit(oNX.tcc[oID])

        #deg wigs is AS to actual clipping site
        if oChrom != chrom or opposite(oStrand) != strand:
            continue

        rawValues = coord_contexts.get(start, '-1').split(',')
        oNX.iContexts[oID] = [int(value) for value in rawValues]

    oNX.save()
Exemplo n.º 12
0
def getPlotData(aSites, wigDir, outFN):
    '''Collect wig values at offsets -200..200 around the end of each site.

    For every chromosome in bioLibCG.humanChromosomes and strand '1'/'-1' the
    'ALL' wig is loaded and aSites is re-read; for each site on that
    chromosome/strand the wig value at end + offset (0 when absent) is
    appended to the list for that offset. outFN receives one tab-separated
    row per offset, first row being offset -200.
    '''

    #load and init
    spreadRange = range(-200, 201) #200 +/- ... might want to check distance each AAUAAA is from each other
    relCoord_degVals = {}
    for offset in spreadRange:
        relCoord_degVals[offset] = []

    for chrom in bioLibCG.humanChromosomes:
        for strand in ('1', '-1'):
            print('%s %s' % (chrom, strand))
            coord_value = cgWig.loadSingleWig(wigDir, chrom, strand, 'ALL')

            sitesFile = open(aSites, 'r')
            for line in sitesFile:
                columns = line.strip().split('\t')
                ichrom, istrand, start, end = bioLibCG.tccSplit(columns[0])
                if ichrom == chrom and istrand == strand:
                    for offset in spreadRange:
                        relCoord_degVals[offset].append(coord_value.get(end + offset, 0))
            sitesFile.close()

    #output box data
    #each row is a histogram of spread position (e.g., first row is -200)
    outLines = ['\t'.join([str(x) for x in relCoord_degVals[offset]]) + '\n'
                for offset in spreadRange]
    outFile = open(outFN, 'w')
    outFile.writelines(outLines)
    outFile.close()
Exemplo n.º 13
0
def returnContBlocks(profile, tcc, minLevel = 5):
    '''Return 'chrom:strand:start:end' blocks where profile exceeds minLevel.

    profile  -- mapping coordinate -> level (levels are passed through int()).
    tcc      -- only its chromosome and strand are used, to label the blocks.
    minLevel -- a coordinate belongs to a block when its level is strictly
                greater than this value.

    Coordinates are visited in ascending order. A block starts at the first
    coordinate above minLevel and ends at the coordinate just before the
    next one that is not above it. A block still open at the last coordinate
    is closed there (it used to be silently dropped).
    '''
    chrom, strand, _start, _end = cg.tccSplit(tcc)

    blocks = []
    bStart = None      #start of the open block, None when outside a block
    lastCoord = None   #most recent coordinate visited

    #sorted() works for both list and view results of dict keys
    for pCoord in sorted(profile):
        if int(profile[pCoord]) > minLevel: #expression's high enough
            if bStart is None: #block Start
                bStart = pCoord
        elif bStart is not None: #not high enough: end the block
            blocks.append('%s:%s:%s:%s' % (chrom, strand, bStart, pCoord - 1))
            bStart = None
        lastCoord = pCoord

    #profile ended while still inside a block
    if bStart is not None:
        blocks.append('%s:%s:%s:%s' % (chrom, strand, bStart, lastCoord))

    return blocks
Exemplo n.º 14
0
def testOverlaps(dataFN, oFF):
    '''Report ids of records that collide with coordinates seen earlier.

    Walks the Nexus records in order. Each record's coordinates are checked
    against those already stored for its chromosome and strand: on a hit the
    record id is collected, otherwise the record's coordinates are stored.
    The collected ids are printed at the end.
    '''

    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['tcc'])

    #check for overlaps
    overlappingIDs = set()
    chrom_strand_range = {}
    while dataNX.nextID():
        chrom, strand, start, end = bioLibCG.tccSplit(dataNX.tcc)
        seenCoords = chrom_strand_range.setdefault(chrom, {}).setdefault(strand, set())

        #check if overlap
        recordCoords = range(start, end + 1)
        collides = False
        for coord in recordCoords:
            if coord in seenCoords:
                collides = True
                break

        #tag or add these coordinates
        if collides:
            overlappingIDs.add(dataNX.id)
            continue
        seenCoords.update(recordCoords)

    print("THESE OVERLAP %s" % (overlappingIDs,))
def markCenterExpression(aFN, wigDir, rn = None, tn = None):
    '''Store three centre-expression fractions on every alignment.

    For each alignment a scan range is derived from tTcc, tStart and sLength
    (using extend = 25) and its expression profile is fetched from the wig
    dictionary. centerExpression[aID] starts as [0.0, 0.0, 0.0]; when the
    profile's sum is non-zero and tELevel occurs among the profile values,
    the three slots become the share of the total expression found at
    positions [8:12], [7:13] and [6:14] of the coordinates ordered along the
    strand. rn and tn are forwarded to aNX.load unchanged.
    '''

    extend = 25

    timer = bioLibCG.cgTimer()
    timer.start()

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['centerExpression', 'tTcc', 'tStart', 'sLength', 'tELevel'], [rn, tn])

    #load expression of degradome
    wigDict = cgWig.loadWigDict(wigDir)

    #slice bounds into the ordered coordinates, one pair per output slot
    windows = ((8, 12), (7, 13), (6, 14))

    for aID in aNX.centerExpression:
        aNX.centerExpression[aID] = [0.0, 0.0, 0.0]
        chrom, strand, start, end = bioLibCG.tccSplit(aNX.tTcc[aID])
        offset = aNX.tStart[aID]
        sLen = aNX.sLength[aID]

        if strand == '1':
            start = start - extend + offset
            end = start + sLen
        else:
            end = end + extend - offset
            start = end - sLen

        scanRange = bioLibCG.makeTcc(chrom, strand, start, end)
        stretch = cgWig.getExpressionProfile(scanRange, wigDict)

        #make sure peak is in the small range
        peakInRange = aNX.tELevel[aID] in stretch.values()

        expressionSum = sum(stretch.values())
        #ascending for strand '1', descending for strand '-1'
        sortedKeys = sorted(stretch, reverse = (strand == '-1'))

        if expressionSum == 0 or not peakInRange:
            continue

        for slot, (lo, hi) in enumerate(windows):
            sumE = 0.0
            for key in sortedKeys[lo:hi]:
                sumE += stretch[key]
            aNX.centerExpression[aID][slot] = sumE/expressionSum

    aNX.save()
Exemplo n.º 16
0
def collectIntronSeqs(fN, outFN, assembly, amount=100, prime3=True):
    '''Write the reversed sequence of one end of each coding intron to outFN.

    fN       -- tab-separated file; column 1 must contain 'C_INTRON' for the
                line to be used and column 2 holds the intron tcc.
    outFN    -- output file, one sequence per line (written as seq[::-1]).
    assembly -- genome assembly handed to gf.GenomeFetch.
    amount   -- number of bases taken; introns with end - start < amount are
                skipped.
    prime3   -- True: take the 3' end (last `amount` bases on strand '1',
                first `amount` bases otherwise). False: take the opposite
                (5') end. The False case used to fail with an unbound `tcc`
                or silently reuse the previous line's tcc.
    '''

    myG = gf.GenomeFetch(assembly)

    #nested with-blocks close both files even when a line fails to parse
    with open(outFN, 'w') as fOut:
        with open(fN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                if 'C_INTRON' not in ls[1]: continue
                chrom, strand, start, end = bioLibCG.tccSplit(ls[2])

                if end - start < amount: continue

                #the 3' end is the high-coordinate side on strand '1' and the
                #low-coordinate side otherwise; 5' is the mirror image
                takeHighSide = (strand == '1') == bool(prime3)
                if takeHighSide:
                    tcc = bioLibCG.makeTcc(chrom, strand, end - amount, end)
                else:
                    tcc = bioLibCG.makeTcc(chrom, strand, start, start + amount)

                seq = myG.getSequence(tcc)
                fOut.write('%s\n' % seq[::-1])
Exemplo n.º 17
0
def updateHitMap(chrom_strand_coord, tcc):
    '''Add every coordinate of tcc to chrom_strand_coord[chrom][strand].

    chrom_strand_coord is a dict of dicts of sets and is modified in place;
    missing chromosome/strand entries are created only when the tcc spans at
    least one coordinate.
    '''

    chrom, strand, start, end = bioLibCG.tccSplit(tcc)

    #empty span: leave the map untouched, exactly like a zero-iteration loop
    if end < start:
        return

    strandCoords = chrom_strand_coord.setdefault(chrom, {}).setdefault(strand, set())
    strandCoords.update(xrange(start, end + 1))
Exemplo n.º 18
0
def probe(tcc, conf = None):
    '''Print the highest expression of tcc in each library and the total.

    tcc  -- region to scan.
    conf -- configuration object exposing .conf['smallPath']; when omitted,
            'Main.conf' is loaded through c.cgConfig. (Passing a conf used
            to fail because the local configuration was never bound.)

    Every file under smallPath ending in 'mapped.<strand>.wig' is scanned
    with stepVectorScan.scanVectorsFile. Libraries with a highest level above
    zero are printed as '<lib> <highest>'; the sum of those levels is printed
    last. A library whose scan raises is reported and skipped.
    '''

    if conf:
        mConf = conf
    else:
        mConf = c.cgConfig('Main.conf')
    smallPath = mConf.conf['smallPath']

    chrom, strand, start, end = cg.tccSplit(tcc)

    total = 0
    for lib in cg.recurseDir(smallPath, end = 'mapped.%s.wig' % strand):

        #best effort per library, but let KeyboardInterrupt/SystemExit through
        try:
            eLevels = stepVectorScan.scanVectorsFile(lib, [tcc])
        except Exception:
            print('%s index failed' % lib)
            continue

        #find highest expression level
        highest = 0
        for coord in eLevels:
            if eLevels[coord] > highest:
                highest = eLevels[coord]

        if highest > 0:
            print('%s %s' % (lib, highest))
            total += highest

    print(total)
Exemplo n.º 19
0
def getNumberContextPosition(iContextDir, outFN, cType = 'INTRON', prime5 = True):
    '''Write, per relative position, how many contexts of cType reach it.

    Loads '<chrom>.<strand>.ids' from iContextDir for every chromosome in
    bioLibCG.humanChromosomes and strand '1'/'-1'. Every entry whose type
    contains cType adds one to positions 0 .. (end - start - 1). outFN
    receives 'position<TAB>count' lines. prime5 is accepted but not used.
    '''

    #load all context NXs
    print('loading NXs')
    chrom_strand_NX = {}
    for chrom in bioLibCG.humanChromosomes:
        for strand in ('1', '-1'):
            contextNX = cgNexusFlat.Nexus(iContextDir + '/%s.%s.ids' % (chrom, strand),
                                          cgIContext.IContext)
            contextNX.load(['type', 'tcc'])
            chrom_strand_NX.setdefault(chrom, {})[strand] = contextNX
    print('done...')

    print('getting context occupancy')
    #get the amount of introns/etc that occupy X
    pos_numIntrons = {}
    for chrom in chrom_strand_NX:
        for strand in chrom_strand_NX[chrom]:
            contextNX = chrom_strand_NX[chrom][strand]
            for entryID in contextNX.ids:
                if cType in contextNX.type[entryID]:
                    c, s, st, en = bioLibCG.tccSplit(contextNX.tcc[entryID])
                    for pos in xrange(0, en - st):
                        pos_numIntrons[pos] = pos_numIntrons.get(pos, 0) + 1

    fOut = open(outFN, 'w')
    for pos, numIntrons in pos_numIntrons.iteritems():
        fOut.write('%s\t%s\n' % (pos, numIntrons))
    fOut.close()
Exemplo n.º 20
0
def collectIntronSeqs(fN, outFN, assembly, amount = 100, prime3 = True):
    '''Fetch `amount` bases from one end of each coding intron and write them reversed.

    fN       -- tab-separated input; a line is used when column 1 contains
                'C_INTRON', and column 2 is the intron tcc.
    outFN    -- output path; each line is seq[::-1] of the fetched sequence.
    assembly -- assembly name for gf.GenomeFetch.
    amount   -- window size; introns shorter than this (end - start) are
                skipped.
    prime3   -- True selects the 3' end (high coordinates on strand '1', low
                coordinates otherwise); False selects the 5' end. Previously
                False left `tcc` unbound or stale.
    '''

    myG = gf.GenomeFetch(assembly)

    #both handles are released even if a line raises
    with open(outFN, 'w') as fOut:
        with open(fN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                if 'C_INTRON' not in ls[1]: continue
                chrom, strand, start, end = bioLibCG.tccSplit(ls[2])

                if end - start < amount: continue

                #3' end: high side on strand '1', low side otherwise;
                #5' end: the other side
                if (strand == '1') == bool(prime3):
                    winStart, winEnd = end - amount, end
                else:
                    winStart, winEnd = start, start + amount

                tcc = bioLibCG.makeTcc(chrom, strand, winStart, winEnd)
                seq = myG.getSequence(tcc)
                fOut.write('%s\n' % seq[::-1])
Exemplo n.º 21
0
def newCoords(fN):
    '''Print 'chrom:start-end' for the tcc in the first column of each line.

    fN -- path of a tab-separated file whose column 0 holds a tcc.
    '''

    #context manager closes the handle; it was previously left open
    with open(fN, 'r') as f:
        for line in f:
            firstColumn = line.strip().split('\t')[0]
            chrom, strand, start, end = bioLibCG.tccSplit(firstColumn)
            print('%s:%s-%s' % (chrom, start, end))
Exemplo n.º 22
0
def updateTranscriptOverlap(oFN, wigDir, chrom, strand, rn=None, tn=None):
    '''Set tOverlap on every peak that maps to chrom and the opposite strand.

    The transcript wig of the strand opposite to `strand` is loaded. A peak
    is processed when its chromosome equals chrom and the opposite of its own
    strand equals that wig strand; its tOverlap becomes True when any
    coordinate from start to end is present in the wig, False otherwise.
    rn and tn are forwarded to oNX.load unchanged.
    '''

    def opposite(s):
        #'1' -> '-1'; every other value -> '1'
        return '-1' if s == '1' else '1'

    oNX = cgNexusFlat.Nexus(oFN, cgDegPeak.Peak)
    oNX.load(['tOverlap', 'tcc'], [rn, tn])

    #load the AS wig file for this degradome strand
    strand = opposite(strand)
    coord_transcripts = cgWig.loadSingleWigTranscript(wigDir, chrom, strand,
                                                      'transcript')

    for oID in oNX.tOverlap:
        tChrom, tStrand, start, end = bioLibCG.tccSplit(oNX.tcc[oID])
        if tChrom == chrom and opposite(tStrand) == strand:
            oNX.tOverlap[oID] = any(coord in coord_transcripts
                                    for coord in xrange(start, end + 1))

    oNX.save()
Exemplo n.º 23
0
def probe(tcc, conf=None):
    '''Report the peak expression of tcc per small-RNA library, then the sum.

    tcc  -- region to scan.
    conf -- object exposing .conf['smallPath']; defaults to
            c.cgConfig('Main.conf'). Supplying a conf previously raised
            because the local configuration name was only bound in the
            default case.

    Files under smallPath ending in 'mapped.<strand>.wig' are scanned with
    stepVectorScan.scanVectorsFile. For each library whose highest level is
    above zero '<lib> <highest>' is printed; the total of those levels is
    printed at the end. A library whose scan raises is reported and skipped.
    '''

    mConf = conf if conf else c.cgConfig('Main.conf')
    smallPath = mConf.conf['smallPath']

    chrom, strand, start, end = cg.tccSplit(tcc)

    total = 0
    for lib in cg.recurseDir(smallPath, end='mapped.%s.wig' % strand):

        #still best effort, but no longer swallows KeyboardInterrupt/SystemExit
        try:
            eLevels = stepVectorScan.scanVectorsFile(lib, [tcc])
        except Exception:
            print('%s index failed' % lib)
            continue

        #find highest expression level (0 when nothing exceeds it)
        highest = 0
        for coord in eLevels:
            level = eLevels[coord]
            if level > highest:
                highest = level

        if highest > 0:
            print('%s %s' % (lib, highest))
            total += highest

    print(total)
Exemplo n.º 24
0
def updateGeneName(dFN,
                   fFN,
                   wigDir,
                   chrom,
                   strand,
                   prefix,
                   switchStrand=False):
    '''Fill geneNames for the records located on one chromosome/strand.

    dFN, fFN     -- data file and format passed to Nexus.
    wigDir       -- directory passed to cgWig.loadSingleWigTranscript.
    chrom        -- chromosome whose wig is loaded.
    strand       -- strand to process (int or str, e.g. 1 or '-1').
    prefix       -- wig prefix passed to cgWig.loadSingleWigTranscript.
    switchStrand -- when True, the wig of the opposite strand is loaded and
                    each record's strand is negated before comparison.

    For every matching record the wig entry at its start coordinate is
    looked up: 'NONE' gives an empty list, anything else is split on ','.

    Changes from the previous version:
    * the strand is negated through int(), so string strands no longer raise;
    * records on other chromosomes/strands are skipped instead of being
      looked up in a wig that does not describe them.
    '''

    NX = Nexus(dFN, fFN)
    NX.load(['geneNames', 'tcc'])

    if switchStrand:
        strand = str(-int(strand))
    else:
        strand = str(strand)

    coord_gName = cgWig.loadSingleWigTranscript(wigDir, chrom, strand, prefix)

    while NX.nextID():

        #separate names so the chrom/strand being processed are not clobbered
        rChrom, rStrand, start, end = bioLibCG.tccSplit(NX.tcc)

        if switchStrand:
            rStrand = str(-int(rStrand))
        else:
            rStrand = str(rStrand)

        if rChrom != chrom or rStrand != strand:
            continue

        # NOTE(review): a missing coordinate yields '.', which is not the
        # 'NONE' sentinel tested below, so geneNames becomes ['.'] -- confirm
        # that this is intended.
        overlappingGenes = coord_gName.get(start, ".")
        if overlappingGenes == "NONE":
            NX.geneNames = []
        else:
            NX.geneNames = overlappingGenes.split(',')

    NX.save()
Exemplo n.º 25
0
def updateTranscriptOverlap(oFN, wigDir, chrom, strand, rn = None, tn = None):
    '''Flag peaks whose coordinates appear in the opposite-strand transcript wig.

    Loads the 'transcript' wig for chrom on the strand opposite to `strand`.
    Peaks on chrom whose own strand, once flipped, equals that wig strand get
    tOverlap = True if any coordinate in start..end is in the wig and
    tOverlap = False otherwise. Other peaks are left untouched.
    rn and tn are forwarded to oNX.load unchanged.
    '''

    oNX = cgNexusFlat.Nexus(oFN, cgDegPeak.Peak)
    oNX.load(['tOverlap', 'tcc'], [rn, tn])

    #load the AS wig file for this degradome strand
    wigStrand = '-1' if strand == '1' else '1'
    coord_transcripts = cgWig.loadSingleWigTranscript(wigDir, chrom, wigStrand, 'transcript')

    for oID in oNX.tOverlap:
        tChrom, tStrand, start, end = bioLibCG.tccSplit(oNX.tcc[oID])
        flippedStrand = '-1' if tStrand == '1' else '1'

        if tChrom != chrom or flippedStrand != wigStrand:
            continue

        hit = False
        for coord in xrange(start, end + 1):
            if coord in coord_transcripts:
                hit = True
                break
        oNX.tOverlap[oID] = hit

    oNX.save()
Exemplo n.º 26
0
def updateContext(oFN, wigDir, chrom, strand, rn = None, tn = None):
    '''Write a context label to each peak matching chrom and flipped strand.

    The 'context' wig for chrom/strand is loaded. A peak qualifies when it is
    on chrom and the opposite of its own strand equals strand. Its label is
    ds.spotItem applied to the comma-separated wig entry at the peak's start
    coordinate, with 'INTER' used when the coordinate is absent.
    rn and tn are forwarded to oNX.load unchanged.
    '''

    oNX = cgNexusFlat.Nexus(oFN, degPeak.degPeak)
    oNX.load(['context', 'tcc'], [rn, tn])

    print('loading wig')
    coord_contexts = cgWig.loadSingleWigContext(wigDir, chrom, strand, 'context')
    print('done loading')

    ds = bioLibCG.dominantSpotter(['C_EXON', 'C_3UTR', 'C_5UTR',
                                   'NC_EXON', 'NC_3UTR', 'NC_5UTR',
                                   'C_INTRON', 'NC_INTRON', 'INTER'])

    for oID in oNX.tcc:
        oChrom, oStrand, start, end = bioLibCG.tccSplit(oNX.tcc[oID])

        #deg wigs is AS to actual clipping site
        if oStrand == '1':
            flipped = '-1'
        else:
            flipped = '1'

        matches = (oChrom == chrom) and (flipped == strand)
        if not matches:
            continue

        contexts = coord_contexts.get(start, 'INTER').split(',')
        oNX.context[oID] = ds.spotItem(contexts)

    oNX.save()
Exemplo n.º 27
0
def scanVectorsFile(fN, tccList):
    '''Given tcc list --> scan wig file fN and return {coord: value}.

    For each tcc the indexed file is positioned with a binary search and read
    line by line. Columns 1 and 2 of a line are its begin/end and column 3
    its value (the part before any '.' as int). The line's range is clipped
    to the tcc and every coordinate in range(begin, end) receives the value.
    Reading stops once a line ends beyond the tcc end.
    '''
    timer = cg.cgTimer()
    timer.start()
    coordDict = {} # coord: value
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)

        #goto correct line in index
        fIndex = cgIndex.lineIndex(fN, header = True) #!!!there actually is a header...have to deal with this...
        fIndex.passCheckFunction(cgIndex.wigCheckFunction)
        fIndex.binarySearch(tcc) #places file pointer at beginning of tcc as beginning

        for line in fIndex.file:
            fields = cg.ss(line)
            lBeg = int(fields[1])
            lEnd = int(fields[2])
            lValue = int(fields[3].split('.')[0])

            #clip the line's range to the requested tcc
            segBeg = max(lBeg, tccStart)
            pastEnd = tccEnd < lEnd
            segEnd = tccEnd if pastEnd else lEnd

            for coord in range(segBeg, segEnd):
                coordDict[coord] = lValue

            if pastEnd:
                break

        #fIndex.close()
    return coordDict
Exemplo n.º 28
0
def makeWig(fN, assembly, format=None, name=None):
    '''Build a per-coordinate hit count for every chromosome/strand and write it.

    format assumes bowtie.
    suitable for medium mapped files.
    takes longer (the input file is re-read for every chromosome/strand).

    fN       -- mapped reads file; each line is turned into a tcc by the
                parser function for `format`.
    assembly -- used for cg.returnChromLengthDict and passed to
                writeWigFromHitDict.
    format   -- parser name, 'Bowtie' when not given (parameter name kept
                for compatibility although it shadows the builtin).
    name     -- defaults to the base file name of fN.
                NOTE(review): the value is computed but not used further in
                this function.
    '''
    #assume bowtie
    if not format: format = 'Bowtie'
    parserFunction = returnParserFunction(format)
    if not name: name = cg.getBaseFileName(fN, naked=True)
    lDict = cg.returnChromLengthDict(assembly)

    for chrom in lDict:
        if chrom not in cg.acceptableChroms: continue
        for strand in ['1', '-1']:
            hitDict = {}
            #with-block: the file used to be reopened for every chrom/strand
            #without ever being closed
            with open(fN, 'r') as f:
                #create hitmap of chrom and strand
                print('%s %s hitmap' % (chrom, strand))
                for line in f:
                    lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
                    lStrand = str(lStrand)
                    start = int(start)
                    end = int(end)
                    if chrom == lChrom and strand == lStrand:
                        for i in range(start, end + 1):
                            hitDict[i] = hitDict.get(i, 0) + 1

            #write results to wig file
            writeWigFromHitDict(hitDict, assembly)
Exemplo n.º 29
0
def updateContext(fN, fF, wigDir, chrom, strand, switchStrand = False):
    '''Fill the context attribute of records on one chromosome/strand.

    fN, fF       -- data file and format passed to Nexus ('tcc' and 'context'
                    are loaded).
    wigDir       -- directory passed to cgWig.loadSingleWigContext.
    chrom        -- chromosome to process.
    strand       -- strand to process (int or str, e.g. 1 or '-1').
    switchStrand -- when True, the wig of the opposite strand is loaded and
                    each record's strand is negated before it is compared.

    For every matching record the comma-separated contexts stored at its
    start coordinate (default 'INTER') are passed to ds.spotItem and the
    result becomes NX.context; the Nexus is then saved.
    '''

    NX = Nexus(fN, fF)
    NX.load(['tcc', 'context'])

    if switchStrand:
        strand = str(-int(strand))
    else:
        strand = str(strand)

    print('loading wig')
    coord_contexts = cgWig.loadSingleWigContext(wigDir, chrom, strand, 'context')
    print('done loading')

    ds = bioLibCG.dominantSpotter(['C_EXON', 'C_3UTR', 'C_5UTR', 'NC_EXON', 'NC_3UTR', 'NC_5UTR', 'C_INTRON', 'NC_INTRON', 'INTER'])

    while NX.nextID():

        oChrom, oStrand, start, end = bioLibCG.tccSplit(NX.tcc)

        #deg wigs is AS to actual clipping site
        #Negate the RECORD's strand. Negating the already-flipped `strand`
        #(as before) could never equal `strand`, so nothing was updated.
        if switchStrand:
            oStrand = str(-int(oStrand))
        else:
            oStrand = str(oStrand)

        if oChrom != chrom or oStrand != strand:
            continue

        contexts = coord_contexts.get(start, 'INTER').split(',')
        NX.context = ds.spotItem(contexts)

    NX.save()
Exemplo n.º 30
0
def profileAroundPoint(zeroPoint, span, cName, ratio=False, ratioCoord=None):
    '''Return {offset: value} for the offsets 1-span .. span-1 around a point.

    zeroPoint  -- tcc whose start coordinate is the point of interest
    span       -- half width; span=30 yields offsets -29 .. 29
    cName      -- passed through to svCoord
    ratio      -- when true, every value is divided by a reference value
    ratioCoord -- coordinate supplying the reference value; when falsy the
                  value at the zero point itself is used

    A reference value of 0 is replaced by 1 so the division is defined.
    '''
    chrom, strand, center, _ = cg.tccSplit(zeroPoint)
    lower = center - span
    upper = center + span

    window = cg.makeTcc(chrom, strand, lower, upper)
    values = svCoord([window], cName)

    #offsets are relative to the zero point
    offsets = range(1 - span, span)

    if not ratio:
        return dict((off, values[center + off]) for off in offsets)

    reference = values[ratioCoord] if ratioCoord else values[center]
    if reference == 0:
        reference = 1

    return dict((off, float(values[center + off]) / float(reference))
                for off in offsets)
Exemplo n.º 31
0
def scanVectorsFile(fN, tccList):
    '''Scan one wig file for every tcc in tccList and return {coord: value}.

    For each tcc the file is positioned with a cgIndex.lineIndex binary
    search and read forward until a line ends past the tcc end.  Every
    coordinate from the clipped line start up to, but not including, the
    clipped line end receives the integer part of that line's value.
    '''
    timer = cg.cgTimer()
    timer.start()

    coordDict = {}
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)

        #position the file at the first line relevant for this tcc
        #NOTE(review): the index is opened per tcc and never closed here
        fIndex = cgIndex.lineIndex(fN, header = True)
        fIndex.passCheckFunction(cgIndex.wigCheckFunction)
        fIndex.binarySearch(tcc)

        for line in fIndex.file:
            fields = cg.ss(line)
            lineBeg = int(fields[1])
            lineEnd = int(fields[2])
            value = int(fields[3].split('.')[0])

            #clip the line's interval to the requested tcc
            segBeg = tccStart if tccStart > lineBeg else lineBeg
            pastEnd = tccEnd < lineEnd
            segEnd = tccEnd if pastEnd else lineEnd

            coordDict.update(dict.fromkeys(range(segBeg, segEnd), value))

            if pastEnd:
                break

    return coordDict
Exemplo n.º 32
0
def profileAroundPoint(zeroPoint, span, cName, ratio = False, ratioCoord = None):
    '''Profile of wig values centred on the start of zeroPoint.

    The result maps relative positions (1-span up to span-1, so span=30
    gives -29..29) to the value svCoord reports at that position.  With
    ratio set, values are expressed as a fraction of the value found at
    ratioCoord (or at the zero point when ratioCoord is falsy); a zero
    reference is treated as 1.
    '''
    chrom, strand, zPoint, end = cg.tccSplit(zeroPoint)
    rStart = zPoint - span
    rEnd = zPoint + span
    scanDict = svCoord([cg.makeTcc(chrom, strand, rStart, rEnd)], cName)

    if ratio:
        base = scanDict[ratioCoord if ratioCoord else zPoint]
        if base == 0:
            base = 1

        def scale(value):
            return float(value) / float(base)
    else:
        def scale(value):
            return value

    #re-key so that the zero point sits at 0
    profile = {}
    for delta in range(1 - span, span):
        profile[delta] = scale(scanDict[zPoint + delta])

    return profile
Exemplo n.º 33
0
def returnContBlocks(profile, tcc, minLevel=5):
    '''Return tcc strings for the runs of coordinates whose level exceeds minLevel.

    profile  -- {coordinate: level}; levels are compared as int(level)
    tcc      -- only its chromosome and strand are used for the output tccs
    minLevel -- a coordinate belongs to a block when int(level) > minLevel

    Coordinates are visited in ascending order.  A block starts at the first
    coordinate above minLevel and ends one before the next coordinate that
    is not above it.  A block that is still open when the profile runs out
    is closed at the last coordinate instead of being dropped.
    '''
    chrom, strand, _start, _end = cg.tccSplit(tcc)

    blocks = []
    blockStart = None
    lastCoord = None

    for coord in sorted(profile):
        if int(profile[coord]) > minLevel:
            if blockStart is None:  #block start
                blockStart = coord
        elif blockStart is not None:  #level dropped: end the block
            blocks.append('%s:%s:%s:%s' % (chrom, strand, blockStart, coord - 1))
            blockStart = None
        lastCoord = coord

    #flush a block that reaches the end of the profile
    if blockStart is not None:
        blocks.append('%s:%s:%s:%s' % (chrom, strand, blockStart, lastCoord))

    return blocks
Exemplo n.º 34
0
def makeBins5(fN, fOut, typeFilter):
    '''Write 100 single-nucleotide bins per qualifying line of fN to fOut.

    fN         -- tab separated input; columns are id, type, tcc
    fOut       -- path of the output file (overwritten)
    typeFilter -- a line is kept only when this string occurs in its type

    Lines whose tcc spans fewer than 100 (end - start) are skipped.  For
    strand '1' bin i is start + i, for strand '-1' bin i is end - i, i.e.
    bin 0 sits on the start coordinate for '1' and on the end coordinate
    for '-1' (presumably the 5' end, as the function name suggests).
    Each output line is the id followed by the 100 bin tccs, tab separated;
    for any other strand value only the id is written.
    '''
    numBins = 100

    with open(fOut, 'w') as outFile:
        with open(fN, 'r') as inFile:
            for line in inFile:
                ls = line.strip().split('\t')
                seqID = ls[0]
                seqType, tcc = ls[1:3]
                chrom, strand, st, en = bioLibCG.tccSplit(tcc)

                #only take seqs that are long enough
                if en - st < numBins: continue
                if typeFilter not in seqType: continue

                if strand == '1':
                    coords = [st + i for i in range(numBins)]
                elif strand == '-1':
                    coords = [en - i for i in range(numBins)]
                else:
                    coords = []

                tccBins = [bioLibCG.makeTcc(chrom, strand, c, c) for c in coords]
                outFile.write('\t'.join([str(x) for x in [seqID] + tccBins]) + '\n')
Exemplo n.º 35
0
def makeWig(fN, assembly, format = None, name = None):
    '''Build a per-coordinate hit count for every chromosome/strand and write it.

    fN       -- mapped reads file; parsed line by line with the parser
                returned by returnParserFunction(format)
    assembly -- passed to cg.returnChromLengthDict and writeWigFromHitDict
    format   -- parser name, 'Bowtie' when not given
    name     -- defaults to the base name of fN

    Suitable for medium sized mapped files; slow, because the whole file is
    re-read once per chromosome and strand.  Every coordinate from start to
    end (inclusive) of a read on the current chromosome/strand is counted.

    NOTE(review): name is computed but never passed on, and
    writeWigFromHitDict receives a flat {coord: count} dict here -- confirm
    that this matches what writeWigFromHitDict expects.
    '''
    #assume bowtie
    if not format: format = 'Bowtie'
    parserFunction = returnParserFunction(format)
    if not name: name = cg.getBaseFileName(fN, naked = True)
    lDict = cg.returnChromLengthDict(assembly)

    for chrom in lDict:
        if chrom not in cg.acceptableChroms: continue
        for strand in ['1', '-1']:
            hitDict = {}
            #the file is closed again after each pass instead of leaking a
            #handle per chromosome/strand
            with open(fN, 'r') as f:
                #create hitmap of chrom and strand
                print('%s %s hitmap' % (chrom, strand))
                for line in f:
                    lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
                    lStrand = str(lStrand)
                    start = int(start)
                    end = int(end)
                    if chrom == lChrom and strand == lStrand:
                        for i in range(start, end + 1):
                            hitDict[i] = hitDict.get(i, 0) + 1

            #write results to wig file
            writeWigFromHitDict(hitDict, assembly)
Exemplo n.º 36
0
def truncate(fN):
    '''Write fN + '.trun' with every tcc shortened by 50 at one end.

    Input lines are tab separated with the tcc in the third column; only the
    first three columns are written back.  For strand '1' the end coordinate
    is reduced by 50, for strand '-1' the start coordinate is raised by 50
    (presumably trimming the 3' end).  Lines whose tcc spans fewer than 50
    (end - start) are skipped and counted.

    Prints "<skipped> <total>" when done.  Returns 1 after printing 'error'
    if a strand is neither '1' nor '-1'; otherwise returns None.  Both files
    are closed on every exit path.
    '''
    trim = 50
    tCount = 0
    shortCount = 0

    with open(fN + '.trun', 'w') as fOut:
        with open(fN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                tcc = ls[2]

                tCount += 1
                c, s, st, en = bioLibCG.tccSplit(tcc)

                if en - st < trim:
                    shortCount += 1
                    continue

                if s == '1':
                    en = en - trim
                elif s == '-1':
                    st = st + trim
                else:
                    print('error')
                    return 1

                ls[2] = bioLibCG.makeTcc(c, s, st, en)
                fOut.write('\t'.join(str(x) for x in ls[:3]) + '\n')

    print('%s %s' % (shortCount, tCount))
Exemplo n.º 37
0
def truncate5(fN):
    '''Write fN + '.trun5' with every tcc shortened by 50 at one end.

    Input lines are tab separated with the tcc in the third column; only the
    first three columns are written back.  For strand '1' the start
    coordinate is raised by 50, for strand '-1' the end coordinate is
    reduced by 50 (presumably trimming the 5' end, as the name suggests).
    Lines whose tcc spans fewer than 50 (end - start) are skipped and
    counted.

    Prints "<skipped> <total>" when done.  Returns 1 after printing 'error'
    if a strand is neither '1' nor '-1'; otherwise returns None.  Both files
    are closed on every exit path.
    '''
    trim = 50
    tCount = 0
    shortCount = 0

    with open(fN + '.trun5', 'w') as fOut:
        with open(fN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                tcc = ls[2]

                tCount += 1
                c, s, st, en = bioLibCG.tccSplit(tcc)

                if en - st < trim:
                    shortCount += 1
                    continue

                if s == '1':
                    st = st + trim
                elif s == '-1':
                    en = en - trim
                else:
                    print('error')
                    return 1

                ls[2] = bioLibCG.makeTcc(c, s, st, en)
                fOut.write('\t'.join(str(x) for x in ls[:3]) + '\n')

    print('%s %s' % (shortCount, tCount))
Exemplo n.º 38
0
def markCenterExpression(aFN, wigDir, rn=None, tn=None):
    '''Store, per alignment, the share of expression found in three central windows.

    For every alignment a scan range of sLength coordinates is derived from
    tTcc, tStart and a fixed extension of 25, and its expression profile is
    read from the wig dictionary.  centerExpression[aID] is set to three
    fractions: the expression inside positions [8:12], [7:13] and [6:14] of
    the range (ordered along the strand) divided by the total expression of
    the range.  The fractions stay 0.0 when the range has no expression or
    when tELevel does not occur among the range's values.
    '''
    extend = 25
    #slices of the strand-ordered coordinates that count as "centre"
    centerWindows = ((8, 12), (7, 13), (6, 14))

    timer = bioLibCG.cgTimer()
    timer.start()

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['centerExpression', 'tTcc', 'tStart', 'sLength', 'tELevel'],
             [rn, tn])

    #load expression of degradome
    wigDict = cgWig.loadWigDict(wigDir)

    for aID in aNX.centerExpression:
        aNX.centerExpression[aID] = [0.0, 0.0, 0.0]
        chrom, strand, start, end = bioLibCG.tccSplit(aNX.tTcc[aID])
        offset = aNX.tStart[aID]
        sLen = aNX.sLength[aID]

        if strand == '1':
            start = start - extend + offset
            end = start + sLen
        else:
            end = end + extend - offset
            start = end - sLen

        stretch = cgWig.getExpressionProfile(
            bioLibCG.makeTcc(chrom, strand, start, end), wigDict)
        levels = stretch.values()

        #make sure peak is in the small range
        peakInRange = aNX.tELevel[aID] in levels
        expressionSum = sum(levels)

        #walk the coordinates in strand direction
        orderedCoords = sorted(stretch.keys(), reverse=(strand == '-1'))

        if expressionSum == 0 or not peakInRange:
            continue

        for slot, (lo, hi) in enumerate(centerWindows):
            inWindow = sum((stretch[k] for k in orderedCoords[lo:hi]), 0.0)
            aNX.centerExpression[aID][slot] = inWindow / expressionSum

    aNX.save()
Exemplo n.º 39
0
def testPeaks(degFN, dForm, allGeneInfo, gForm, switchStrand = False):
    '''Assign a binomial p value (pValBin) to every peak in degFN.

    Gene level read and spot counts are loaded from allGeneInfo.  For each
    peak and each of its gene names, binom.sf(eLevel, reads + 1, q) is
    computed with q = 1 / (spots + 1); the peak's pValBin is the largest of
    these values, or -1.0 when none could be computed.  Gene names missing
    from the gene info are retried with a '_RE_<chrom>_<strand>' suffix and
    reported on stdout when still missing.  With switchStrand the strand
    used in that suffix is negated.  The peak file is saved at the end.
    '''
    #load/configure gene Info
    gNX = Nexus(allGeneInfo, gForm)
    gNX.load(['geneName', 'numReads', 'numSpots'])

    gName_numReads = {}
    gName_numSpots = {}
    while gNX.nextID():
        gName_numReads[gNX.geneName] = gNX.numReads
        gName_numSpots[gNX.geneName] = gNX.numSpots

    def known(geneName):
        return geneName in gName_numReads and geneName in gName_numSpots

    #load degFN info
    dNX = Nexus(degFN, dForm)
    dNX.load(['tcc', 'eLevel', 'geneNames', 'pValBin'])

    while dNX.nextID():

        gNames, readsForPeak = dNX.geneNames, dNX.eLevel
        chrom, strand, start, end = bioLibCG.tccSplit(dNX.tcc)
        if switchStrand:
            strand = -int(strand)

        pVals = []
        for gName in gNames:

            #may have to change gene name cuz of multiple spans
            if not known(gName):
                gName = gName + '_RE_%s_%s' % (chrom, strand)
                if not known(gName):
                    print('FIX THIS GENE NAME %s' % (gName,))
                    continue

            #add pseudocount
            totGeneReads = gName_numReads[gName] + 1
            numSpotsForGene = gName_numSpots[gName] + 1  # not sure whether to do this yet...

            #check for hidden intron gene overlap
            if numSpotsForGene == 0:
                continue  #intron gene
            q = 1.0 / numSpotsForGene

            #add p val
            pVals.append(binom.sf(readsForPeak, totGeneReads, q))

        dNX.pValBin = max(pVals) if pVals else -1.0

    dNX.save()
Exemplo n.º 40
0
def extendPeakTest(tcc, pRange, minVal, maxAvgNoise, minPeakLength, maxPeakLength, cName):
    '''Extend a peak outward from its position and decide whether to keep it.

    tcc           -- tcc whose start coordinate is the peak position
    pRange        -- half width of the ratio profile fetched around the peak
    minVal        -- a neighbouring position extends the peak while its
                     ratio value is above this
    maxAvgNoise   -- upper bound (exclusive) for the mean ratio value of the
                     profile positions outside the extended peak
    minPeakLength, maxPeakLength -- exclusive bounds for the peak length
    cName         -- passed through to stepVectorScan.profileAroundPoint

    Returns the tcc of the extended peak when both filters pass, otherwise
    False.  False is also returned when no profile position lies outside
    the peak, because the average noise is undefined then.
    '''
    chrom, strand, peakPosition, end = cg.tccSplit(tcc)
    cProfile = stepVectorScan.profileAroundPoint(tcc, pRange, cName, ratio = True)

    #extend this peak left and right, going from the middle outward
    leftRange = list(reversed(range(1 - pRange, 0)))
    rightRange = list(range(1, pRange))

    #left
    startFinal = leftRange[-1]
    for i in leftRange:
        if cProfile[i] > minVal:
            print(' extending stretch')
        else:
            print(' end of stretch L')
            startFinal = i + 1
            break

    #right
    endFinal = rightRange[-1]
    for i in rightRange:
        if cProfile[i] > minVal:
            print(' extending stretch')
        else:
            print(' end of stretch R')
            endFinal = i - 1
            break

    peakLength = endFinal - startFinal + 1

    #avg expression around peak check...
    #get total expression outside the peak
    low = startFinal
    high = endFinal
    lowRange = list(range(1 - pRange, low))
    highRange = list(range(high + 1, pRange))
    totalLength = len(lowRange) + len(highRange)
    print('%s %s %s %s %s %s' % (totalLength, pRange, low, high, lowRange, highRange))

    #nothing outside the peak: no noise estimate possible
    if totalLength == 0:
        return False

    noiseExpression = 0
    for i in lowRange + highRange:
        noiseExpression += cProfile[i]
    avgNoise = noiseExpression / float(totalLength)

    #filter out peaks that look a certain way.
    if (minPeakLength < peakLength < maxPeakLength) and (avgNoise < maxAvgNoise):
        goodTcc = cg.makeTcc(chrom, strand, peakPosition + startFinal, peakPosition + endFinal)
        print('*KEEPER')
        return goodTcc

    return False
Exemplo n.º 41
0
def updateTypeAlignment(oFN, wigDir, chrom, strand, rn = None, tn = None):
    '''Set the 'type' field of alignments on chrom/strand.  ALIGNMENTS only, not deg peaks.

    The tType wig of the opposite strand is loaded, and each alignment's
    strand is flipped as well before it is compared (deg wigs are AS to the
    actual clipping site).  For matching alignments the transcript types
    found at the alignment's start coordinate are reduced to the dominant
    one, following the priority order below, and stored; 'None' is used
    when the wig has no entry.  The file is saved at the end.
    '''
    def antisense(s):
        #any value other than '1' maps to '1'
        return '-1' if s == '1' else '1'

    domOrder = [
        'microRNA_noncoding',
        'lincRNA_noncoding',
        'longNC_noncoding',
        'miRNA_pseudogene_noncoding',
        'Mt_rRNA_noncoding',
        'Mt_tRNA_noncoding',
        'Mt_tRNA_pseudogene_noncoding',
        'rRNA_noncoding',
        'rRNA_pseudogene_noncoding',
        'scRNA_pseudogene_noncoding',
        'snoRNA_noncoding',
        'snoRNA_pseudogene_noncoding',
        'snRNA_noncoding',
        'snRNA_pseudogene_noncoding',
        'tRNA_pseudogene_noncoding',
        'pseudogene_noncoding',
        'protein_coding',
        'None',
    ]

    oNX = cgNexusFlat.Nexus(oFN, cgAlignmentFlat.cgAlignment)
    oNX.load(['tTcc', 'type'], [rn, tn])

    strand = antisense(strand)

    print('loading wig')
    coord_types = cgWig.loadSingleWigContext(wigDir, chrom, strand, 'tType')
    print('done loading')

    ds = bioLibCG.dominantSpotter(domOrder)

    for oID in oNX.tTcc:
        oChrom, oStrand, start, end = bioLibCG.tccSplit(oNX.tTcc[oID])

        #deg wigs is AS to actual clipping site
        if oChrom != chrom or antisense(oStrand) != strand:
            continue

        #wig entries look like '<something>:<type>'; keep the type part
        uniqueTypes = set()
        for entry in coord_types.get(start, 'None').split(','):
            uniqueTypes.add('None' if entry == 'None' else entry.split(':')[1])

        oNX.type[oID] = ds.spotItem(list(uniqueTypes))

    oNX.save()
Exemplo n.º 42
0
def countWithBinsSetReads(readFN, binDir, type='INTRON'):
    '''Count, per bin index, the distinct read positions that fall into that bin.

    readFN -- tab separated reads; columns 2-4 are strand, chrom, start
    binDir -- directory holding '<type>.<chrom>.<strand>.bins' files, one
              per chromosome in bioLibCG.humanChromosomes and strand; each
              line carries an id followed by 100 bin tccs
    type   -- prefix of the bin file names

    A read mapped to '+' is stored on strand '-1' at start + 20, any other
    read on strand '1' at start.  Positions are kept in sets, so identical
    positions are counted once.  Prints "<bin>\\t<count>" for each of the
    100 bins; nothing is returned.
    '''
    numBins = 100
    strands = ('1', '-1')

    #create bin sets: (chrom, strand) -> one coordinate set per bin index
    c_s_bin_set = {}
    for chrom in bioLibCG.humanChromosomes:
        for strand in strands:
            binSets = [set() for _ in range(numBins)]
            c_s_bin_set[(chrom, strand)] = binSets
            binFN = binDir + '/%s.%s.%s.bins' % (type, chrom, strand)
            with open(binFN, 'r') as f:
                for line in f:
                    tccs = line.strip().split('\t')[1:numBins + 1]
                    for i in range(numBins):
                        ch, st, sta, end = bioLibCG.tccSplit(tccs[i])
                        binSets[i].update(range(sta, end + 1))

    print('creating read sets')
    #create read set: (chrom, strand) -> set of read positions
    c_s_set = {}
    for chrom in bioLibCG.humanChromosomes:
        for strand in strands:
            c_s_set[(chrom, strand)] = set()

    with open(readFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            strand, chrom, start = ls[1:4]
            if chrom not in bioLibCG.humanChromosomes: continue
            start = int(start)
            if strand == '+':
                strand = '-1'
                start += 20
            else:
                strand = '1'

            c_s_set[(chrom, strand)].add(start)

    print('counting')
    binCounts = [0] * numBins

    #count for each bin; the size of the intersection equals the number of
    #read positions contained in the bin
    for i in range(numBins):
        for key in c_s_set:
            binCounts[i] += len(c_s_set[key] & c_s_bin_set[key][i])

        print('%s\t%s' % (i, binCounts[i]))
Exemplo n.º 43
0
def countWithBinsSetReads(readFN, binDir, type = 'INTRON'):
    '''Print how many distinct read positions land in each of 100 bins.

    readFN -- tab separated reads; columns 2-4 are strand, chrom, start
    binDir -- directory with one '<type>.<chrom>.<strand>.bins' file per
              chromosome of bioLibCG.humanChromosomes and strand; every
              line holds an id followed by 100 bin tccs
    type   -- prefix of the bin file names

    Reads on '+' are recorded on strand '-1' at start + 20, all other reads
    on strand '1' at start.  Because positions are stored in sets, a
    position shared by several reads counts once.  One line
    "<bin>\\t<count>" is printed per bin; the function returns None.
    '''
    numBins = 100
    chroms = bioLibCG.humanChromosomes
    strands = ('1', '-1')

    #create bin sets: chrom -> strand -> list of coordinate sets, one per bin
    c_s_bin_set = {}
    for chrom in chroms:
        c_s_bin_set[chrom] = {}
        for strand in strands:
            perBin = [set() for _ in range(numBins)]
            c_s_bin_set[chrom][strand] = perBin
            with open(binDir + '/%s.%s.%s.bins' % (type, chrom, strand), 'r') as binFile:
                for line in binFile:
                    ls = line.strip().split('\t')
                    tccs = ls[1:numBins + 1]
                    for i in range(numBins):
                        ch, st, sta, end = bioLibCG.tccSplit(tccs[i])
                        perBin[i].update(range(sta, end + 1))

    print('creating read sets')
    #create read set: chrom -> strand -> set of read positions
    c_s_set = {}
    for chrom in chroms:
        c_s_set[chrom] = dict((strand, set()) for strand in strands)

    with open(readFN, 'r') as readFile:
        for line in readFile:
            ls = line.strip().split('\t')
            strand, chrom, start = ls[1:4]
            if chrom not in chroms: continue
            start = int(start)
            if strand == '+':
                strand = '-1'
                start += 20
            else:
                strand = '1'

            c_s_set[chrom][strand].add(start)

    print('counting')
    binCounts = [0] * numBins

    #count for each bin via set intersection rather than testing every
    #read position one by one
    for i in range(numBins):
        for chrom in chroms:
            for strand in strands:
                hits = c_s_set[chrom][strand] & c_s_bin_set[chrom][strand][i]
                binCounts[i] += len(hits)

        print('%s\t%s' % (i, binCounts[i]))
Exemplo n.º 44
0
def test(tcc, wigDir):
    '''Print the 'ALL' wig level for every coordinate of tcc.

    Prints the chromosome and strand first, then one "<coord> <level>" line
    for each coordinate from start to end (inclusive); coordinates missing
    from the wig are reported as 0.
    '''
    chrom, strand, start, end = bioLibCG.tccSplit(tcc)
    print('%s %s' % (chrom, strand))
    coord_eLevel = cgWig.loadSingleWig(wigDir, chrom, strand, 'ALL')

    for i in range(start, end + 1):
        print('%s %s' % (i, coord_eLevel.get(i, 0)))
Exemplo n.º 45
0
def getExpressionProfile(tcc, wigDict):
    '''Return {coord: level} for every coordinate of tcc (assumes 1 based).

    Levels come from wigDict[chrom][strand]; coordinates without an entry
    get 0.  Both the start and the end coordinate are included.
    '''
    chrom, strand, start, end = bioLibCG.tccSplit(tcc)
    return dict((pos, wigDict[chrom][strand].get(pos, 0))
                for pos in range(start, end + 1))
Exemplo n.º 46
0
def addOne(fN):
    '''Print the tcc in the first column of every line of fN, shifted by +1.

    Both the start and the end coordinate are increased by one.  The input
    file is closed when the function returns or raises.
    '''
    with open(fN, 'r') as f:
        for line in f:
            tcc = line.strip().split('\t')[0]
            chrom, strand, start, end = bioLibCG.tccSplit(tcc)
            print(bioLibCG.makeTcc(chrom, strand, start + 1, end + 1))
Exemplo n.º 47
0
def getExpressionProfile(tcc, wigDict):
    '''Map each coordinate of tcc to its wig level (assumes 1 based).

    The range runs from the tcc start through the tcc end, inclusive.
    A coordinate absent from wigDict[chrom][strand] is given the level 0.
    '''
    chrom, strand, first, last = bioLibCG.tccSplit(tcc)

    profile = {}
    pos = first
    while pos <= last:
        profile[pos] = wigDict[chrom][strand].get(pos, 0)
        pos += 1

    return profile
Exemplo n.º 48
0
def mapStartRangeCheckFunction(val, line):
    '''Return 0 when the tcc val overlaps the interval described by line, else -1.

    line is tab separated; its fourth field is the interval start and the
    length of its fifth field (presumably the read sequence) gives the
    interval's extent.  Overlap is decided by cg.simpleOverlap.
    '''
    fields = line.strip().split('\t')
    lineStart = int(fields[3])
    lineEnd = lineStart + len(fields[4])

    chrom, strand, start, end = cg.tccSplit(val)
    start = int(start)
    end = int(end)

    if cg.simpleOverlap(start, end, lineStart, lineEnd):
        return 0
    return -1
Exemplo n.º 49
0
def mapStartCheckFunction(val, line):
    '''Compare the start of the tcc val with the start found in line.

    line is tab separated and its fourth field is the start coordinate.
    Returns -1 when val starts before it, 1 when val starts after it and
    0 when both starts are equal.
    '''
    lineStart = int(line.strip().split('\t')[3])
    chrom, strand, start, end = cg.tccSplit(val)
    start = int(start)

    if start < lineStart:
        return -1
    if start > lineStart:
        return 1
    return 0
Exemplo n.º 50
0
def makeWigMem(fN, assembly, format=None, name=None, directory=None):
    '''Count read start positions in memory and write them as a wig.

    fN        -- mapped reads file; the first line is always skipped as a
                 header (the file might not have one, but it is one read)
    assembly  -- passed to cg.returnChromLengthDict and writeWigFromHitDict
    format    -- parser name for returnParserFunction, 'Bowtie' by default
    name      -- defaults to the base name of fN
    directory -- passed through to writeWigFromHitDict

    Suitable for small mapped files.  One position is counted per read:
    start + 20 for strand '1', start otherwise (wig for degradome).  Lines
    for which parsing raises AttributeError are skipped, as are chromosomes
    outside cg.acceptableChroms.  The input file is closed even when
    parsing fails.
    '''
    if not name: name = cg.getBaseFileName(fN, naked=True)
    if not format: format = 'Bowtie'
    parserFunction = returnParserFunction(format)

    #NOTE(review): the result is unused; the call is kept in case it is
    #relied on to reject an unknown assembly -- confirm
    lDict = cg.returnChromLengthDict(assembly)

    #create hitmap of chrom and strand
    hitDict = {}  #format = chr: { strand : { coord : value
    with open(fN, 'r') as f:
        f.readline()  #header...file might not have one but its one read...

        for line in f:
            try:
                lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
            except AttributeError:
                continue
            lStrand = str(lStrand)
            start = int(start)
            end = int(end)  #unused below, but still rejects a malformed end
            if lChrom not in cg.acceptableChroms:
                continue

            #wig for degradome
            if lStrand == '1':
                i = start + 20
            else:
                i = start

            strandHits = hitDict.setdefault(lChrom, {}).setdefault(lStrand, {})
            strandHits[i] = strandHits.get(i, 0) + 1

    #write results to wig file
    writeWigFromHitDict(hitDict, assembly, name, directory)
Exemplo n.º 51
0
def mapStartCheckFunction(val, line):
    '''Three way comparison of the start of tcc val against line's start.

    The fourth tab separated field of line is taken as its start.  The
    result is -1, 0 or 1 for val starting before, at or after that start.
    '''
    fields = line.strip().split('\t')
    lineStart = int(fields[3])

    chrom, strand, start, end = cg.tccSplit(val)
    valStart = int(start)

    #bool arithmetic gives the sign of the difference as a plain int
    return (valStart > lineStart) - (valStart < lineStart)
Exemplo n.º 52
0
def mapStartRangeCheckFunction(val, line):
    '''Report whether the tcc val overlaps the interval described by line.

    The interval starts at the fourth tab separated field of line and
    extends by the length of the fifth field.  Returns 0 on overlap (as
    judged by cg.simpleOverlap) and -1 otherwise.
    '''
    lineStart = int(line.strip().split('\t')[3])
    lineEnd = lineStart + len(line.strip().split('\t')[4])

    chrom, strand, start, end = cg.tccSplit(val)
    overlaps = cg.simpleOverlap(int(start), int(end), lineStart, lineEnd)

    return 0 if overlaps else -1
Exemplo n.º 53
0
def svCoord(tccList, config = None):
    '''Scan the configured organism wig files and return {coord: value}.

    The wig location comes from the configuration returned by
    c.getConfig(config): when 'wigChromSplit' is the string 'True', the
    file '<wigSetName>.<chrom>.<strand>.wig' is used, otherwise
    'Merge.<organism>.<strand>.wig', both inside 'wigSetDir'.

    For every tcc the file is positioned by binary search and read until a
    line ends past the tcc end.  A line's begin is shifted by +1 and its
    interval, clipped to the tcc, is filled inclusively with the integer
    part of the line's value.
    '''
    #init
    config = c.getConfig(config)
    settings = config.conf
    org = settings['organism']
    wigDir = settings['wigSetDir']
    wigSetName = settings['wigSetName']
    splitIntoChroms = (settings['wigChromSplit'] == 'True')

    coordDict = {}
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)

        if splitIntoChroms:
            fN = wigDir + '/%s.%s.%s.wig' % (wigSetName, chrom, strand)
        else:
            fN = wigDir + '/Merge.%s.%s.wig' % (org.lower(), strand)

        fIndex = cgIndex.lineIndex(fN, header = True)
        fIndex.passCheckFunction(cgIndex.wigCheckFunction)
        fIndex.binarySearch(tcc)  #file pointer now sits at the beginning of the tcc

        for line in fIndex.file:
            fields = cg.ss(line)
            lineBeg = int(fields[1]) + 1
            lineEnd = int(fields[2])
            value = int(fields[3].split('.')[0])

            #clip the line's interval to the requested tcc
            first = tccStart if tccStart > lineBeg else lineBeg
            isLastLine = tccEnd < lineEnd
            last = tccEnd if isLastLine else lineEnd

            for coord in range(first, last + 1):
                coordDict[coord] = value

            if isLastLine:
                break

        fIndex.close()  #close the file and the index after use...

    return coordDict
Exemplo n.º 54
0
def peakToSeq(peakFN, extend, outFN):
    '''Write one genomic sequence per line of peakFN to outFN.

    peakFN -- tab separated file whose first column is a tcc
    extend -- number (or numeric string) subtracted from each start and
              added to each end before the sequence is fetched
    outFN  -- output path; receives one sequence per input line
    '''
    #extend is +25 for degradome and -6/-4 for oRNA
    extend = int(extend)
    # sequences are fetched from the 'hg19' assembly
    gf = GenomeFetch.GenomeFetch('hg19')

    outF = open(outFN, 'w')
    f = open(peakFN, 'r')
    for line in f:
        ls = line.strip().split('\t')
        chrom, strand, start, end = bioLibCG.tccSplit(ls[0])
        # widen (or, for a negative extend, shrink) the interval on both sides
        start, end = start - extend, end + extend
        newTcc = bioLibCG.makeTcc(chrom, strand, start, end)
        outF.write(gf.getSequence(newTcc) + '\n')