def transcriptSetOverlap(aDir, AS):
    """Flag every OriginRNA record in ``aDir`` with whether it overlaps a transcript.

    The records are loaded through ``cgNexusFlat.dataController``, each one
    gets its ``transcriptOverlap`` attribute set to True or False, and the
    whole collection is committed back through the same controller.

    Parameters:
        aDir: location handed to ``cgNexusFlat.dataController``.
        AS: truthy to test the ``cg.convertToAS`` form of each record's tcc
            instead of the tcc itself (the comment in the original code says
            the AS peak is the location of the targeted transcript).

    Returns:
        None. The result is stored on the records and committed.
    """
    AS = bool(AS)

    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    #get degradome TCCS
    #note that you need to test the AS peaks, this is the location of the targetted transcript
    oRNA_DC = cgNexusFlat.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()

    # Snapshot the records once so each one stays paired with the coordinate
    # that was tested for it; the conversion then runs once per record.
    records = list(id_oRNA.values())
    if AS:
        degTccs = [cg.convertToAS(obj.tcc) for obj in records]
    else:
        degTccs = [obj.tcc for obj in records]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    #write the verdict back onto every record
    for obj, degTcc in zip(records, degTccs):
        if degTcc in overlappingDegTccs:
            obj.transcriptOverlap = True
        else:
            obj.transcriptOverlap = False

    oRNA_DC.commit(id_oRNA)
Exemplo n.º 2
0
def transcriptSetOverlap(aDir, AS):
    """Mark each OriginRNA record in ``aDir`` as overlapping a transcript or not.

    Records are loaded with ``cgDB.dataController``; every record receives a
    boolean ``transcriptOverlap`` attribute and the collection is committed
    back. When ``AS`` is truthy the ``cg.convertToAS`` form of each tcc is
    tested rather than the tcc itself.
    """
    useAntisense = bool(AS)

    transcriptFile = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    geneSet = cgGenes.createGeneSetFromFile(transcriptFile)

    controller = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    originRNAs = controller.load()

    def testedTcc(record):
        # Coordinate actually compared against the transcript set.
        if useAntisense:
            return cg.convertToAS(record.tcc)
        return record.tcc

    queryTccs = [testedTcc(record) for record in originRNAs.values()]

    # Transcripts hit by any query, then the queries that hit one of them.
    hitTranscripts = geneSet.transcriptOverlaps(queryTccs)
    hitTranscriptTccs = [transcript.tcc for transcript in hitTranscripts]
    overlappingTccs = compare.compareTwoTcc(queryTccs, hitTranscriptTccs, 1)

    # Store the verdict on each record.
    for record in originRNAs.values():
        if testedTcc(record) in overlappingTccs:
            record.transcriptOverlap = True
        else:
            record.transcriptOverlap = False

    controller.commit(originRNAs)
Exemplo n.º 3
0
def transcriptSetOverlapTargets(aDir):
    """Flag every alignment in ``aDir`` with whether its target site overlaps a transcript.

    Alignments are loaded through ``cgDB.dataController``. For each one a
    target-site tcc is derived from ``tTcc``, ``tStart`` and ``sLength``,
    converted with ``cg.convertToAS`` and tested against the transcript set.
    The boolean result is stored in ``transcriptOverlap`` and the collection
    is committed back.

    Fix over the previous version: the update loop used the leftover
    ``alignment`` variable from the first loop instead of the record being
    updated, so every record received the verdict of the last alignment.

    Parameters:
        aDir: location handed to ``cgDB.dataController``.

    Returns:
        None.
    """
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    def targetSiteTcc(aln):
        # Derive the target-site coordinate for one alignment.
        chrom, strand, start, end = cg.tccSplit(aln.tTcc)
        offset = aln.tStart
        sLen = aln.sLength

        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        return cg.makeTcc(chrom, strand, start, end)

    #create list of unique tccs (first-seen order preserved).
    # NOTE(review): assumes cg.makeTcc returns a hashable value (tccs are
    # handled as text elsewhere in this file) -- confirm.
    uniqTccs = []
    seenTccs = set()
    for alignment in id_alignment.values():
        tcc = targetSiteTcc(alignment)
        if tcc not in seenTccs:
            seenTccs.add(tcc)
            uniqTccs.append(tcc)

    #note that you need to test the AS peaks, this is the location of the targetted transcript
    degTccs = [cg.convertToAS(x) for x in uniqTccs]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    #update: the coordinate must come from the record being updated
    for obj in id_alignment.values():
        degTcc = cg.convertToAS(targetSiteTcc(obj))

        if degTcc in overlappingDegTccs:
            obj.transcriptOverlap = True
        else:
            obj.transcriptOverlap = False

    aDC.commit(id_alignment)
Exemplo n.º 4
0
def transcriptSetOverlapTargets(aDir):
    """Flag every alignment in ``aDir`` with whether its target site overlaps a transcript.

    Alignments are loaded through ``cgDB.dataController``. For each one a
    target-site tcc is derived from ``tTcc``, ``tStart`` and ``sLength``,
    converted with ``cg.convertToAS`` and tested against the transcript set.
    The boolean result is stored in ``transcriptOverlap`` and the collection
    is committed back.

    Fixes over the previous version: the update loop used the leftover
    ``alignment`` variable from the first loop instead of the record being
    updated, so every record received the verdict of the last alignment;
    the mixed tab/space indentation is normalised to spaces.

    Parameters:
        aDir: location handed to ``cgDB.dataController``.

    Returns:
        None.
    """
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    def targetSiteTcc(aln):
        # Derive the target-site coordinate for one alignment.
        chrom, strand, start, end = cg.tccSplit(aln.tTcc)
        offset = aln.tStart
        sLen = aln.sLength

        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        return cg.makeTcc(chrom, strand, start, end)

    #create list of unique tccs (first-seen order preserved).
    # NOTE(review): assumes cg.makeTcc returns a hashable value (tccs are
    # handled as text elsewhere in this file) -- confirm.
    uniqTccs = []
    seenTccs = set()
    for alignment in id_alignment.values():
        tcc = targetSiteTcc(alignment)
        if tcc not in seenTccs:
            seenTccs.add(tcc)
            uniqTccs.append(tcc)

    #note that you need to test the AS peaks, this is the location of the targetted transcript
    degTccs = [cg.convertToAS(x) for x in uniqTccs]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    #update: the coordinate must come from the record being updated
    for obj in id_alignment.values():
        degTcc = cg.convertToAS(targetSiteTcc(obj))

        if degTcc in overlappingDegTccs:
            obj.transcriptOverlap = True
        else:
            obj.transcriptOverlap = False

    aDC.commit(id_alignment)
def transcriptSetOverlapDegFileHitmap(degFile, runningChrom, runningStrand):
    """Annotate the lines of ``degFile`` on one chromosome/strand with a transcript-overlap flag.

    A set of every coordinate covered by a transcript on
    ``runningChrom``/``runningStrand`` is built first. Each line of
    ``degFile`` whose second tab-separated field (after ``cg.convertToAS``)
    lies on that chromosome and strand is then passed to
    ``cg.appendToLine(line, flag, 3)``, where ``flag`` is ``'1'`` if any
    coordinate of the range is in the set and ``'0'`` otherwise. Lines on
    other chromosomes or strands are dropped. The result is written to
    ``degFile + '.<runningChrom>.<runningStrand>'``.

    Changes over the previous version: both files are opened with ``with``
    so they are closed on errors too, the per-transcript coordinate list is
    no longer materialised, and the mixed tab/space indentation is
    normalised to spaces.

    Parameters:
        degFile: path of the tab-separated input file.
        runningChrom: chromosome to process, compared with the value
            returned by ``cg.tccSplit``.
        runningStrand: strand to process, compared the same way.

    Returns:
        None.
    """
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    #create hitmap: every coordinate covered by a transcript on the running chrom/strand
    coordSet = set()
    for gene in allExons.set.values():
        for transcript in gene.transcripts:
            chrom, strand, start, end = cg.tccSplit(transcript.tcc)
            if chrom != runningChrom or strand != runningStrand:
                continue
            # end is inclusive
            coordSet.update(xrange(start, end + 1))

    #find overlapping degTccs
    print('done creating hitmap')

    newLines = []
    with open(degFile, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            degTcc = cg.convertToAS(ls[1])
            chrom, strand, start, end = cg.tccSplit(degTcc)
            if chrom != runningChrom or strand != runningStrand:
                continue

            # any() stops at the first covered coordinate
            if any(i in coordSet for i in xrange(start, end + 1)):
                inTran = '1'
            else:
                inTran = '0'

            #update newLines
            newLines.append(cg.appendToLine(line, inTran, 3))

    with open(degFile + '.%s.%s' % (runningChrom, runningStrand), 'w') as f:
        f.writelines(newLines)
Exemplo n.º 6
0
def transcriptSetOverlapDegFileHitmap(degFile, runningChrom, runningStrand):
    """Write a copy of ``degFile`` restricted to one chromosome/strand with an overlap flag.

    Every coordinate covered by a transcript on ``runningChrom`` /
    ``runningStrand`` goes into a set. Each input line whose second
    tab-separated field (after ``cg.convertToAS``) is on that chromosome and
    strand is passed to ``cg.appendToLine(line, flag, 3)`` with flag ``'1'``
    when the range touches the set, else ``'0'``. Output goes to
    ``degFile + '.<runningChrom>.<runningStrand>'``.
    """
    transcriptFile = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    geneSet = cgGenes.createGeneSetFromFile(transcriptFile)
    allTranscriptTccs = [transcript.tcc
                         for gene in geneSet.set.values()
                         for transcript in gene.transcripts]

    def onRunningStrand(chrom, strand):
        # True only for the chromosome/strand pair being processed.
        return chrom == runningChrom and strand == runningStrand

    # Hitmap of covered coordinates (end is inclusive).
    covered = set()
    for transcriptTcc in allTranscriptTccs:
        chrom, strand, start, end = cg.tccSplit(transcriptTcc)
        if onRunningStrand(chrom, strand):
            covered.update(range(start, end + 1))

    print('done creating hitmap')

    # Annotate the matching lines of the input file.
    annotated = []
    inFile = open(degFile, 'r')
    for rawLine in inFile:
        fields = rawLine.strip().split('\t')
        chrom, strand, start, end = cg.tccSplit(cg.convertToAS(fields[1]))
        if not onRunningStrand(chrom, strand):
            continue

        touched = any(pos in covered for pos in xrange(start, end + 1))
        flag = '1' if touched else '0'
        annotated.append(cg.appendToLine(rawLine, flag, 3))
    inFile.close()

    outFile = open(degFile + '.%s.%s' % (runningChrom, runningStrand), 'w')
    outFile.writelines(annotated)
    outFile.close()
def transcriptSetOverlapDegFile(degFile):
    """Compute a transcript-overlap flag for every line of ``degFile``.

    The second tab-separated field of each line is converted with
    ``cg.convertToAS`` and tested against the transcript set. Each line is
    then passed to ``cg.appendToLine(line, flag, 3)`` with flag ``'1'`` when
    its coordinate overlaps and ``'0'`` otherwise.

    Fixes over the previous version: the second pass never re-split the
    current line, so it always tested the coordinate of the last line read
    in the first pass; the annotated line was computed and then discarded;
    the file is now closed on errors too; mixed tab/space indentation is
    normalised to spaces.

    Parameters:
        degFile: path of the tab-separated input file.

    Returns:
        None.
    """
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    #get degradome TCCS
    #note that you need to test the AS peaks, this is the location of the targetted transcript
    with open(degFile, 'r') as f:
        lines = f.readlines()

    # One converted coordinate per input line, kept in line order.
    degTccs = [cg.convertToAS(line.strip().split('\t')[1]) for line in lines]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    newLines = []
    for line, degTcc in zip(lines, degTccs):
        inTran = '0'
        if degTcc in overlappingDegTccs:
            inTran = '1'

        #update newLines
        newLines.append(cg.appendToLine(line, inTran, 3))

    # NOTE(review): the visible code never writes newLines anywhere; the
    # output step appears to be missing -- confirm the intended destination.
Exemplo n.º 8
0
def transcriptSetOverlapDegFile(degFile):
    """Compute a transcript-overlap flag for every line of ``degFile``.

    The second tab-separated field of each line is converted with
    ``cg.convertToAS`` and tested against the transcript set. Each line is
    then passed to ``cg.appendToLine(line, flag, 3)`` with flag ``'1'`` when
    its coordinate overlaps and ``'0'`` otherwise.

    Fixes over the previous version: the second pass never re-split the
    current line, so it always tested the coordinate of the last line read
    in the first pass; the annotated line was computed and then discarded;
    the file is now closed on errors too.

    Parameters:
        degFile: path of the tab-separated input file.

    Returns:
        None.
    """
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    #get degradome TCCS
    #note that you need to test the AS peaks, this is the location of the targetted transcript
    with open(degFile, 'r') as f:
        lines = f.readlines()

    # One converted coordinate per input line, kept in line order.
    degTccs = [cg.convertToAS(line.strip().split('\t')[1]) for line in lines]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    newLines = []
    for line, degTcc in zip(lines, degTccs):
        inTran = '0'
        if degTcc in overlappingDegTccs:
            inTran = '1'

        #update newLines
        newLines.append(cg.appendToLine(line, inTran, 3))

    # NOTE(review): the visible code never writes newLines anywhere; the
    # output step appears to be missing -- confirm the intended destination.
Exemplo n.º 9
0
import getHairpins
import cgGenes
import cgConfig as c
import bioLibCG as cg

# Script: look up, for every hairpin in the results file, the type of the
# first overlapping gene and build annotated copies of the file's lines.
mConf = c.getConfig('Main.conf')
geneSetFolder = mConf.conf['geneSetsHuman']

fN = '/home/chrisgre/projects/NoncodingHuman/results/NChuman-s3k8b17.results.sorted.introns.sorted'
cHairs = getHairpins.getHairpins(fN)

ensGenes = cgGenes.createGeneSetFromFile(geneSetFolder + '/ensemblAllTranscripts.tsv')

# CID -> type of the first overlapping gene, "NONE" when nothing overlaps
cDesc = {}
for CID in cHairs:
    tcc = cHairs[CID]
    overlappingGenes = ensGenes.geneOverlaps([tcc])
    if len(overlappingGenes) > 0:
        print(overlappingGenes[0].type)
        cDesc[CID] = overlappingGenes[0].type
    else:
        cDesc[CID] = "NONE"

# The CID is read from the 8th tab-separated field of each line; the
# description is handed to cg.appendToLine with position 16.
newLines = []
f = open(fN, 'r')
for line in f:
    fields = line.strip().split('\t')
    CID = fields[7]
    newLines.append(cg.appendToLine(line, cDesc[CID], 16))
f.close()
Exemplo n.º 10
0
import cgGenes
import compareData as compare
import cgConfig as c

cName = 'mm9.conf'
mConf = c.getConfig('Main.conf')
conf = c.getConfig(cName)
organism = conf.conf['organism']
geneSetFolder = mConf.conf['geneSets%s' % organism]
genes = cgGenes.createGeneSetFromFile(geneSetFolder + '/allTransciptsType.tsv')
peakTccs = compare.tccFileToList('peakData.500.mm9', 0)


tOverlaps = genes.transcriptOverlaps(peakTccs)
typeDict = {}
for transcript in tOverlaps:
	if transcript.type not in typeDict:
		typeDict[transcript.type] = 1
	else:
		typeDict[transcript.type] += 1

#count the amounts of each type for each transcript
amount = {}
for gene in genes.genes:
	for t in gene.transcripts:
		if t.type in amount:
			amount[t.type] += 1
		else:
			amount[t.type] = 1

print 'Total Peaks:', len(peakTccs)
Exemplo n.º 11
0
import getHairpins
import cgGenes
import cgConfig as c
import bioLibCG as cg

# Script: look up, for every hairpin in the results file, the type of the
# first overlapping gene and build annotated copies of the file's lines.
mConf = c.getConfig('Main.conf')
geneSetFolder = mConf.conf['geneSetsHuman']

fN = '/home/chrisgre/projects/NoncodingHuman/results/NChuman-s3k8b17.results.sorted.introns.sorted'
cHairs = getHairpins.getHairpins(fN)

ensGenes = cgGenes.createGeneSetFromFile(geneSetFolder + '/ensemblAllTranscripts.tsv')

# CID -> type of the first overlapping gene, "NONE" when nothing overlaps
cDesc = {}
for CID in cHairs:
    tcc = cHairs[CID]
    overlappingGenes = ensGenes.geneOverlaps([tcc])
    if len(overlappingGenes) > 0:
        print(overlappingGenes[0].type)
        cDesc[CID] = overlappingGenes[0].type
    else:
        cDesc[CID] = "NONE"

# The CID is read from the 8th tab-separated field of each line; the
# description is handed to cg.appendToLine with position 16.
newLines = []
f = open(fN, 'r')
for line in f:
    fields = line.strip().split('\t')
    CID = fields[7]
    newLines.append(cg.appendToLine(line, cDesc[CID], 16))
f.close()
Exemplo n.º 12
0
def exonNoisy(cName = None):
	#init
	mConf = c.cgConfig('Main.conf')
	conf = c.getConfig(cName)
	cHairs = getHairpins.getHairpins(conf.conf['resultsExons']) #CID: HAIRPIN
	organism = conf.conf['organism']
	geneSetFolder = mConf.conf['geneSets%s' % organism]
	
	#make prediction overlap hitmap
	print 'Making prediction list'
	predList = [] 
	for CID in cHairs:
		hPin = cHairs[CID]
		predList.append(hPin)
	
	if compare.checkIfOverlaps(predList):
		predList = compare.collapseOverlaps(predList)
	
	
	#make genes for Ensemble/make list of tccs for exons.
	print 'Creating gene set'
	
	ensGenes = cgGenes.createGeneSetFromFile(geneSetFolder + '/ensemblAllExons.tsv')
	print '  loaded # genes:', len(ensGenes.set)
	
	
	#collect levels for each haipin region
	print '[Checking all levels]'
	cidLevels = {}
	for CID in cHairs:
		print CID
		hPin = cHairs[CID]
			
		#for each hairpin, --> find overlapping transcripts in same gene
		overlappingGenes = ensGenes.geneOverlaps([hPin])
		if len(overlappingGenes) > 0:
			gIDs = [gene.id for gene in overlappingGenes]
			allTccs = ensGenes.getTccsFromGIDs(gIDs)
			if compare.checkIfOverlaps:
				print '  Overlaps...collapsing'
				allTccs = compare.collapseOverlaps(allTccs)
		else:
			print 'NO GENE OVERLAPS!!!!!', CID, hPin
		
		
		#filter out my predictions.
		print '  Filtering out predictions'
		checkList = compare.subtractTwoTccLists(allTccs, predList)
			
		
		#Get Expression level for gene.
		print '  Retrieving Expression levels:', cg.getTccListTotalLength(checkList)
		levels = []
		
		
		hPinLevels = stepVectorScan.scanVectorsHist(checkList, cName)
		for hPin in hPinLevels:
			levels.extend(hPinLevels[hPin])
		
			
		cidLevels[CID] = levels
		
	
	
	
	#output levels to file
	print 'Outputting to file'
	#find longest
	longest = 0
	for CID in cidLevels:
		length = len(cidLevels[CID])
		if length > longest:
			longest = length
	
	sortedKeys = cidLevels.keys()
	sortedKeys.sort()
	#print sortedKeys
	
	newLines = []
	for j in range(0, longest): #how many lines are there
		newLine = []
		for CID in sortedKeys:
			if len(cidLevels[CID]) > j:# add it
				newLine.append(str(cidLevels[CID][j]))
			else:
				newLine.append('NA')
	
		newLines.append('\t'.join(newLine) + '\n')
	
	outFileN = conf.conf['exonNoiseData']
	outFile = open(outFileN, 'w')
	outFile.write('\t'.join(sortedKeys) + '\n')
	outFile.writelines(newLines)
	outFile.close()