def getSplicingUnitLengths(tranFN, wigDir, chrom, strand):
    '''Print how many distinct positions are covered by coding exons.

    Reads every line of the tab-separated transcript file (the chrom/strand
    filter is disabled, so all chromosomes and strands are pooled).  For each
    transcript whose field 13 (0-based) contains '_coding', the exon
    intervals, minus the flanks on either side of the coding region, are
    expanded to single coordinates and stored in one set per chromosome and
    strand, so a position shared by several transcripts is counted once.
    Two summary lines are printed; nothing is returned.

    Args:
        tranFN: path of the transcript table.  Fields read (0-based):
            1 chromosome, 2 strand, 3/4 transcript start/end,
            5/6 coding start/end, 8/9 comma-terminated exon starts/ends,
            13 transcript type.
        wigDir, chrom, strand: not used for the totals; kept so existing
            callers keep working.

    NOTE(review): this name is defined more than once in this file; at
    import time the last definition wins.
    '''
    exonChr_strand_coord = {}
    # Intron positions are deliberately not collected (that code path was
    # disabled), so this stays empty and the intron total prints as 0.
    intronChr_strand_coord = {}

    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            if '_coding' not in ls[13]:
                continue  # only coding transcripts contribute to the totals

            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            # End columns have 1 subtracted (presumably half-open ends made
            # inclusive -- confirm against the table format).
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # [:-1] drops the trailing comma of the exon lists
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))

            # Flank before / after the coding region; () marks "no flank".
            if cStart == tStart or cStart == tEnd + 1:
                upstream = ()
            else:
                upstream = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                downstream = ()
            else:
                downstream = (cEnd + 1, tEnd)

            # Orient by the transcript's own strand: every strand is
            # processed here, so the strand argument is not meaningful.
            if tStrand == '1':
                range5, range3 = upstream, downstream
            else:
                range5, range3 = downstream, upstream

            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            coords = exonChr_strand_coord.setdefault(
                tChrom, {}).setdefault(tStrand, set())
            for pair in exonPairs:
                coords.update(xrange(pair[0], pair[1] + 1))

    iLength = sum(len(coords)
                  for strands in intronChr_strand_coord.values()
                  for coords in strands.values())
    eLength = sum(len(coords)
                  for strands in exonChr_strand_coord.values()
                  for coords in strands.values())

    # Single-argument print() emits the same text on Python 2 and 3.
    print('total Exon Length (all exons overlapped) %s' % eLength)
    print('total intron Length (all introns overlapped) %s' % iLength)
def makeContextDB(tranFN, chrom, strand, outFN):
    '''outputs cID TYPE TCC, used for making wigs

    For every transcript in tranFN that lies on the requested chromosome and
    strand, one tab-separated row "cID, TYPE, TCC" is written to outFN for
    each 5'UTR piece, exon (UTR flanks removed), intron and 3'UTR piece, in
    that order.  TYPE is prefixed 'C_' when field 13 (0-based) of the
    transcript line contains '_coding' and 'NC_' otherwise.  cID is a running
    counter starting at 0.

    Args:
        tranFN: path of the tab-separated transcript table.  Fields read
            (0-based): 1 chromosome, 2 strand, 3/4 transcript start/end,
            5/6 coding start/end, 8/9 comma-terminated exon starts/ends,
            13 transcript type.
        chrom: chromosome to keep.
        strand: strand to keep, compared with the value returned by
            bioLibCG.switchStrandFormat for field 2.
        outFN: path of the output file (overwritten).

    NOTE(review): this name is defined more than once in this file; at
    import time the last definition wins.
    '''
    cID = 0
    # "with" closes both files even if a malformed line raises.
    with open(tranFN, 'r') as f, open(outFN, 'w') as fOut:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            # End columns have 1 subtracted (presumably half-open ends made
            # inclusive -- confirm against the table format).
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # [:-1] drops the trailing comma of the exon lists
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))
            codingStatus = '_coding' in ls[13]

            # An intron is the gap between two consecutive exons.
            intronPairs = [(prev[1] + 1, nxt[0] - 1)
                           for prev, nxt in zip(exonPairs, exonPairs[1:])]

            # Flank before / after the coding region; () marks "no flank".
            if cStart == tStart or cStart == tEnd + 1:
                upstream = ()
            else:
                upstream = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                downstream = ()
            else:
                downstream = (cEnd + 1, tEnd)
            if strand == '1':
                range5, range3 = upstream, downstream
            else:
                range5, range3 = downstream, upstream

            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            prefix = 'C_' if codingStatus else 'NC_'
            features = (
                (utr5, '5UTR'),
                (exonPairs, 'EXON'),
                (intronPairs, 'INTRON'),
                (utr3, '3UTR'),
            )
            for pairs, kind in features:
                for pair in pairs:
                    # Both ends are shifted by +1 before building the TCC
                    # (presumably 0-based -> 1-based; confirm with makeTcc).
                    myTcc = bioLibCG.makeTcc(chrom, strand,
                                             pair[0] + 1, pair[1] + 1)
                    fOut.write('\t'.join([str(cID), prefix + kind, myTcc])
                               + '\n')
                    # NOTE(review): one ID per written row is assumed; the
                    # original indentation of this increment was lost.
                    cID += 1
def getSplicingUnitOccupancy(tranFN, wigDir1, wigDir2, chrom, strand, maxCut):
    '''get the number of spots in each data set, and the number that overlap

    Two wig data sets are loaded for chrom: wigDir1 on strand and wigDir2 on
    the opposite strand (original author's note: wigDir2 has to be hela cuz
    strand flip).  Every distinct coding-exon position (UTR flanks removed)
    of the transcripts on chrom/strand is visited once.  For each cut in
    1 .. maxCut-1 the function counts the positions whose value equals cut
    in set 1, in set 2, and in both.

    One tab-separated line is printed per cut:
        num1, num2, numOverlap, "chrom:strand", cut
    Nothing is returned.

    Args:
        tranFN: path of the tab-separated transcript table.  Fields read
            (0-based): 1 chromosome, 2 strand, 3/4 transcript start/end,
            5/6 coding start/end, 8/9 comma-terminated exon starts/ends,
            13 transcript type.
        wigDir1, wigDir2: locations passed to cgWig.loadSingleWig.
        chrom, strand: chromosome and strand to analyse.
        maxCut: exclusive upper bound of the cuts; converted with int().

    NOTE(review): this name is defined more than once in this file; at
    import time the last definition wins.
    '''
    maxCut = int(maxCut)
    oppStrand = bioLibCG.switchStrand(strand)
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, 'ALL')
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, 'ALL')

    covered = set()
    # cut -> [num1, num2, numOverlap]
    cutoff_overlap = dict((i, [0, 0, 0]) for i in range(maxCut))
    cuts = range(1, maxCut)

    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue
            if '_coding' not in ls[13]:
                continue  # only coding exons are counted

            # End columns have 1 subtracted (presumably half-open ends made
            # inclusive -- confirm against the table format).
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # [:-1] drops the trailing comma of the exon lists
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))

            # Flank before / after the coding region; () marks "no flank".
            if cStart == tStart or cStart == tEnd + 1:
                upstream = ()
            else:
                upstream = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                downstream = ()
            else:
                downstream = (cEnd + 1, tEnd)
            if strand == '1':
                range5, range3 = upstream, downstream
            else:
                range5, range3 = downstream, upstream

            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            for pair in exonPairs:
                for i in xrange(pair[0], pair[1] + 1):
                    if i in covered:
                        continue  # multiple transcripts share exons
                    covered.add(i)
                    val1 = coord_value1.get(i, 0)
                    val2 = coord_value2.get(i, 0)
                    for cut in cuts:
                        # Exact-value bins; use >= here for cumulative
                        # cutoffs instead.
                        in1 = (val1 == cut)
                        in2 = (val2 == cut)
                        if in1:
                            cutoff_overlap[cut][0] += 1
                        if in2:
                            cutoff_overlap[cut][1] += 1
                        if in1 and in2:
                            cutoff_overlap[cut][2] += 1

    for cut in cuts:
        row = cutoff_overlap[cut] + ['%s:%s' % (chrom, strand), cut]
        # Single-argument print() emits the same text on Python 2 and 3.
        print('\t'.join([str(x) for x in row]))
def makeContextDB(tranFN, chrom, strand, outFN):
    '''outputs cID TYPE TCC, used for making wigs

    For every transcript in tranFN that lies on the requested chromosome and
    strand, one tab-separated row "cID, TYPE, TCC" is written to outFN for
    each 5'UTR piece, exon (UTR flanks removed), intron and 3'UTR piece, in
    that order.  TYPE is prefixed 'C_' when field 13 (0-based) of the
    transcript line contains '_coding' and 'NC_' otherwise.  cID is a running
    counter starting at 0.

    Args:
        tranFN: path of the tab-separated transcript table.  Fields read
            (0-based): 1 chromosome, 2 strand, 3/4 transcript start/end,
            5/6 coding start/end, 8/9 comma-terminated exon starts/ends,
            13 transcript type.
        chrom: chromosome to keep.
        strand: strand to keep, compared with the value returned by
            bioLibCG.switchStrandFormat for field 2.
        outFN: path of the output file (overwritten).

    NOTE(review): this name is defined more than once in this file; at
    import time the last definition wins.
    '''
    cID = 0
    # "with" closes both files even if a malformed line raises.
    with open(tranFN, 'r') as f, open(outFN, 'w') as fOut:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            # End columns have 1 subtracted (presumably half-open ends made
            # inclusive -- confirm against the table format).
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # [:-1] drops the trailing comma of the exon lists
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))
            codingStatus = '_coding' in ls[13]

            # An intron is the gap between two consecutive exons.
            intronPairs = [(prev[1] + 1, nxt[0] - 1)
                           for prev, nxt in zip(exonPairs, exonPairs[1:])]

            # Flank before / after the coding region; () marks "no flank".
            if cStart == tStart or cStart == tEnd + 1:
                upstream = ()
            else:
                upstream = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                downstream = ()
            else:
                downstream = (cEnd + 1, tEnd)
            if strand == '1':
                range5, range3 = upstream, downstream
            else:
                range5, range3 = downstream, upstream

            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            prefix = 'C_' if codingStatus else 'NC_'
            features = (
                (utr5, '5UTR'),
                (exonPairs, 'EXON'),
                (intronPairs, 'INTRON'),
                (utr3, '3UTR'),
            )
            for pairs, kind in features:
                for pair in pairs:
                    # Both ends are shifted by +1 before building the TCC
                    # (presumably 0-based -> 1-based; confirm with makeTcc).
                    myTcc = bioLibCG.makeTcc(chrom, strand,
                                             pair[0] + 1, pair[1] + 1)
                    fOut.write('\t'.join([str(cID), prefix + kind, myTcc])
                               + '\n')
                    # NOTE(review): one ID per written row is assumed; the
                    # original indentation of this increment was lost.
                    cID += 1
def getSplicingUnitLengths(tranFN, wigDir, chrom, strand):
    '''Print how many distinct positions are covered by coding exons.

    Reads every line of the tab-separated transcript file (the chrom/strand
    filter is disabled, so all chromosomes and strands are pooled).  For each
    transcript whose field 13 (0-based) contains '_coding', the exon
    intervals, minus the flanks on either side of the coding region, are
    expanded to single coordinates and stored in one set per chromosome and
    strand, so a position shared by several transcripts is counted once.
    Two summary lines are printed; nothing is returned.

    Args:
        tranFN: path of the transcript table.  Fields read (0-based):
            1 chromosome, 2 strand, 3/4 transcript start/end,
            5/6 coding start/end, 8/9 comma-terminated exon starts/ends,
            13 transcript type.
        wigDir, chrom, strand: not used for the totals; kept so existing
            callers keep working.

    NOTE(review): this name is defined more than once in this file; at
    import time the last definition wins.
    '''
    exonChr_strand_coord = {}
    # Intron positions are deliberately not collected (that code path was
    # disabled), so this stays empty and the intron total prints as 0.
    intronChr_strand_coord = {}

    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            if '_coding' not in ls[13]:
                continue  # only coding transcripts contribute to the totals

            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            # End columns have 1 subtracted (presumably half-open ends made
            # inclusive -- confirm against the table format).
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # [:-1] drops the trailing comma of the exon lists
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))

            # Flank before / after the coding region; () marks "no flank".
            if cStart == tStart or cStart == tEnd + 1:
                upstream = ()
            else:
                upstream = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                downstream = ()
            else:
                downstream = (cEnd + 1, tEnd)

            # Orient by the transcript's own strand: every strand is
            # processed here, so the strand argument is not meaningful.
            if tStrand == '1':
                range5, range3 = upstream, downstream
            else:
                range5, range3 = downstream, upstream

            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            coords = exonChr_strand_coord.setdefault(
                tChrom, {}).setdefault(tStrand, set())
            for pair in exonPairs:
                coords.update(xrange(pair[0], pair[1] + 1))

    iLength = sum(len(coords)
                  for strands in intronChr_strand_coord.values()
                  for coords in strands.values())
    eLength = sum(len(coords)
                  for strands in exonChr_strand_coord.values()
                  for coords in strands.values())

    # Single-argument print() emits the same text on Python 2 and 3.
    print('total Exon Length (all exons overlapped) %s' % eLength)
    print('total intron Length (all introns overlapped) %s' % iLength)
def makeContextWig(tranFN, wigDir, chrom, strand, species='hg19'):
    '''Label every transcribed position of chrom/strand with its context.

    For each transcript in tranFN on the requested chromosome and strand,
    the positions of its 5'UTR pieces, exons (UTR flanks removed), introns
    and 3'UTR pieces are tagged '5UTR', 'EXON', 'INTRON' or '3UTR', prefixed
    'C_' when field 13 (0-based) of the line contains '_coding' and 'NC_'
    otherwise.  Tags collected at one coordinate are de-duplicated and
    comma-joined, and the resulting coordinate -> tags mapping is handed to
    writeWigDictToWig with the name 'context'.

    Args:
        tranFN: path of the tab-separated transcript table.  Fields read
            (0-based): 0 transcript ID, 1 chromosome, 2 strand,
            3/4 transcript start/end, 5/6 coding start/end,
            8/9 comma-terminated exon starts/ends, 13 transcript type.
        wigDir: output location passed to writeWigDictToWig.
        chrom, strand: chromosome and strand to process.
        species: passed through to writeWigDictToWig.

    NOTE(review): this name is defined more than once in this file; at
    import time the last definition wins.
    '''
    p = bioLibCG.cgPrint()
    coord_id = {}
    debugSpot = 23631989  # hard-coded coordinate reported through p.tell

    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            # End columns have 1 subtracted (presumably half-open ends made
            # inclusive -- confirm against the table format).
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # [:-1] drops the trailing comma of the exon lists
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))
            codingStatus = '_coding' in ls[13]
            tID = ls[0]

            # debug
            p.show = False

            # An intron is the gap between two consecutive exons.
            intronPairs = [(prev[1] + 1, nxt[0] - 1)
                           for prev, nxt in zip(exonPairs, exonPairs[1:])]

            # Flank before / after the coding region; () marks "no flank".
            if cStart == tStart or cStart == tEnd + 1:
                upstream = ()
            else:
                upstream = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                downstream = ()
            else:
                downstream = (cEnd + 1, tEnd)
            if strand == '1':
                range5, range3 = upstream, downstream
            else:
                range5, range3 = downstream, upstream
            if not range5:
                p.tell('5 is none')
            if not range3:
                p.tell('3 is none')

            p.tell('ranges', range5, range3)
            p.tell('intronRange', intronPairs)
            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)
            p.tell('utr', utr5, utr3)
            p.tell('exon before', exonPairs)
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])
            p.tell('exon after', exonPairs)

            prefix = 'C_' if codingStatus else 'NC_'
            # (intervals, per-interval message, debug-spot message, tag)
            fillPlan = (
                (utr5, 'filling utr5', '*** 5UTR', '5UTR'),
                (exonPairs, 'filling exons', '*** exon', 'EXON'),
                (intronPairs, 'filling introns', '*** INTRON', 'INTRON'),
                (utr3, 'filling utr3', ' *** 3UTR', '3UTR'),
            )
            for pairs, fillMsg, spotMsg, kind in fillPlan:
                # Trailing space separates tags until the final pass below.
                label = prefix + kind + ' '
                for pair in pairs:
                    p.tell(fillMsg, pair[0], pair[1])
                    for i in xrange(pair[0], pair[1] + 1):
                        if i == debugSpot:
                            p.tell(spotMsg, codingStatus, tID)
                        coord_id[i] = coord_id.get(i, '') + label

            p.show = False

    # uniqify, stringify (only existing keys are reassigned, so iterating
    # the dict while updating it is safe)
    for i in coord_id:
        coord_id[i] = ','.join(set(coord_id[i].strip().split(' ')))

    # write wig to file
    writeWigDictToWig(coord_id, chrom, strand, species, 'context', wigDir,
                      'INTER')
def getSplicingUnitOccupancy(tranFN, wigDir1, wigDir2, chrom, strand, maxCut):
    '''get the number of spots in each data set, and the number that overlap

    Two wig data sets are loaded for chrom: wigDir1 on strand and wigDir2 on
    the opposite strand (original author's note: wigDir2 has to be hela cuz
    strand flip).  Every distinct coding-exon position (UTR flanks removed)
    of the transcripts on chrom/strand is visited once.  For each cut in
    1 .. maxCut-1 the function counts the positions whose value equals cut
    in set 1, in set 2, and in both.

    One tab-separated line is printed per cut:
        num1, num2, numOverlap, "chrom:strand", cut
    Nothing is returned.

    Args:
        tranFN: path of the tab-separated transcript table.  Fields read
            (0-based): 1 chromosome, 2 strand, 3/4 transcript start/end,
            5/6 coding start/end, 8/9 comma-terminated exon starts/ends,
            13 transcript type.
        wigDir1, wigDir2: locations passed to cgWig.loadSingleWig.
        chrom, strand: chromosome and strand to analyse.
        maxCut: exclusive upper bound of the cuts; converted with int().

    NOTE(review): this name is defined more than once in this file; at
    import time the last definition wins.
    '''
    maxCut = int(maxCut)
    oppStrand = bioLibCG.switchStrand(strand)
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, 'ALL')
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, 'ALL')

    covered = set()
    # cut -> [num1, num2, numOverlap]
    cutoff_overlap = dict((i, [0, 0, 0]) for i in range(maxCut))
    cuts = range(1, maxCut)

    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue
            if '_coding' not in ls[13]:
                continue  # only coding exons are counted

            # End columns have 1 subtracted (presumably half-open ends made
            # inclusive -- confirm against the table format).
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # [:-1] drops the trailing comma of the exon lists
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))

            # Flank before / after the coding region; () marks "no flank".
            if cStart == tStart or cStart == tEnd + 1:
                upstream = ()
            else:
                upstream = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                downstream = ()
            else:
                downstream = (cEnd + 1, tEnd)
            if strand == '1':
                range5, range3 = upstream, downstream
            else:
                range5, range3 = downstream, upstream

            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            for pair in exonPairs:
                for i in xrange(pair[0], pair[1] + 1):
                    if i in covered:
                        continue  # multiple transcripts share exons
                    covered.add(i)
                    val1 = coord_value1.get(i, 0)
                    val2 = coord_value2.get(i, 0)
                    for cut in cuts:
                        # Exact-value bins; use >= here for cumulative
                        # cutoffs instead.
                        in1 = (val1 == cut)
                        in2 = (val2 == cut)
                        if in1:
                            cutoff_overlap[cut][0] += 1
                        if in2:
                            cutoff_overlap[cut][1] += 1
                        if in1 and in2:
                            cutoff_overlap[cut][2] += 1

    for cut in cuts:
        row = cutoff_overlap[cut] + ['%s:%s' % (chrom, strand), cut]
        # Single-argument print() emits the same text on Python 2 and 3.
        print('\t'.join([str(x) for x in row]))
def createCollapsedGeneSets(tranFN, outFN, acceptableTypes='EXON',
                            onlyCoding=True):
    '''get areas occupied by all transcripts in a gene

    All transcripts of tranFN are grouped by gene name (field 10, 0-based).
    For every gene the intervals of the accepted feature types are merged
    into one IntervalSet.  Genes whose transcripts sit on more than one
    (chromosome, strand) combination are dropped and reported on stdout as
    "<gene> <info> FAILED".  Each remaining gene that has at least one
    interval becomes one tab-separated line in outFN:
        geneName, chrom, strand, comma-joined starts, comma-joined ends

    Args:
        tranFN: path of the tab-separated transcript table.  Fields read
            (0-based): 1 chromosome, 2 strand, 3/4 transcript start/end,
            5/6 coding start/end, 8/9 comma-terminated exon starts/ends,
            10 gene name, 13 transcript type.
        outFN: path of the output file (overwritten).
        acceptableTypes: comma-separated subset of 'EXON', 'INTRON',
            '5UTR', '3UTR'; whitespace around each entry is ignored.
        onlyCoding: when true, only transcripts whose field 13 contains
            '_coding' contribute intervals.
    '''
    wantedTypes = set(t.strip() for t in acceptableTypes.split(','))
    geneName_intervalSet = {}
    geneName_info = {}

    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            codingStatus = '_coding' in ls[13]
            geneName = ls[10]

            # Location is recorded for every transcript, coding or not, so
            # the consistency check below sees all of them.
            geneName_info.setdefault(geneName, set()).add((tChrom, tStrand))
            if onlyCoding and not codingStatus:
                continue

            # End columns have 1 subtracted (presumably half-open ends made
            # inclusive -- confirm against the table format).
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # [:-1] drops the trailing comma of the exon lists
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))

            # An intron is the gap between two consecutive exons.
            intronPairs = [(prev[1] + 1, nxt[0] - 1)
                           for prev, nxt in zip(exonPairs, exonPairs[1:])]

            # Flank before / after the coding region; () marks "no flank".
            if cStart == tStart or cStart == tEnd + 1:
                upstream = ()
            else:
                upstream = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                downstream = ()
            else:
                downstream = (cEnd + 1, tEnd)
            if tStrand == '1':
                range5, range3 = upstream, downstream
            else:
                range5, range3 = downstream, upstream

            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            features = (
                (exonPairs, 'EXON'),
                (intronPairs, 'INTRON'),
                (utr5, '5UTR'),
                (utr3, '3UTR'),
            )
            for pairs, kind in features:
                if kind not in wantedTypes:
                    continue
                for pair in pairs:
                    # create the gene's set lazily so genes without any
                    # accepted interval produce no output row
                    if geneName not in geneName_intervalSet:
                        geneName_intervalSet[geneName] = IntervalSet()
                    # Upper bound is the pair end + 1 (presumably back to a
                    # half-open end -- confirm with the Interval class).
                    geneName_intervalSet[geneName].add(
                        Interval(pair[0], pair[1] + 1))

    for geneName, info in geneName_info.items():
        if len(info) > 1:
            # if it spans different chromosomes/strands...
            geneName_intervalSet.pop(geneName, None)
            # NOTE(review): reported once per inconsistent gene; the
            # original indentation of this print was lost -- confirm.
            print('%s %s FAILED' % (geneName, info))

    with open(outFN, 'w') as fOut:
        for geneName, iSet in geneName_intervalSet.items():
            bounds = [(interv.lower_bound, interv.upper_bound)
                      for interv in iSet]
            # Surviving genes have exactly one (chrom, strand) entry.
            gChrom, gStrand = next(iter(geneName_info[geneName]))
            outString = [
                geneName,
                gChrom,
                gStrand,
                ','.join([str(lo) for lo, hi in bounds]),
                ','.join([str(hi) for lo, hi in bounds]),
            ]
            fOut.write('\t'.join([str(x) for x in outString]) + '\n')
def makeContextWig(tranFN, wigDir, chrom, strand, species='hg19'):
    '''Label every transcribed position of chrom/strand with its context.

    For each transcript in tranFN on the requested chromosome and strand,
    the positions of its 5'UTR pieces, exons (UTR flanks removed), introns
    and 3'UTR pieces are tagged '5UTR', 'EXON', 'INTRON' or '3UTR', prefixed
    'C_' when field 13 (0-based) of the line contains '_coding' and 'NC_'
    otherwise.  Tags collected at one coordinate are de-duplicated and
    comma-joined, and the resulting coordinate -> tags mapping is handed to
    writeWigDictToWig with the name 'context'.

    Args:
        tranFN: path of the tab-separated transcript table.  Fields read
            (0-based): 0 transcript ID, 1 chromosome, 2 strand,
            3/4 transcript start/end, 5/6 coding start/end,
            8/9 comma-terminated exon starts/ends, 13 transcript type.
        wigDir: output location passed to writeWigDictToWig.
        chrom, strand: chromosome and strand to process.
        species: passed through to writeWigDictToWig.

    NOTE(review): this name is defined more than once in this file; at
    import time the last definition wins.
    '''
    p = bioLibCG.cgPrint()
    coord_id = {}
    debugSpot = 23631989  # hard-coded coordinate reported through p.tell

    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            # End columns have 1 subtracted (presumably half-open ends made
            # inclusive -- confirm against the table format).
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # [:-1] drops the trailing comma of the exon lists
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))
            codingStatus = '_coding' in ls[13]
            tID = ls[0]

            # debug
            p.show = False

            # An intron is the gap between two consecutive exons.
            intronPairs = [(prev[1] + 1, nxt[0] - 1)
                           for prev, nxt in zip(exonPairs, exonPairs[1:])]

            # Flank before / after the coding region; () marks "no flank".
            if cStart == tStart or cStart == tEnd + 1:
                upstream = ()
            else:
                upstream = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                downstream = ()
            else:
                downstream = (cEnd + 1, tEnd)
            if strand == '1':
                range5, range3 = upstream, downstream
            else:
                range5, range3 = downstream, upstream
            if not range5:
                p.tell('5 is none')
            if not range3:
                p.tell('3 is none')

            p.tell('ranges', range5, range3)
            p.tell('intronRange', intronPairs)
            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)
            p.tell('utr', utr5, utr3)
            p.tell('exon before', exonPairs)
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])
            p.tell('exon after', exonPairs)

            prefix = 'C_' if codingStatus else 'NC_'
            # (intervals, per-interval message, debug-spot message, tag)
            fillPlan = (
                (utr5, 'filling utr5', '*** 5UTR', '5UTR'),
                (exonPairs, 'filling exons', '*** exon', 'EXON'),
                (intronPairs, 'filling introns', '*** INTRON', 'INTRON'),
                (utr3, 'filling utr3', ' *** 3UTR', '3UTR'),
            )
            for pairs, fillMsg, spotMsg, kind in fillPlan:
                # Trailing space separates tags until the final pass below.
                label = prefix + kind + ' '
                for pair in pairs:
                    p.tell(fillMsg, pair[0], pair[1])
                    for i in xrange(pair[0], pair[1] + 1):
                        if i == debugSpot:
                            p.tell(spotMsg, codingStatus, tID)
                        coord_id[i] = coord_id.get(i, '') + label

            p.show = False

    # uniqify, stringify (only existing keys are reassigned, so iterating
    # the dict while updating it is safe)
    for i in coord_id:
        coord_id[i] = ','.join(set(coord_id[i].strip().split(' ')))

    # write wig to file
    writeWigDictToWig(coord_id, chrom, strand, species, 'context', wigDir,
                      'INTER')