def generateSubpeakFASTA(TFtoEnhancerDict, subpeaks, genomeDirectory, projectName, projectFolder, constExtension):
    '''
    from a BED file of constituents
    generate a FASTA for the constituents contained within the candidate supers

    TFtoEnhancerDict: dict mapping a gene name (string) to the regions that
        are looked up in the constituent collection
    subpeaks: path to a tab-delimited constituent table; the first three
        columns are read as chrom, start, end
    genomeDirectory: handed unchanged to utils.fetchSeq
    projectName, projectFolder: concatenated as-is to build the output paths
        (presumably projectFolder must end with a path separator - confirm
        against callers)
    constExtension: value passed twice to utils.makeSearchLocus for every
        overlapping constituent

    writes <projectFolder><projectName>_subpeaks.bed and
    <projectFolder><projectName>_SUBPEAKS.fa and returns None
    '''
    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]

    subpeakTable = utils.parseTable(subpeaks, '\t')
    subpeakLoci = [utils.Locus(row[0], int(row[1]), int(row[2]), '.') for row in subpeakTable]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    # for every gene collect the extended, stitched constituents of its regions
    subpeakDict = {}
    for gene, regions in TFtoEnhancerDict.items():
        geneSubpeaks = []
        for region in regions:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [utils.makeSearchLocus(x, constExtension, constExtension) for x in overlaps]

            # stitching merges the extended constituents of this region
            stitchedCollection = utils.LocusCollection(extendedOverlaps, 50).stitchCollection()
            for overlap in stitchedCollection.getLoci():
                subpeakBED.append([overlap.chr(), overlap.start(), overlap.end()])
                geneSubpeaks.append(overlap)
        subpeakDict[gene] = geneSubpeaks

    bedfilename = projectFolder + projectName + '_subpeaks.bed'
    utils.unParseTable(subpeakBED, bedfilename, '\t')

    # two FASTA entries per subpeak: a '>' title line followed by the sequence
    fasta = []
    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:
            fastaTitle = '|'.join([gene, subpeak.chr(), str(subpeak.start()), str(subpeak.end())])
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(),
                                       int(subpeak.start() + 1), int(subpeak.end() + 1))
            fasta.append('>' + fastaTitle)
            # str.upper() instead of the bare upper() helper, which only exists
            # through the python 2 string module
            fasta.append(fastaLine.upper())

    outname = projectFolder + projectName + '_SUBPEAKS.fa'
    utils.unParseTable(fasta, outname, '')
def generateSubpeakFASTA(TFandSuperDict, subpeaks, genomeDirectory, projectName, projectFolder, motifExtension):
    '''
    takes as input a BED file of constituents
    outputs a FASTA file of merged extended super-enhancer constituents
    and associated formatted name

    TFandSuperDict: dict mapping a gene name (string) to the regions that are
        looked up in the constituent collection
    subpeaks: path to a tab-delimited constituent table; the first three
        columns are read as chrom, start, end
    genomeDirectory: handed unchanged to utils.fetchSeq
    projectName, projectFolder: concatenated as-is to build the output paths
        (presumably projectFolder must end with a path separator - confirm
        against callers)
    motifExtension: value passed twice to utils.makeSearchLocus for every
        overlapping constituent

    writes <projectFolder><projectName>_subpeaks.bed and
    <projectFolder><projectName>_SUBPEAKS.fa and returns None
    '''
    # parenthesised form prints the same text under python 2 and python 3
    print('MAKE FASTA')

    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]

    subpeakTable = utils.parseTable(subpeaks, '\t')
    subpeakLoci = [utils.Locus(row[0], int(row[1]), int(row[2]), '.') for row in subpeakTable]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    # for every gene collect the extended, stitched constituents of its regions
    subpeakDict = {}
    for gene, regions in TFandSuperDict.items():
        geneSubpeaks = []
        for region in regions:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [utils.makeSearchLocus(x, motifExtension, motifExtension) for x in overlaps]

            # stitching merges the extended constituents of this region
            stitchedCollection = utils.LocusCollection(extendedOverlaps, 50).stitchCollection()
            for overlap in stitchedCollection.getLoci():
                subpeakBED.append([overlap.chr(), overlap.start(), overlap.end()])
                geneSubpeaks.append(overlap)
        subpeakDict[gene] = geneSubpeaks

    bedfilename = projectFolder + projectName + '_subpeaks.bed'
    utils.unParseTable(subpeakBED, bedfilename, '\t')

    # two FASTA entries per subpeak: a '>' title line followed by the sequence
    fasta = []
    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:
            fastaTitle = '|'.join([gene, subpeak.chr(), str(subpeak.start()), str(subpeak.end())])
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(),
                                       int(subpeak.start() + 1), int(subpeak.end() + 1))
            fasta.append('>' + fastaTitle)
            # str.upper() instead of the bare upper() helper, which only exists
            # through the python 2 string module
            fasta.append(fastaLine.upper())

    # Output the fasta file of extended SE constituents
    outname = projectFolder + projectName + '_SUBPEAKS.fa'
    utils.unParseTable(fasta, outname, '')
def generateSubpeakFASTA(gene_to_enhancer_dict, subpeaks, genome, projectName, projectFolder, constExtension):
    '''
    from a BED file of constituents
    generate a FASTA for the constituents contained within the candidate supers

    gene_to_enhancer_dict: dict mapping a gene name (string) to the regions
        that are looked up in the constituent collection
    subpeaks: path to a tab-delimited constituent table; the first three
        columns are read as chrom, start, end
    genome: object whose directory() result is handed to utils.fetchSeq
    projectName: used in the BED track header line
    projectFolder: not used by this variant; kept so the signature stays
        compatible with callers
    constExtension: value passed twice to utils.makeSearchLocus for every
        overlapping constituent

    returns (subpeakBED, fasta): the BED rows (track line first) and the
    FASTA lines (alternating '>' title and upper-cased sequence); nothing is
    written to disk here
    '''
    genomeDirectory = genome.directory()

    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]

    subpeakTable = utils.parseTable(subpeaks, '\t')
    subpeakLoci = [utils.Locus(row[0], int(row[1]), int(row[2]), '.') for row in subpeakTable]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    # for every gene collect the extended, stitched constituents of its regions
    subpeakDict = {}
    for gene, regions in gene_to_enhancer_dict.items():
        geneSubpeaks = []
        for region in regions:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [utils.makeSearchLocus(x, constExtension, constExtension) for x in overlaps]

            # stitching merges the extended constituents of this region
            stitchedCollection = utils.LocusCollection(extendedOverlaps, 50).stitchCollection()
            for overlap in stitchedCollection.getLoci():
                subpeakBED.append([overlap.chr(), overlap.start(), overlap.end()])
                geneSubpeaks.append(overlap)
        subpeakDict[gene] = geneSubpeaks

    # two FASTA entries per subpeak: a '>' title line followed by the sequence
    fasta = []
    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:
            fastaTitle = '|'.join([gene, subpeak.chr(), str(subpeak.start()), str(subpeak.end())])
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(),
                                       int(subpeak.start() + 1), int(subpeak.end() + 1))
            fasta.append('>' + fastaTitle)
            # string.upper() was removed in python 3; the str method works everywhere
            fasta.append(fastaLine.upper())

    return subpeakBED, fasta
def makePeakTable(paramDict, splitGFFPath, averageTablePath, startDict, geneList, genomeDirectory, tads_path=''):
    '''
    makes the final peak table with ebox info

    paramDict: dict; only paramDict['cpgPath'] (tab-delimited CpG island
        table) is read here
    splitGFFPath: path to the tab-delimited region gff; columns 0, 1, 3, 4
        and 7 are read as chrom, region id, start, stop and TSS status
    averageTablePath: path to the tab-delimited signal table; row i + 1 is
        taken as the signal of gff row i (i.e. one header row is assumed) and
        columns 2 onwards are converted to float
    startDict: dict keyed by gene id; startDict[id]['name'] is the reported
        gene name; also handed to utils.makeTSSLocus
    geneList: gene ids to consider; when empty every key of startDict is used
    genomeDirectory: handed unchanged to utils.fetchSeq
    tads_path: optional path loaded through utils.importBoundRegion; when
        given, gene assignment of a region is restricted to genes whose
        1kb TSS window overlaps a TAD that also overlaps the region (regions
        without any such gene fall back to the unrestricted assignment)

    returns the peak table as a list of rows, header row first
    exits through sys.exit() if a region yields an empty sequence
    '''
    peakTable = [[
        'REGION_ID', 'CHROM', 'START', 'STOP', 'LENGTH', 'TSS', 'CPG',
        'CPG_FRACTION', 'GC_FREQ', 'SIGNAL', 'CANON_EBOX_COUNT',
        'NON_CANON_EBOX_COUNT', 'TOTAL_EBOX_COUNT', 'OVERLAPPING_GENES',
        'PROXIMAL_GENES'
    ]]

    print('LOADING PEAK REGIONS')
    peakGFF = utils.parseTable(splitGFFPath, '\t')

    print('LOADING BINDING DATA')
    signalTable = utils.parseTable(averageTablePath, '\t')

    print('LOADING CPGS ISLANDS')
    cpgBed = utils.parseTable(paramDict['cpgPath'], '\t')
    cpgLoci = [utils.Locus(line[0], line[1], line[2], '.', line[-1]) for line in cpgBed]
    cpgCollection = utils.LocusCollection(cpgLoci, 50)

    print("MAKING TSS COLLECTIONS")
    if len(geneList) == 0:
        geneList = startDict.keys()

    tss_1kb_loci = []
    tss_50kb_loci = []
    for refID in geneList:
        tss_1kb_loci.append(utils.makeTSSLocus(refID, startDict, 1000, 1000))
        tss_50kb_loci.append(utils.makeTSSLocus(refID, startDict, 50000, 50000))

    #make a 1kb flanking and 50kb flanking collection
    tss_1kb_collection = utils.LocusCollection(tss_1kb_loci, 50)
    tss_50kb_collection = utils.LocusCollection(tss_50kb_loci, 50)

    use_tads = len(tads_path) > 0
    if use_tads:
        print('LOADING TADS FROM %s' % (tads_path))
        tad_collection = utils.importBoundRegion(tads_path, 'tad')

        #building a tad dict keyed by tad ID w/ genes in that tad provided
        tad_dict = defaultdict(list)
        for tss_locus in tss_1kb_loci:
            for tad_locus in tad_collection.getOverlap(tss_locus, 'both'):
                tad_dict[tad_locus.ID()].append(tss_locus.ID())

    print('CLASSIFYING PEAKS')
    tad_assigned_count = 0
    for i, gffLine in enumerate(peakGFF):
        if i % 1000 == 0:
            print(i)

        #getting the particulars of the region
        peakID = gffLine[1]
        chrom = gffLine[0]
        start = int(gffLine[3])
        stop = int(gffLine[4])
        lineLocus = utils.Locus(chrom, start, stop, '.', peakID)

        #getting the mapped signal (signal table row i + 1 pairs with gff row i)
        signalLine = signalTable[i + 1]
        signalVector = [float(x) for x in signalLine[2:]]

        #setting up the new line
        newLine = [peakID, chrom, start, stop, lineLocus.len()]

        #get the tss status from the gff itself
        newLine.append(gffLine[7])

        #check cpg status; the overlap is queried once and reused for the fraction
        overlappingCpGLoci = cpgCollection.getOverlap(lineLocus, 'both')
        newLine.append(1 if overlappingCpGLoci else 0)

        #now do fractional cpgOverlap
        overlappingBases = 0
        for locus in overlappingCpGLoci:
            cpgStart = max(locus.start(), lineLocus.start())
            cpgEnd = min(locus.end(), lineLocus.end())
            overlappingBases += (cpgEnd - cpgStart)
        overlapFraction = float(overlappingBases) / lineLocus.len()
        newLine.append(round(overlapFraction, 2))

        #now get the seq
        #str.upper() replaces string.upper(), which no longer exists in python 3
        lineSeq = utils.fetchSeq(genomeDirectory, chrom, start, stop, True).upper()
        if len(lineSeq) == 0:
            print('UH OH')
            print(lineSeq)
            print(gffLine)
            print(i)
            print(chrom)
            print(start)
            print(stop)
            sys.exit()

        #NOTE: this is the frequency of the GC and CG dinucleotides, not G+C content
        gcFreq = float(lineSeq.count('GC') + lineSeq.count('CG')) / len(lineSeq)
        newLine.append(gcFreq)

        #this is where we add the ChIP-Seq signal
        newLine += signalVector

        #ebox counts; all three are 0 when nothing matches
        eboxMatchList = re.findall('CA..TG', lineSeq)
        totalCount = len(eboxMatchList)
        canonCount = eboxMatchList.count('CACGTG')
        newLine += [canonCount, totalCount - canonCount, totalCount]

        #now find the overlapping and proximal genes
        #collect the genes of every tad that overlaps this region
        tad_genes = set()
        if use_tads:
            for tad_locus in tad_collection.getOverlap(lineLocus, 'both'):
                tad_genes.update(tad_dict[tad_locus.ID()])
            if tad_genes:
                tad_assigned_count += 1

        #here each overlapping gene the tss 1kb locus overlaps the peak
        overlappingLoci = tss_1kb_collection.getOverlap(lineLocus, 'both')
        #here the tss 50kb locus overlaps the peak
        proximalLoci = tss_50kb_collection.getOverlap(lineLocus, 'both')
        if tad_genes:
            #only keep genes that share a tad with the region
            overlappingLoci = [locus for locus in overlappingLoci if locus.ID() in tad_genes]
            proximalLoci = [locus for locus in proximalLoci if locus.ID() in tad_genes]

        overlappingGenes = utils.uniquify(
            [startDict[locus.ID()]['name'] for locus in overlappingLoci])

        #overlap takes priority over proximal
        proximalGenes = [startDict[locus.ID()]['name'] for locus in proximalLoci]
        proximalGenes = utils.uniquify(
            [gene for gene in proximalGenes if gene not in overlappingGenes])

        #str.join replaces string.join(), which no longer exists in python 3
        newLine += [','.join(overlappingGenes), ','.join(proximalGenes)]
        peakTable.append(newLine)

    #report the regions that did get tad genes (the previous code printed the
    #number of regions WITHOUT tad genes and counted the header row as a region)
    print('Out of %s regions, %s were assigned to at least 1 tad' %
          (len(peakGFF), tad_assigned_count))
    return peakTable
def makeEboxBeds(gff_path, name='', overwrite=False):
    '''
    makes an ebox bed for the corresponding gff

    gff_path: path to a tab-delimited gff; columns 0, 3 and 4 are read as
        chrom, start and end of each region
    name: prefix of the output files; when empty the gff file name up to its
        first '.' is used
    overwrite: when False and utils.checkOutput reports both bed files, the
        existing paths are returned without recomputing

    relies on the module-level names bedFolder and genomeDirectory, which are
    defined outside this function

    returns (canon_path, non_path): the paths of the canonical (CACGTG) and
    non canonical (any other CA..TG) ebox bed files
    '''
    if len(name) == 0:
        gff_name = gff_path.split('/')[-1].split('.')[0]
    else:
        gff_name = name

    #set output
    canon_path = '%s%s_CANON_EBOX.bed' % (bedFolder, gff_name)
    non_path = '%s%s_NONCANON_EBOX.bed' % (bedFolder, gff_name)

    #check to see if already done
    if not overwrite:
        if utils.checkOutput(canon_path, 0.1, 0.1) and utils.checkOutput(non_path, 0.1, 0.1):
            print('Found bed output at %s and %s' % (canon_path, non_path))
            return canon_path, non_path

    #for each region spit out canonical and non canonical eboxes
    canonBed = []
    nonBed = []

    #open up the repeat filtered gff
    region_gff = utils.parseTable(gff_path, '\t')

    for ticker, line in enumerate(region_gff):
        if ticker % 100 == 0:
            print(ticker)

        chrom = line[0]
        start = int(line[3])
        end = int(line[4])

        #str.upper() replaces string.upper(), which no longer exists in python 3
        seq = utils.fetchSeq(genomeDirectory, chrom, start, end, True).upper()

        #get the canonical starts
        canonStarts = [start + match.start() for match in re.finditer('CACGTG', seq)]

        #get the non canonical starts by filtering the canonicals out of all
        #ebox starts (set lookup instead of list.count per candidate)
        canonStartSet = set(canonStarts)
        nonStarts = [
            start + match.start() for match in re.finditer('CA..TG', seq)
            if (start + match.start()) not in canonStartSet
        ]

        #now fill out the bed
        for ebox_start in canonStarts:
            canonBed.append([chrom, ebox_start, (ebox_start + 6), '.'])
        for ebox_start in nonStarts:
            nonBed.append([chrom, ebox_start, (ebox_start + 6), '.'])

    print('FOUND %s CANONICAL EBOXES' % (len(canonBed)))
    print('FOUND %s NON CANONICAL EBOXES' % (len(nonBed)))

    utils.unParseTable(canonBed, canon_path, '\t')
    utils.unParseTable(nonBed, non_path, '\t')

    return canon_path, non_path
se_id = line[0] se_namelist.append(se_id) subpeak_dict[se_id] = [] se_locus = utils.Locus(line[1], line[2], line[3], '.') overlaps = subpeak_collection.getOverlap(se_locus) for overlap in overlaps: subpeak_dict[se_id].append(overlap) subpeak = overlap fastaTitle = se_id + '|' + subpeak.chr() + '|' + str( subpeak.start()) + '|' + str(subpeak.end()) fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(), int(subpeak.start() + 1), int(subpeak.end() + 1)) fasta.append('>' + fastaTitle) fasta.append(upper(fastaLine)) outname = projectFolder + projectName + '_SUBPEAKS.fa' utils.unParseTable(fasta, outname, '') # call FIMO and find the motifs within each enhancer motifConvertFile = '/ark/home/af661/src/coreTFnetwork/annotations/MotifDictionary.txt' motifDatabaseFile = '/ark/home/af661/src/coreTFnetwork/annotations/VertebratePWMs.txt' motifDatabase = utils.parseTable(motifConvertFile, '\t') motifDatabaseDict = {}
def rank_eboxes(nb_all_chip_dataFile,mycn_gff_path,macsFolder,genomeDirectory,window = 100):
    '''
    uses the conserved MYCN sites and ranks eboxes within them by average
    background subtracted signal
    searches 100bp (window variable) from mycn summits

    nb_all_chip_dataFile: data table path handed to pipeline_dfci.loadDataTable
        and pipeline_dfci.map_regions
    mycn_gff_path: path to the tab-delimited gff of conserved mycn regions
    macsFolder: prefix of the per dataset folders holding <name>_summits.bed
    genomeDirectory: handed unchanged to utils.fetchSeq
    window: number of bases added on each side of every summit (cast to int)

    relies on the module-level names gffFolder and tableFolder, which are
    defined outside this function

    writes the merged summit gff and the ebox rank table and returns the path
    of the ebox rank table
    '''
    window = int(window)

    #bring in the conserved mycn region
    print('making gff of nb mycn summits')
    nb_mycn_gff = utils.parseTable(mycn_gff_path,'\t')
    nb_mycn_collection = utils.gffToLocusCollection(nb_mycn_gff,50)

    dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)
    names_list = sorted(name for name in dataDict if name.count('MYCN') == 1)

    #first makes a gff of all summits +/- 100bp for all nb mycn datasets
    summit_loci = []
    for name in names_list:
        summit_bed_path = '%s%s/%s_summits.bed' % (macsFolder,name,name)
        summit_bed = utils.parseTable(summit_bed_path,'\t')
        for line in summit_bed:
            summit_locus = utils.Locus(line[0],int(line[1])-window,int(line[2])+window,'.',line[3])
            #only keep summits that fall into a conserved mycn region
            if len(nb_mycn_collection.getOverlap(summit_locus)) > 0:
                summit_loci.append(summit_locus)

    summit_collection = utils.LocusCollection(summit_loci,50)
    summit_merged_collection = summit_collection.stitchCollection()

    summit_gff = utils.locusCollectionToGFF(summit_merged_collection)
    summit_gff_path = '%sHG19_NB_MYCN_SUMMITS_-%s_+%s.gff' % (gffFolder,window,window)
    utils.unParseTable(summit_gff,summit_gff_path,'\t')

    #this is borrowed from above and maps chip-seq signal to the gff
    print('mapping to nb mycn summits and making signal dict')
    gffList = [summit_gff_path]
    summit_signal_path = pipeline_dfci.map_regions(nb_all_chip_dataFile,gffList)

    mycnSignalTable = utils.parseTable(summit_signal_path,'\t')

    #making a signal dictionary for MYCN binding
    names_list = ['BE2C_MYCN','KELLY_MYCN','NGP_MYCN','SHEP21_0HR_MYCN_NOSPIKE']
    background_list = [dataDict[name]['background'] for name in names_list]
    header = mycnSignalTable[0]
    chip_columns = [header.index(name) for name in names_list]
    background_columns = [header.index(background_name) for background_name in background_list]

    #keyed by the region string of column 1; value is the mean background
    #subtracted signal multiplied by the region length
    mycn_sig_dict = {}
    for line in mycnSignalTable[1:]:
        line_sig = [
            float(line[chip_col]) - float(line[background_col])
            for chip_col, background_col in zip(chip_columns, background_columns)
        ]
        region_id = line[1]
        coords = [int(x) for x in region_id.split(':')[-1].split('-')]
        line_length = coords[1]-coords[0]
        mycn_sig_dict[region_id] = numpy.mean(line_sig)*line_length

    #now for each region find the eboxes and then add up the signal
    print('making ebox ranking')
    #one representative per reverse complement pair of CANNTG
    ebox_list = ['CACGTG','CAGTTG','CAAGTG','CAGGTG','CAATTG','CAAATG','CATCTG','CAGCTG','CATGTG','CATATG']
    eboxDict = {}
    for ebox in ebox_list:
        eboxDict[ebox] = []

    for ticker, line in enumerate(summit_gff):
        if ticker % 1000 == 0:
            print(ticker)

        chrom = line[0]
        start = int(line[3])
        end = int(line[4])
        region_id = '%s(%s):%s-%s' % (line[0],line[6],line[3],line[4])
        signal = mycn_sig_dict[region_id]

        sequenceLine = utils.fetchSeq(genomeDirectory,chrom,start,end,True)

        #every motif occurrence counts (no per region uniquify)
        for match in re.finditer('CA..TG',sequenceLine.upper()):
            motif = match.group()
            if motif not in eboxDict:
                #'..' also matches ambiguous bases such as N; those motifs have
                #no entry in eboxDict (previously a KeyError), so skip them
                if any(base not in 'ACGT' for base in motif):
                    continue
                motif = utils.revComp(motif)
            eboxDict[motif].append(signal)

    eboxTable = []
    eboxTableOrdered = [['EBOX','OCCURENCES','AVG_HEIGHT']]
    for ebox in eboxDict.keys():
        newLine = [ebox,len(eboxDict[ebox]),numpy.mean(eboxDict[ebox])]
        eboxTable.append(newLine)

    #rows are ordered by decreasing average height
    occurenceOrder = utils.order([line[2] for line in eboxTable],decreasing=True)
    for x in occurenceOrder:
        eboxTableOrdered.append(eboxTable[x])
    print(eboxTableOrdered)

    ebox_outfile = '%sHG19_NB_MYCN_CONSERVED_SUMMITS_-%s_+%s_EBOX_RANK.txt' % (tableFolder,window,window)
    utils.unParseTable(eboxTableOrdered,ebox_outfile,'\t')
    return ebox_outfile
def make_mycn_stats_table(nb_all_chip_dataFile,outFile):
    '''
    making a table of conserved mycn peaks w/ some additional stats
    mycn and h3k27ac signal is avg. background normalized across 4 samples
    active tss defined as the union of all H3K27ac occupied promoters in NB
    active enhancers defined as the union of all H3K27ac sites outside of promoters

    nb_all_chip_dataFile: data table path handed to pipeline_dfci.loadDataTable
    outFile: path the finished table is written to

    relies on names defined outside this function: signalFolder, projectFolder,
    geneListFolder, annotFile, genomeDirectory and nmers

    row i of the mycn rank table is paired with row i of the mycn signal table

    returns outFile
    exits through sys.exit() when a signal table has duplicated identifiers in
    its first column or an enhancer row is too short
    '''
    dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)

    print('SETTING UP OUTPUT TABLE')
    outTable = [['PEAK_ID','CHROM','START','STOP','LENGTH','ACTIVE_TSS_OVERLAP','ENHANCER_OVERLAP','CPG_ISLAND_OVERLAP','CPG_ISLAND_FRACTION','GC_FREQ','MYCN_RANK','AVG_MYCN_SIGNAL','AVG_H3K27AC_SIGNAL','CANON_EBOX_COUNT','NONCANON_EBOX_COUNT','TOTAL_EBOX_COUNT','CANON_EXP','NON_CANON_EXP','GABPA_COUNT','GABPA_EXP','GATA_COUNT','GATA_EXP']]

    dinuc = nmers(2,['A','T','G','C'])

    #input files
    mycnSignalFile = '%sHG19_NB_MYCN_CONSERVED_-0_+0_NB_ALL_SIGNAL.txt' % (signalFolder)
    h3k27acSignalFile = '%sHG19_NB_MYCN_CONSERVED_-500_+500_NB_ALL_SIGNAL.txt' % (signalFolder)
    mycnRankFile = '%smeta_rose/NB_MYCN/NB_MYCN_0KB_STITCHED_ENHANCER_REGION_RANK_CONSERVED.txt' % (projectFolder)
    activeGeneFile = '%sHG19_NB_H3K27AC_ACTIVE_UNION.txt' % (geneListFolder)
    #note, this is the ucsc hg19 cpg islands extended file
    #to download and format run ./beds/download_cpg.sh
    cpgFile = '%sbeds/hg19_cpg_islands.bed' % (projectFolder)
    enhancerFile = '%smeta_rose/NB_H3K27AC/NB_H3K27AC_AllEnhancers.table.txt' % (projectFolder)

    print('LOADING MYCN BINDING DATA')
    mycnSignalTable = utils.parseTable(mycnSignalFile,'\t')

    #making a signal dictionary for MYCN binding
    names_list = ['BE2C_MYCN','KELLY_MYCN','NGP_MYCN','SHEP21_0HR_MYCN_NOSPIKE']
    mycn_sig_dict = _mean_background_subtracted_signal(mycnSignalTable,mycnSignalFile,names_list,dataDict)

    print('LOADING MYCN RANK DATA')
    mycnRankTable = utils.parseTable(mycnRankFile,'\t')

    print('LOADING H3K27AC BINDING DATA')
    h3k27acSignalTable = utils.parseTable(h3k27acSignalFile,'\t')

    #making a signal dictionary for background subtracted H3K27ac binding
    names_list = ['BE2C_H3K27AC','KELLY_H3K27AC','NGP_H3K27AC','SHEP21_0HR_H3K27AC_NOSPIKE']
    h3k27ac_sig_dict = _mean_background_subtracted_signal(h3k27acSignalTable,h3k27acSignalFile,names_list,dataDict)

    #making the cpg collection
    print('LOADING CPGS ISLANDS')
    cpgBed = utils.parseTable(cpgFile,'\t')
    cpgLoci = [utils.Locus(line[0],line[1],line[2],'.',line[-1]) for line in cpgBed]
    cpgCollection = utils.LocusCollection(cpgLoci,50)

    #next make the tss collection of active promoters
    print('LOADING ACTIVE PROMOTERS')
    startDict = utils.makeStartDict(annotFile)
    activeTable = utils.parseTable(activeGeneFile,'\t')
    tss_1kb_loci = [utils.makeTSSLocus(line[1],startDict,1000,1000) for line in activeTable]
    tss_1kb_collection = utils.LocusCollection(tss_1kb_loci,50)

    #enhancer file
    print("LOADING ACTIVE ENHANCERS")
    enhancerTable = utils.parseTable(enhancerFile,'\t')
    print('STARTING WITH THE FOLLOWING NUMBER OF ENHANCERS IN NB')
    #presumably the table carries 6 non data rows - the loop below skips every
    #row whose first field starts with '#' or 'R'
    print(len(enhancerTable) - 6)
    enhancerLoci = []
    for line in enhancerTable:
        if line[0][0] != '#' and line[0][0] != 'R':
            try:
                lineLocus = utils.Locus(line[1],int(line[2]),int(line[3]),'.',line[0])
                enhancerLoci.append(lineLocus)
            except IndexError:
                print(line)
                sys.exit()
    enhancerCollection = utils.LocusCollection(enhancerLoci,50)

    print('CLASSIFYING MYCN PEAKS')
    for ticker, line in enumerate(mycnSignalTable[1:]):
        if ticker%100 == 0:
            print(ticker)
        #index of this peak in the full tables (row 0 is the header)
        i = ticker + 1

        peakID = line[0]
        mycn_signal = round(mycn_sig_dict[peakID],4)
        h3k27ac_signal = round(h3k27ac_sig_dict[peakID],4)

        #column 1 looks like chrom(sense):start-stop
        locusString = line[1]
        chrom = locusString.split('(')[0]
        [start,stop] = [int(x) for x in locusString.split(':')[-1].split('-')]
        lineLocus = utils.Locus(chrom,start,stop,'.',peakID)

        tssOverlap = 0
        if tss_1kb_collection.getOverlap(lineLocus,'both'):
            tssOverlap = 1

        #enhancers are only called outside of active promoters
        enhancerOverlap = 0
        if enhancerCollection.getOverlap(lineLocus,'both') and tssOverlap == 0:
            enhancerOverlap = 1

        #the cpg overlap is queried once and reused for flag and fraction
        overlappingCpGLoci = cpgCollection.getOverlap(lineLocus,'both')
        cpgIslandOverlap = 1 if overlappingCpGLoci else 0

        #now do fractional cpgOverlap
        overlappingBases = 0
        for locus in overlappingCpGLoci:
            cpgStart = max(locus.start(),lineLocus.start())
            cpgEnd = min(locus.end(),lineLocus.end())
            overlappingBases += (cpgEnd-cpgStart)
        overlapFraction = round(float(overlappingBases)/lineLocus.len(),2)

        #now get the seq
        #str.upper() replaces string.upper(), which no longer exists in python 3
        lineSeq = utils.fetchSeq(genomeDirectory,chrom,start,stop,True).upper()
        seqLength = len(lineSeq)
        #number of start positions used for the expected counts
        positions = seqLength - 5

        gcFreq = round(float(lineSeq.count('GC') + lineSeq.count('CG'))/seqLength,2)

        #frequency of every dinucleotide; the previous code counted 'GC' for
        #every key, which made all expected values below identical nonsense
        #NOTE: str.count does not count overlapping hits (e.g. 'AA' in 'AAA')
        dinuc_dict = {}
        for nmer in dinuc:
            dinuc_dict[nmer] = float(lineSeq.count(nmer))/seqLength

        mycnRankLine = mycnRankTable[i]
        mycnRank = numpy.mean([float(x) for x in mycnRankLine[6:]])

        canon_count = len(re.findall('CACGTG',lineSeq))
        ebox_count = len(re.findall('CA..TG',lineSeq))
        non_canon_count = ebox_count-canon_count

        #get the expected values
        canon_exp = dinuc_dict['CA']*dinuc_dict['CG']*dinuc_dict['TG']*positions
        canon_exp = round(canon_exp,2)

        notCG = 1- dinuc_dict['CG']
        non_exp = dinuc_dict['CA']*notCG*dinuc_dict['TG']*positions
        non_exp = round(non_exp,2)

        #for gata and GABPA (motif and its reverse complement)
        gabpa_count = len(re.findall('CGGAAG',lineSeq)) + len(re.findall('CTTCCG',lineSeq))
        gabpa_exp_f = dinuc_dict['CG'] * dinuc_dict['GA'] * dinuc_dict['AG']*positions
        gabpa_exp_r = dinuc_dict['CT'] * dinuc_dict['TC'] * dinuc_dict['CG']*positions
        gabpa_exp = round(gabpa_exp_f,2) + round(gabpa_exp_r,2)

        gata_count = len(re.findall('GATAA',lineSeq)) + len(re.findall('TTATC',lineSeq))
        an_freq = 1 - dinuc_dict['AA'] - dinuc_dict['AT'] - dinuc_dict['AG'] -dinuc_dict['AC']
        cn_freq = 1 - dinuc_dict['CA'] - dinuc_dict['CT'] - dinuc_dict['CG'] -dinuc_dict['CC']
        gata_exp_f = dinuc_dict['GA'] * dinuc_dict['TA'] * an_freq*positions
        gata_exp_r = dinuc_dict['TT'] * dinuc_dict['AT'] * cn_freq*positions
        gata_exp = round(gata_exp_f,2) + round(gata_exp_r,2)

        newLine = [peakID,chrom,start,stop,lineLocus.len(),tssOverlap,enhancerOverlap,cpgIslandOverlap,overlapFraction,gcFreq,mycnRank,mycn_signal,h3k27ac_signal,canon_count,non_canon_count,ebox_count,canon_exp,non_exp,gabpa_count,gabpa_exp,gata_count,gata_exp]
        outTable.append(newLine)

    utils.unParseTable(outTable,outFile,'\t')

    return outFile


def _mean_background_subtracted_signal(signalTable,signalFile,names_list,dataDict):
    '''
    builds a dict keyed by the first column of signalTable whose value is the
    mean over names_list of (dataset signal - background signal)

    signalTable: parsed signal table, header row first
    signalFile: path of the table, only used in the error message
    names_list: dataset names; each must be a header column and a dataDict key
    dataDict: dataDict[name]['background'] names the background column

    exits through sys.exit() when the first column is not unique
    '''
    background_list = [dataDict[name]['background'] for name in names_list]
    header = signalTable[0]
    chip_columns = [header.index(name) for name in names_list]
    background_columns = [header.index(background_name) for background_name in background_list]

    #this only works if the first column are unique identifiers
    if len(signalTable) != len(utils.uniquify([line[0] for line in signalTable])):
        #the format string needs a %s for the file name, otherwise the % operator
        #raises TypeError before the message is ever printed
        print('Error: Column 1 of %s must contain unique identifiers.' % (signalFile))
        sys.exit()

    sig_dict = {}
    for line in signalTable[1:]:
        line_sig = [
            float(line[chip_col]) - float(line[background_col])
            for chip_col, background_col in zip(chip_columns, background_columns)
        ]
        sig_dict[line[0]] = numpy.mean(line_sig)
    return sig_dict