def splitRegions(inputGFF, tssCollection):
    #if even a single coordinate is shared with the +/-1kb
    splitGFF = []
    debugCount = 0
    for line in inputGFF:
        chrom = line[0]
        regionID = line[1]
        lineLocus = utils.Locus(line[0], line[3], line[4], '.')

        overlappingLoci = tssCollection.getOverlap(lineLocus)
        if len(overlappingLoci) > 0:  #case where a tss Overlap
            #identify the parts of the line locus that are contained
            localTSSCollection = utils.LocusCollection(overlappingLoci, 50)
            overlappingCoords = lineLocus.coords()
            for tssLocus in overlappingLoci:
                overlappingCoords += tssLocus.coords()

            overlappingCoords = utils.uniquify(overlappingCoords)
            overlappingCoords.sort()

            #you need to hack and slash add 1 to the last coordinate of the overlappingCoords
            overlappingCoords[-1] += 1

            i = 0
            regionTicker = 1
            while i < (len(overlappingCoords) - 1):
                start = int(overlappingCoords[i])
                stop = int(overlappingCoords[(i + 1)]) - 1
                if (stop - start) < 50:  #this eliminates really tiny regions
                    i += 1
                    continue
                splitLocus = utils.Locus(chrom, start + 1, stop, '.')

                if lineLocus.overlaps(splitLocus):  #has to be a mycn site
                    newID = '%s_%s' % (regionID, regionTicker)
                    tssStatus = 0
                    if localTSSCollection.getOverlap(splitLocus):
                        tssStatus = 1
                    splitGFFLine = [chrom, newID, newID, start, stop, '', '.', tssStatus, newID]

                    splitGFF.append(splitGFFLine)
                    regionTicker += 1
                i += 1
        else:
            line[7] = 0
            splitGFF.append(line)

    return splitGFF
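# Hedged usage sketch for splitRegions above (not part of the original script);
# the annotation path and input gff path are hypothetical placeholders. A +/-1kb
# TSS collection is built the same way the surrounding pipeline code does, then
# the input regions are split wherever they share coordinates with a TSS window.
def example_splitRegions_usage():
    annotFile = './annotation/hg19_refseq.ucsc'                          # hypothetical path
    inputGFF = utils.parseTable('./HG19_INPUT_REGIONS_-0_+0.gff', '\t')  # hypothetical path

    startDict = utils.makeStartDict(annotFile)
    tssLoci = [utils.makeTSSLocus(gene, startDict, 1000, 1000) for gene in startDict]
    tssCollection = utils.LocusCollection(tssLoci, 50)

    splitGFF = splitRegions(inputGFF, tssCollection)
    utils.unParseTable(splitGFF, './HG19_INPUT_REGIONS_TSS_SPLIT.gff', '\t')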
def merge_regions():
    '''
    merges ha peaks to identify all overlapping peaks
    filters out anything overlapping the HA controls
    '''
    hk_dox_ha_1 = utils.importBoundRegion('%sHK_DOX_HA_1_peaks.bed' % (macsEnrichedFolder), 'HK_DOX_HA_1')
    hk_dox_ha_2 = utils.importBoundRegion('%sHK_DOX_HA_2_peaks.bed' % (macsEnrichedFolder), 'HK_DOX_HA_2')
    hk_dox_loci = hk_dox_ha_1.getLoci() + hk_dox_ha_2.getLoci()

    #control datasets
    hk_ctl_ha_1 = utils.importBoundRegion('%sHK_CTL_HA_1_peaks.bed' % (macsEnrichedFolder), 'HK_CTL_HA_1')
    hk_ctl_ha_2 = utils.importBoundRegion('%sHK_CTL_HA_2_peaks.bed' % (macsEnrichedFolder), 'HK_CTL_HA_2')
    hk_ctl_loci = hk_ctl_ha_1.getLoci() + hk_ctl_ha_2.getLoci()

    hk_ctl_lc = utils.LocusCollection(hk_ctl_loci)

    print(len(hk_dox_loci))
    stitched_lc = utils.LocusCollection(hk_dox_loci).stitchCollection()
    print(len(stitched_lc))

    filtered_loci = []
    for locus in stitched_lc.getLoci():
        if len(hk_dox_ha_1.getOverlap(locus)) > 0 and len(hk_dox_ha_2.getOverlap(locus)) > 0:
            if len(hk_ctl_lc.getOverlap(locus)) == 0:
                filtered_loci.append(locus)

    print(len(filtered_loci))
    filtered_lc = utils.LocusCollection(filtered_loci)
    gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_0_+0.gff' % (gffFolder)
    filtered_gff = utils.locusCollectionToGFF(filtered_lc)

    utils.unParseTable(filtered_gff, gff_path, '\t')
def stitchValleys(valleyList):
    '''
    takes a list of valley loci
    returns a stitched list of valleys to extract seq from
    '''
    valleyCollection = utils.LocusCollection(valleyList, 1)
    stitchedValleyCollection = valleyCollection.stitchCollection()
    loci = []
    regions = []
    for valley in stitchedValleyCollection.getLoci():
        if [valley.chr(), valley.start(), valley.end()] not in regions:
            loci.append(valley)
            regions.append([valley.chr(), valley.start(), valley.end()])
    return loci
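# Illustrative sketch of stitchValleys on hand-made loci (not from the original
# pipeline); the coordinates are made up. Overlapping valleys on chr1 collapse
# into a single stitched locus, while the chr2 valley is returned unchanged.
def example_stitchValleys():
    valleyList = [
        utils.Locus('chr1', 1000, 1500, '.', 'valley_1'),
        utils.Locus('chr1', 1400, 1900, '.', 'valley_2'),  # overlaps valley_1
        utils.Locus('chr2', 5000, 5400, '.', 'valley_3'),
    ]
    stitched = stitchValleys(valleyList)
    for valley in stitched:
        print([valley.chr(), valley.start(), valley.end()])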
def calculatePromoterActivity(annotationFile, bamFile, projectName, projectFolder, refseqToNameDict, background=False):
    '''
    calculates the level of acetylation at each TF promoter
    '''
    print 'GENERATING AN ACTIVITY TABLE USING CHIP DATA'

    annotTable = utils.parseTable(annotationFile, '\t')
    output = []
    counter = 0

    bam = utils.Bam(bamFile)
    if background:
        background = utils.Bam(background)

    startDict = utils.makeStartDict(annotationFile)

    tssLoci = []
    for gene in startDict:
        tssLoci.append(utils.makeTSSLocus(gene, startDict, 2500, 2500))
    tssCollection = utils.LocusCollection(tssLoci, 50)

    gff = utils.locusCollectionToGFF(tssCollection)

    outputname = projectFolder + projectName + '_TSS.gff'
    utils.unParseTable(gff, outputname, '\t')

    mappingCmd = 'bamliquidator_batch'
    mappingCmd += ' -r ' + outputname
    mappingCmd += ' -o ' + projectFolder + 'bamliquidator'
    mappingCmd += ' -m -e 200 '
    mappingCmd += bamFile

    subprocess.call(mappingCmd, shell=True)

    print mappingCmd
def makeSECollection(enhancerFile, name, top=0):
    '''
    returns a locus collection from a super table
    top gives the number of rows
    '''
    enhancerTable = utils.parseTable(enhancerFile, '\t')
    superLoci = []

    ticker = 0
    for line in enhancerTable:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue
        else:
            ticker += 1
            superLoci.append(utils.Locus(line[1], line[2], line[3], '.', name + '_' + line[0]))

            if ticker == top:
                break
    return utils.LocusCollection(superLoci, 50)
def makeSECollection(enhancerFile, name, superOnly=True):
    '''
    returns a locus collection from a super table
    if superOnly, stops at the first row whose super flag (last column) is 0
    '''
    enhancerTable = utils.parseTable(enhancerFile, '\t')
    enhancerLoci = []

    for line in enhancerTable:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue
        else:
            if superOnly and int(line[-1]) == 0:
                break
            enhancerLoci.append(utils.Locus(line[1], line[2], line[3], '.', name + '_' + line[0]))

    return utils.LocusCollection(enhancerLoci, 50)
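# Usage sketch for the makeSECollection variant directly above (not part of the
# original code); the variant with the `top` argument behaves the same way but
# truncates after `top` rows instead of at the IS_SUPER cutoff. The enhancer
# table path is a hypothetical ROSE output file.
def example_makeSECollection_usage():
    enhancerFile = './rose/NB_H3K27AC_AllEnhancers.table.txt'  # hypothetical path
    se_collection = makeSECollection(enhancerFile, 'NB_H3K27AC', superOnly=True)
    print('loaded %s super-enhancer loci' % (len(se_collection.getLoci())))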
def calculatePromoterActivity(annotationFile, bamFile, projectName, projectFolder, refseqToNameDict):
    '''
    calculates the level of H3K27ac at each promoter from a H3K27ac bam file
    '''
    print 'IDENTIFY EXPRESSED GENES'

    annotTable = utils.parseTable(annotationFile, '\t')
    output = []
    counter = 0

    bam = utils.Bam(bamFile)

    startDict = utils.makeStartDict(annotationFile)

    tssLoci = []
    for gene in startDict:
        tssLoci.append(utils.makeTSSLocus(gene, startDict, 1000, 1000))
    tssCollection = utils.LocusCollection(tssLoci, 50)

    gff = utils.locusCollectionToGFF(tssCollection)

    outputname = projectFolder + projectName + '_TSS.gff'
    utils.unParseTable(gff, outputname, '\t')

    # run bamToGFF.py to quantify signal at each TSS +/- 1kb
    mappingCmd = 'python ./bamToGFF.py'
    mappingCmd += ' -r '
    mappingCmd += ' -d '
    mappingCmd += ' -o ' + projectFolder + 'matrix.gff'
    mappingCmd += ' -m 1 -f 0 -e 200 '
    mappingCmd += ' -i ' + projectFolder + projectName + '_TSS.gff'
    mappingCmd += ' -b ' + bamFile

    call(mappingCmd, shell=True)

    print mappingCmd
def loadAnnotFile(genome, window, geneList=[], skip_cache=False):
    """
    load in the annotation and create a startDict and tss collection for a set of refseq IDs and a given genome
    20170213, add by Quanhu Sheng
    return validGenes
    """
    genomeDict = {
        'HG18': 'annotation/hg18_refseq.ucsc',
        'MM9': 'annotation/mm9_refseq.ucsc',
        'MM10': 'annotation/mm10_refseq.ucsc',
        'HG19': 'annotation/hg19_refseq.ucsc',
        'HG19_RIBO': 'annotation/hg19_refseq.ucsc',
        'RN4': 'annotation/rn4_refseq.ucsc',
        'RN6': 'annotation/rn6_refseq.ucsc',
    }

    annotFile = whereAmI + '/' + genomeDict[string.upper(genome)]

    if not skip_cache:
        # Try loading from a cache, if the crc32 matches
        annotPathHash = zlib.crc32(annotFile) & 0xFFFFFFFF  # hash the entire location of this script
        annotFileHash = zlib.crc32(open(annotFile, "rb").read()) & 0xFFFFFFFF

        cache_file_name = "%s.%s.%s.cache" % (genome, annotPathHash, annotFileHash)

        cache_file_path = '%s/%s' % (tempfile.gettempdir(), cache_file_name)

        if os.path.isfile(cache_file_path):
            # Cache exists! Load it!
            try:
                print('\tLoading genome data from cache.')
                with open(cache_file_path, 'rb') as cache_fh:
                    cached_data = cPickle.load(cache_fh)
                print('\tCache loaded.')
                # the cache stores (startDict, tssCollection); rebuild validGenes
                # so the cached path returns the same 3-tuple as the slow path
                startDict, tssCollection = cached_data
                validGenes = [gene for gene in geneList if gene in startDict]
                return startDict, tssCollection, validGenes
            except (IOError, cPickle.UnpicklingError):
                # Pickle corrupt? Let's get rid of it.
                print('\tWARNING: Cache corrupt or unreadable. Ignoring.')
        else:
            print('\tNo cache exists: Loading annotation (slow).')

    # We're still here, so either caching was disabled, or the cache doesn't exist
    startDict = utils.makeStartDict(annotFile, geneList)
    tssLoci = []
    validGenes = []
    for gene in geneList:
        if gene in startDict:
            tssLoci.append(utils.makeTSSLocus(gene, startDict, window, window))
            validGenes.append(gene)
        else:
            print('\tWARNING: gene %s not in annotation database. Ignoring.' % gene)

    tssCollection = utils.LocusCollection(tssLoci, 50)

    if not skip_cache:
        print('Writing cache for the first time.')
        with open(cache_file_path, 'wb') as cache_fh:
            cPickle.dump((startDict, tssCollection), cache_fh, cPickle.HIGHEST_PROTOCOL)

    return startDict, tssCollection, validGenes
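# Usage sketch for loadAnnotFile (not from the original code). The refseq IDs
# are hypothetical examples; genes missing from the annotation are reported and
# dropped, and only the genes that survive are returned in validGenes.
def example_loadAnnotFile_usage():
    geneList = ['NM_005378', 'NM_002467']  # hypothetical refseq IDs
    startDict, tssCollection, validGenes = loadAnnotFile('HG19', 1000, geneList)
    print('%s of %s genes found in the annotation' % (len(validGenes), len(geneList)))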
def findCanidateTFs(annotationFile, superLoci, expressedNM, TFlist, refseqToNameDict, projectFolder, projectName): ''' find all TFs within 1Mb of the super-enhancer center that are considered expressed return a dictionary keyed by TF that points to a list of super-enhancer loci ''' print 'FINDING CANIDATE TFs' startDict = utils.makeStartDict(annotationFile) # Find the location of the TSS of all transcripts (NMid) considered expressed tssLoci = [] for geneID in expressedNM: tssLoci.append(utils.makeTSSLocus(geneID,startDict,0,0)) tssCollection = utils.LocusCollection(tssLoci,50) # Assign all transcripts (NMid) that are TFs to a super-enhancer if it is the closest gene seAssignment = [] seAssignmentGene = [] TFandSuperDict = {} for superEnh in superLoci: seCenter = (superEnh.start() + superEnh.end()) / 2 # Find all transcripts whose TSS occur within 1Mb of the SE center searchLocus = utils.Locus(superEnh.chr(), superEnh.start()-1000000, superEnh.end()+1000000, '.') allEnhancerLoci = tssCollection.getOverlap(searchLocus) allEnhancerGenes = [locus.ID() for locus in allEnhancerLoci] # Find the transcript that is closest to the center if allEnhancerGenes: distList = [abs(seCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes] closestGene = allEnhancerGenes[distList.index(min(distList))] else: closestGene = '' seAssignment.append([superEnh.chr(), superEnh.start(), superEnh.end(), closestGene]) # Select the transcript if it is a TF, and allow for a TF to have multiple SEs if closestGene in TFlist and closestGene not in TFandSuperDict.keys(): TFandSuperDict[closestGene] = [superEnh] elif closestGene in TFlist and closestGene in TFandSuperDict.keys(): TFandSuperDict[closestGene].append(superEnh) # Convert the selected TF NMids to gene names if closestGene != '': geneName = refseqToNameDict[closestGene] seAssignmentGene.append([superEnh.chr(), superEnh.start(), superEnh.end(), geneName]) # Output the list of SE-assigned transcripts (NMids) seAssignmentFile = projectFolder + projectName + '_SE_ASSIGNMENT_TRANSCRIPT.txt' utils.unParseTable(seAssignment, seAssignmentFile, '\t') # Output the list of SE-assigned genes seAssignmentGeneFile = projectFolder + projectName + '_SE_ASSIGNMENT_GENE.txt' utils.unParseTable(seAssignmentGene, seAssignmentGeneFile, '\t') print 'Number of canidate TFs:', len(TFandSuperDict) return TFandSuperDict
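# Minimal sketch (illustrative only) of the closest-gene assignment at the core
# of findCanidateTFs above: given one super-enhancer locus, search +/-1Mb of its
# boundaries for expressed TSSs and pick the transcript whose TSS is nearest the
# enhancer center. Returns '' when no TSS falls in the search window.
def example_closest_gene(superEnh, tssCollection, startDict):
    seCenter = (superEnh.start() + superEnh.end()) / 2
    searchLocus = utils.Locus(superEnh.chr(), superEnh.start() - 1000000,
                              superEnh.end() + 1000000, '.')
    candidateIDs = [locus.ID() for locus in tssCollection.getOverlap(searchLocus)]
    if not candidateIDs:
        return ''
    distList = [abs(seCenter - startDict[geneID]['start'][0]) for geneID in candidateIDs]
    return candidateIDs[distList.index(min(distList))]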
def collapseFimo(fimo_output, gene_to_enhancer_dict, candidate_tf_list, output_folder, analysis_name, motifConvertFile): ''' collapses motifs from fimo for each source node (TF) and each target node (gene enhancer regions), collapse motif instances then spit out a ginormous set of beds and a single crazy collapsed bed ''' #first build up the motif name conversion database motifDatabase = utils.parseTable(motifConvertFile, '\t') motifDatabaseDict = defaultdict(list) # The reverse of the other dict, from motif name to gene name # a motif can go to multiple genes for line in motifDatabase: motifDatabaseDict[line[0]].append(line[1]) #make the folder to store motif beds utils.formatFolder('%smotif_beds/' % (output_folder), True) edgeDict = {} #first layer are source nodes for tf in candidate_tf_list: edgeDict[tf] = defaultdict( list ) #next layer are target nodes which are derived from the fimo output fimoTable = utils.parseTable(fimo_output, '\t') print(fimo_output) #fimo sometimes puts the region in either the first or second column fimo_line = fimoTable[1] if fimo_line[1].count('|') > 0: region_index = 1 else: region_index = 2 print('USING COLUMN %s OF FIMO OUTPUT FOR REGION' % (region_index)) for line in fimoTable[1:]: source_tfs = motifDatabaseDict[line[0]] #motifId for source in source_tfs: if candidate_tf_list.count(source) == 0: continue region = line[region_index].split('|') target = region[0] if region_index == 2: target_locus = utils.Locus(region[1], int(region[2]) + int(line[3]), int(region[2]) + int(line[4]), '.') else: target_locus = utils.Locus(region[1], int(region[2]) + int(line[2]), int(region[2]) + int(line[3]), '.') #what's missing here is the enhancer id of the target locus try: edgeDict[source][target].append(target_locus) except KeyError: print('this motif is not in the network') print(line) sys.exit() #now we actually want to collapse this down in a meaningful way #overlapping motifs count as a single binding site. This way a TF with tons of motifs #that finds the same site over and over again doesn't get over counted all_bed = [] all_bed_path = '%s%s_all_motifs.bed' % (output_folder, analysis_name) for tf in candidate_tf_list: print(tf) target_nodes = edgeDict[tf].keys() bed_header = [ 'track name = "%s" description="%s motifs in %s"' % (tf, tf, analysis_name) ] all_bed.append(bed_header) target_bed = [bed_header] target_bed_path = '%smotif_beds/%s_motifs.bed' % (output_folder, tf) for target in target_nodes: edgeCollection = utils.LocusCollection(edgeDict[tf][target], 50) edgeCollection = edgeCollection.stitchCollection() edgeLoci = edgeCollection.getLoci() edgeDict[tf][target] = edgeLoci for locus in edgeLoci: bed_line = [ locus.chr(), locus.start(), locus.end(), target, '', '+' ] target_bed.append(bed_line) all_bed.append(bed_line) utils.unParseTable(target_bed, target_bed_path, '\t') #now the loci are all stitched up utils.unParseTable(all_bed, all_bed_path, '\t') return edgeDict
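# Illustrative sketch (not from the original code) of the collapse step used in
# collapseFimo: overlapping motif instances for one TF/target pair are stitched
# so that a dense cluster of hits counts as a single binding site. Coordinates
# below are made up.
def example_motif_collapse():
    motif_loci = [
        utils.Locus('chr2', 16080000, 16080006, '.'),
        utils.Locus('chr2', 16080003, 16080009, '.'),  # overlaps the first hit
        utils.Locus('chr2', 16081000, 16081006, '.'),
    ]
    collapsed = utils.LocusCollection(motif_loci, 50).stitchCollection().getLoci()
    print('%s motif hits -> %s binding sites' % (len(motif_loci), len(collapsed)))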
def main(): """ main run function """ #usage = "usage: %prog [options] -g [GENOME] -b [SORTED BAMFILE(S)] -i [INPUTFILE] -o [OUTPUTFOLDER]" parser = argparse.ArgumentParser(usage='%(prog)s [options]') # required flags parser.add_argument( "-b", "--bam", dest="bam", nargs='*', help="Enter a comma separated list of .bam files to be processed.", required=True) parser.add_argument( "-i", "--input", dest="input", type=str, help="Enter .gff or genomic region e.g. chr1:+:1-1000.", required=True) parser.add_argument( "-g", "--genome", dest="genome", type=str, help="specify a genome, HG18,HG19,MM8,MM9,MM10 are currently supported", required=True) # output flag parser.add_argument("-o", "--output", dest="output", type=str, help="Enter the output folder.", required=True) # additional options parser.add_argument( "--stretch-input", dest="stretch_input", default=None, type=int, help= "Stretch the input regions to a minimum length in bp, e.g. 10000 (for 10kb)" ) parser.add_argument( "-c", "--color", dest="color", default=None, help= "Enter a colon separated list of colors e.g. 255,0,0:255,125,0, default samples the rainbow" ) parser.add_argument( "-s", "--sense", dest="sense", default='both', help="Map to '+','-' or 'both' strands. Default maps to both.") parser.add_argument("-e", "--extension", dest="extension", default=200, help="Extends reads by n bp. Default value is 200bp") parser.add_argument( "-r", "--rpm", dest="rpm", action='store_true', default=False, help="Normalizes density to reads per million (rpm) Default is False") parser.add_argument( "-y", "--yScale", dest="yScale", default="relative", help= "Choose either relative or uniform y axis scaling. options = 'relative,uniform' Default is relative scaling" ) parser.add_argument( "-n", "--names", dest="names", default=None, help="Enter a comma separated list of names for your bams") parser.add_argument( "-p", "--plot", dest="plot", default="MULTIPLE", help= "Choose either all lines on a single plot or multiple plots. options = 'SINGLE,MULTIPLE,MERGE'" ) parser.add_argument( "-t", "--title", dest="title", default='', help= "Specify a title for the output plot(s), default will be the coordinate region" ) # DEBUG OPTION TO SAVE TEMP FILES parser.add_argument( "--scale", dest="scale", default='', help= "Enter a comma separated list of scaling factors for your bams. Default is none" ) parser.add_argument( "--save-temp", dest="save", action='store_true', default=False, help="If flagged will save temporary files made by bamPlot") parser.add_argument("--bed", dest="bed", help="Add a space-delimited list of bed files to plot") parser.add_argument( "--multi-page", dest="multi", action='store_true', default=False, help="If flagged will create a new pdf for each region") args = parser.parse_args() print(args) if args.bam and args.input and args.genome and args.output: # Support a legacy mode where a ',' delimited multiple files bamFileList = args.bam if len(args.bam) == 1: bamFileList = args.bam[0].split(',') # Make sure these are actually files & readable (!) for filename in bamFileList: assert (os.access(filename, os.R_OK)) # bringing in any beds if args.bed: bedFileList = args.bed if type(bedFileList) == str: bedFileList = args.bed.split(',') print(bedFileList) bedCollection = makeBedCollection(bedFileList) else: bedCollection = utils.LocusCollection([], 50) # Load the input for graphing. One of: # - A .gff # - A .bed # - a specific input region (e.g. 
chr10:.:93150000-93180000) valid_sense_options = {'+', '-', '.'} if os.access(args.input, os.R_OK): if args.input.endswith('.bed'): # Uniquely graph every input of this bed parsed_input_bed = utils.parseTable(args.input, '\t') gffName = os.path.basename(args.input) # Graph title gff = None try: if parsed_input_bed[0][5] in valid_sense_options: # This .bed might have a sense parameter gff = [[ e[0], '', args.input, e[1], e[2], '', e[5], '', '' ] for e in parsed_input_bed] except IndexError: pass if gff is None: print( "Your bed doesn't have a valid senese parameter. Defaulting to both strands, '.'" ) # We only take chr/start/stop and ignore everything else. gff = [[e[0], '', args.input, e[1], e[2], '', '.', '', ''] for e in parsed_input_bed] else: # Default to .gff, since that's the original behavior gff = utils.parseTable(args.input, '\t') gffName = args.input.split('/')[-1].split('.')[0] else: # means a coordinate line has been given e.g. chr1:+:1-100 chromLine = args.input.split(':') try: chrom = chromLine[0] sense = chromLine[1] except IndexError: print( 'Invalid input line or inaccessible file. Try: chr1:.:1-5000' ) exit() assert (sense in valid_sense_options) [start, end] = chromLine[2].split('-') if chrom[0:3] != 'chr': print('ERROR: UNRECOGNIZED GFF OR CHROMOSOME LINE INPUT') exit() gffLine = [chrom, '', args.input, start, end, '', sense, '', ''] gffName = "%s_%s_%s_%s" % (chrom, sense, start, end) gff = [gffLine] # Consider stretching the regions to a fixed minimum size if args.stretch_input: print('Stretching inputs to a minimum of: %d bp' % (args.stretch_input)) minLength = args.stretch_input stretchGff = [] for e in gff: difference = int(e[4]) - int(e[3]) if difference < minLength: pad = int((minLength - difference) / 2) stretchGff.append([ e[0], e[1], e[2], int(e[3]) - pad, int(e[4]) + pad, e[5], e[6], e[7], e[8] ]) else: stretchGff.append(e) gff = stretchGff # Sanity test the gff object assert (all([e[6] in valid_sense_options for e in gff])) # All strands are sane #assert(all([int(e[3]) < int(e[4]) for e in gff])) # All start/stops are ordered # bring in the genome genome = args.genome.upper() if ['HG18', 'HG19', 'HG19_RIBO', 'MM9', 'MM10', 'RN4'].count(genome) == 0: print( 'ERROR: UNSUPPORTED GENOME TYPE %s. USE HG19,HG18, RN4, MM9, or MM10' % (genome)) parser.print_help() exit() # bring in the rest of the options # output rootFolder = args.output if rootFolder[-1] != '/': rootFolder += '/' try: os.listdir(rootFolder) except OSError: print('ERROR: UNABLE TO FIND OUTPUT DIRECTORY %s' % (rootFolder)) exit() # Get analysis title if len(args.title) == 0: title = gffName else: title = args.title # make a temp folder tempFolder = rootFolder + title + '/' print("CREATING TEMP FOLDER %s" % (tempFolder)) pipeline_dfci.formatFolder(tempFolder, create=True) # colors if args.color: colorList = args.color.split(':') colorList = [x.split(',') for x in colorList] if len(colorList) < len(bamFileList): print( 'WARNING: FEWER COLORS THAN BAMS SPECIFIED. 
COLORS WILL BE RECYCLED' ) # recycling the color list colorList += colorList * (len(bamFileList) / len(colorList)) colorList = colorList[0:len(bamFileList)] else: # cycles through the colors of the rainbow colorList = tasteTheRainbow(len(bamFileList)) # sense sense = args.sense extension = int(args.extension) rpm = args.rpm scale = args.scale yScale = args.yScale.upper() # names if args.names: names = args.names.split(',') if len(names) != len(bamFileList): print( 'ERROR: NUMBER OF NAMES AND NUMBER OF BAMS DO NOT CORRESPOND' ) parser.print_help() exit() else: names = [x.split('/')[-1] for x in bamFileList] # plot style plotStyle = args.plot.upper() if ['SINGLE', 'MULTIPLE', 'MERGE'].count(plotStyle) == 0: print('ERROR: PLOT STYLE %s NOT AN OPTION' % (plotStyle)) parser.print_help() exit() # now run! summaryTableFileName = makeBamPlotTables(gff, genome, bamFileList, colorList, nBins, sense, extension, rpm, tempFolder, names, title, bedCollection, scale) print("%s is the summary table" % (summaryTableFileName)) #running the R command to plot multi = args.multi outFile = "%s%s_plots.pdf" % (rootFolder, title) rCmd = callRPlot(summaryTableFileName, outFile, yScale, plotStyle, multi) # open a bash file bashFileName = "%s%s_Rcmd.sh" % (tempFolder, title) bashFile = open(bashFileName, 'w') bashFile.write('#!/usr/bin/bash\n') bashFile.write(rCmd) bashFile.close() print("Wrote R command to %s" % (bashFileName)) os.system("bash %s" % (bashFileName)) # delete temp files if not args.save: if utils.checkOutput(outFile, 1, 10): # This is super dangerous (!). Add some sanity checks. assert (" " not in tempFolder) assert (tempFolder is not "/") removeCommand = "rm -rf %s" % (tempFolder) print(removeCommand) os.system(removeCommand) else: print("ERROR: NO OUTPUT FILE %s DETECTED" % (outFile)) else: parser.print_help() sys.exit()
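# Small illustrative sketch of the single-region input format accepted by the
# bamPlot main() above, e.g. 'chr10:.:93150000-93180000', converted into one
# gff-style row the same way the parser branch does.
def example_parse_region_input(input_string='chr10:.:93150000-93180000'):
    chrom, sense, coords = input_string.split(':')
    start, end = coords.split('-')
    gffLine = [chrom, '', input_string, start, end, '', sense, '', '']
    return [gffLine]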
def mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, output, refName): ''' makes a table of factor density in a stitched locus and ranks table by number of loci stitched together ''' print('FORMATTING TABLE') loci = stitchedCollection.getLoci() locusTable = [[ 'REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE' ]] lociLenList = [] # strip out any that are in chrY for locus in list(loci): if locus.chr() == 'chrY': loci.remove(locus) for locus in loci: # numLociList.append(int(stitchLocus.ID().split('_')[1])) lociLenList.append(locus.len()) # numOrder = order(numLociList,decreasing=True) lenOrder = utils.order(lociLenList, decreasing=True) ticker = 0 for i in lenOrder: ticker += 1 if ticker % 1000 == 0: print(ticker) locus = loci[i] # First get the size of the enriched regions within the stitched locus refEnrichSize = 0 refOverlappingLoci = referenceCollection.getOverlap(locus, 'both') for refLocus in refOverlappingLoci: refEnrichSize += refLocus.len() try: stitchCount = int(locus.ID().split('_')[0]) except ValueError: stitchCount = 1 coords = [int(x) for x in locus.coords()] locusTable.append([ locus.ID(), locus.chr(), min(coords), max(coords), stitchCount, refEnrichSize ]) print('GETTING MAPPED DATA') print("USING A BAMFILE LIST:") print(bamFileList) for bamFile in bamFileList: bamFileName = bamFile.split('/')[-1] print('GETTING MAPPING DATA FOR %s' % bamFile) # assumes standard convention for naming enriched region gffs # opening up the mapped GFF print('OPENING %s%s_%s_MAPPED/matrix.txt' % (mappedFolder, refName, bamFileName)) mappedGFF = utils.parseTable( '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, refName, bamFileName), '\t') signalDict = defaultdict(float) print('MAKING SIGNAL DICT FOR %s' % (bamFile)) mappedLoci = [] for line in mappedGFF[1:]: chrom = line[1].split('(')[0] start = int(line[1].split(':')[-1].split('-')[0]) end = int(line[1].split(':')[-1].split('-')[1]) mappedLoci.append(utils.Locus(chrom, start, end, '.', line[0])) try: signalDict[line[0]] = float(line[2]) * (abs(end - start)) except ValueError: print('WARNING NO SIGNAL FOR LINE:') print(line) continue mappedCollection = utils.LocusCollection(mappedLoci, 500) locusTable[0].append(bamFileName) for i in range(1, len(locusTable)): signal = 0.0 line = locusTable[i] lineLocus = utils.Locus(line[1], line[2], line[3], '.') overlappingRegions = mappedCollection.getOverlap(lineLocus, sense='both') for region in overlappingRegions: signal += signalDict[region.ID()] locusTable[i].append(signal) utils.unParseTable(locusTable, output, '\t')
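# Illustrative helper (not in the original code) showing how mapCollection
# parses one row of a bamliquidator matrix.txt: column 1 is the region ID,
# column 2 is 'chrom(sense):start-end', column 3 is the density value that the
# function scales by region width to get total signal.
def example_parse_matrix_row(row):
    region_id, locus_string, density = row[0], row[1], float(row[2])
    chrom = locus_string.split('(')[0]
    start = int(locus_string.split(':')[-1].split('-')[0])
    end = int(locus_string.split(':')[-1].split('-')[1])
    signal = density * abs(end - start)
    return utils.Locus(chrom, start, end, '.', region_id), signal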
import os, sys
sys.path.append('/storage/cylin/bin/pipeline/')
import utils

motif_bed_dir = '/storage/cylin/grail/projects/rasmc_all/beds/srf_motif_analysis/'

motif_beds = os.listdir(motif_bed_dir)

allLoci = []
for bed in motif_beds:
    TF_name = bed.split('_')[0]
    collection = utils.importBoundRegion('%s%s' % (motif_bed_dir, bed), TF_name)
    allLoci += collection.getLoci()

giant_collection = utils.LocusCollection(allLoci, 50)
stitched_collection = giant_collection.stitchCollection(stitchWindow=50)
new_bed = utils.locusCollectionToBed(stitched_collection)

utils.unParseTable(new_bed, '%s50_bp_stitched_srf_motif_analysis_bed.bed' % (motif_bed_dir), '\t')
def main(): projectFolder = '/storage/goodell/home/jmreyes/projects/amish_ayala/' #gather up DMR tables #ayala MUT vs WT mutWT_hypo = utils.parseTable(projectFolder + 'bed/hypoDMRsWT.vs.Mut.bed', '\t') mutWT_hyper = utils.parseTable( projectFolder + 'bed/hyperDMRsWT.vs.Mut.bed', '\t') mutWT_control = utils.parseTable( projectFolder + 'bed/Control_nonDMRsWT.vs.Mut.bed', '\t') #ley all tbrs_all = utils.parseTable(projectFolder + 'bed/TBRS_DMRs.bed', '\t') aml_all = utils.parseTable(projectFolder + 'bed/AML_DMRs.bed', '\t') tbrs_hypo = [] tbrs_hyper = [] aml_hypo = [] aml_hyper = [] tbrs_all_loci = [] aml_all_loci = [] for line in tbrs_all: chrom = 'chr' + line[0] start = line[1] end = line[2] if 'hypo' in line: tbrs_all_loci.append( utils.Locus( chrom, start, end, '.', 'tbrs_all_hypo_' + str(chrom) + ':' + str(start) + '-' + str(end))) elif 'hyper' in line: tbrs_all_loci.append( utils.Locus( chrom, start, end, '.', 'tbrs_all_hyper_' + str(chrom) + ':' + str(start) + '-' + str(end))) for line in aml_all: chrom = 'chr' + line[0] start = line[1] end = line[2] if 'hypo' in line: aml_all_loci.append( utils.Locus( chrom, start, end, '.', 'aml_all_hypo_' + str(chrom) + ':' + str(start) + '-' + str(end))) elif 'hyper' in line: aml_all_loci.append( utils.Locus( chrom, start, end, '.', 'aml_all_hyper_' + str(chrom) + ':' + str(start) + '-' + str(end))) mutWT_hypo_loci = [] for line in mutWT_hypo: chrom = line[0] start = line[1] end = line[2] sense = '.' locusID = 'hypo_' + str(chrom) + ':' + str(start) + '-' + str(end) new_line = utils.Locus(chrom, start, end, '.', locusID) mutWT_hypo_loci.append(new_line) mutWT_hyper_loci = [] for line in mutWT_hyper: chrom = line[0] start = line[1] end = line[2] sense = '.' locusID = 'hyper_' + str(chrom) + ':' + str(start) + '-' + str(end) new_line = utils.Locus(chrom, start, end, '.', locusID) mutWT_hyper_loci.append(new_line) print len(mutWT_hyper_loci) print len(mutWT_hypo_loci) mutWT_all_loci = mutWT_hyper_loci + mutWT_hypo_loci mutWT_hypo_LC = utils.LocusCollection(mutWT_hypo_loci) tbrs_all_LC = utils.LocusCollection(tbrs_all_loci) aml_all_LC = utils.LocusCollection(aml_all_loci) tbrs_all_overlap = [] aml_all_overlap = [] for locus in mutWT_hypo_LC.getLoci(): tbrs_overlap = tbrs_all_LC.getOverlap(locus, 'both') if len(tbrs_overlap) > 0: for overlapLocus in tbrs_overlap: overlapChrom = overlapLocus.chr() overlapStart = overlapLocus.start() overlapEnd = overlapLocus.end() tbrs_all_overlap.append([ locus.ID(), overlapChrom, overlapStart, overlapEnd, overlapLocus.ID() ]) aml_overlap = aml_all_LC.getOverlap(locus, 'both') if len(aml_overlap) > 0: for overlapLocus in aml_overlap: overlapChrom = overlapLocus.chr() overlapStart = overlapLocus.start() overlapEnd = overlapLocus.end() aml_all_overlap.append([ locus.ID(), overlapChrom, overlapStart, overlapEnd, overlapLocus.ID() ]) utils.unParseTable(tbrs_all_overlap, projectFolder + 'tables/DMRsvsTBRS_all_overlaps.txt', '\t') utils.unParseTable(aml_all_overlap, projectFolder + 'tables/DMRsvsAML_all_overlaps.txt', '\t')
def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False): ''' maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq ''' startDict = utils.makeStartDict(annotFile) enhancerName = enhancerFile.split('/')[-1].split('.')[0] enhancerTable = utils.parseTable(enhancerFile, '\t') # internal parameter for debugging byRefseq = False if len(transcribedFile) > 0: transcribedTable = utils.parseTable(transcribedFile, '\t') transcribedGenes = [line[1] for line in transcribedTable] else: transcribedGenes = startDict.keys() print('MAKING TRANSCRIPT COLLECTION') transcribedCollection = utils.makeTranscriptCollection( annotFile, 0, 0, 500, transcribedGenes) print('MAKING TSS COLLECTION') tssLoci = [] for geneID in transcribedGenes: tssLoci.append(utils.makeTSSLocus(geneID, startDict, 0, 0)) # this turns the tssLoci list into a LocusCollection # 50 is the internal parameter for LocusCollection and doesn't really # matter tssCollection = utils.LocusCollection(tssLoci, 50) geneDict = {'overlapping': defaultdict( list), 'proximal': defaultdict(list)} # dictionaries to hold ranks and superstatus of gene nearby enhancers rankDict = defaultdict(list) superDict = defaultdict(list) # list of all genes that appear in this analysis overallGeneList = [] # find the damn header for line in enhancerTable: if line[0][0] == '#': continue else: header = line break if noFormatTable: # set up the output tables # first by enhancer enhancerToGeneTable = [ header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']] else: # set up the output tables # first by enhancer enhancerToGeneTable = [ header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]] # next by gene geneToEnhancerTable = [ ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']] # next make the gene to enhancer table geneToEnhancerTable = [ ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']] for line in enhancerTable: if line[0][0] == '#' or line[0][0] == 'R': continue enhancerString = '%s:%s-%s' % (line[1], line[2], line[3]) enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0]) # overlapping genes are transcribed genes whose transcript is directly # in the stitchedLocus overlappingLoci = transcribedCollection.getOverlap( enhancerLocus, 'both') overlappingGenes = [] for overlapLocus in overlappingLoci: overlappingGenes.append(overlapLocus.ID()) # proximalGenes are transcribed genes where the tss is within 50kb of # the boundary of the stitched loci proximalLoci = tssCollection.getOverlap( utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both') proximalGenes = [] for proxLocus in proximalLoci: proximalGenes.append(proxLocus.ID()) distalLoci = tssCollection.getOverlap( utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both') distalGenes = [] for proxLocus in distalLoci: distalGenes.append(proxLocus.ID()) overlappingGenes = utils.uniquify(overlappingGenes) proximalGenes = utils.uniquify(proximalGenes) distalGenes = utils.uniquify(distalGenes) allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes # these checks make sure each gene list is unique. 
# technically it is possible for a gene to be overlapping, but not proximal since the # gene could be longer than the 50kb window, but we'll let that slide # here for refID in overlappingGenes: if proximalGenes.count(refID) == 1: proximalGenes.remove(refID) for refID in proximalGenes: if distalGenes.count(refID) == 1: distalGenes.remove(refID) # Now find the closest gene if len(allEnhancerGenes) == 0: closestGene = '' else: # get enhancerCenter enhancerCenter = (int(line[2]) + int(line[3])) / 2 # get absolute distance to enhancer center distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes] # get the ID and convert to name closestGene = startDict[ allEnhancerGenes[distList.index(min(distList))]]['name'] # NOW WRITE THE ROW FOR THE ENHANCER TABLE if noFormatTable: newEnhancerLine = list(line) newEnhancerLine.append( join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ',')) newEnhancerLine.append( join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ',')) newEnhancerLine.append(closestGene) else: newEnhancerLine = line[0:9] newEnhancerLine.append( join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ',')) newEnhancerLine.append( join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ',')) newEnhancerLine.append(closestGene) newEnhancerLine += line[-2:] enhancerToGeneTable.append(newEnhancerLine) # Now grab all overlapping and proximal genes for the gene ordered # table overallGeneList += overlappingGenes for refID in overlappingGenes: geneDict['overlapping'][refID].append(enhancerString) rankDict[refID].append(int(line[-2])) superDict[refID].append(int(line[-1])) overallGeneList += proximalGenes for refID in proximalGenes: geneDict['proximal'][refID].append(enhancerString) rankDict[refID].append(int(line[-2])) superDict[refID].append(int(line[-1])) # End loop through # Make table by gene print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION') overallGeneList = utils.uniquify(overallGeneList) #get the chromLists from the various bams here cmd = 'samtools idxstats %s' % (rankByBamFile) idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True) idxStats= idxStats.communicate() bamChromList = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]] if len(controlBamFile) > 0: cmd = 'samtools idxstats %s' % (controlBamFile) idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True) idxStats= idxStats.communicate() bamChromListControl = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]] bamChromList = [chrom for chrom in bamChromList if bamChromListControl.count(chrom) != 0] #now make sure no genes have a bad chrom overallGeneList = [gene for gene in overallGeneList if bamChromList.count(startDict[gene]['chr']) != 0] #now make an enhancer collection of all transcripts enhancerGeneCollection = utils.makeTranscriptCollection( annotFile, 5000, 5000, 500, overallGeneList) enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection) # dump the gff to file enhancerFolder = utils.getParentFolder(enhancerFile) gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome) enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName,gffRootName) utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t') # now we need to run bamToGFF # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail. 
bamliquidator_path = 'bamliquidator_batch' print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS') # map density at genes in the +/- 5kb tss region # first on the rankBy bam bamName = rankByBamFile.split('/')[-1] mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName,gffRootName, bamName) mappedRankByFile = "%s%s_%s_%s/matrix.txt" % (enhancerFolder,enhancerName, gffRootName, bamName) cmd = bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedRankByFolder,rankByBamFile) print("Mapping rankby bam %s" % (rankByBamFile)) print(cmd) os.system(cmd) #check for completion if utils.checkOutput(mappedRankByFile,0.2,5): print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile)) else: print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile)) sys.exit() # next on the control bam if it exists if len(controlBamFile) > 0: controlName = controlBamFile.split('/')[-1] mappedControlFolder = "%s%s_%s_%s/" % ( enhancerFolder, enhancerName,gffRootName, controlName) mappedControlFile = "%s%s_%s_%s/matrix.txt" % ( enhancerFolder, enhancerName,gffRootName, controlName) cmd = bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedControlFolder,controlBamFile) print("Mapping control bam %s" % (controlBamFile)) print(cmd) os.system(cmd) #check for completion if utils.checkOutput(mappedControlFile,0.2,5): print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile)) else: print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile)) sys.exit() # now get the appropriate output files if len(controlBamFile) > 0: print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" % (mappedRankByFile, mappedControlFile)) if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1): print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES') signalDict = makeSignalDict(mappedRankByFile, mappedControlFile) else: print("NO MAPPING OUTPUT DETECTED") sys.exit() else: print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile)) if utils.checkOutput(mappedRankByFile, 1, 30): print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES') signalDict = makeSignalDict(mappedRankByFile) else: print("NO MAPPING OUTPUT DETECTED") sys.exit() # use enhancer rank to order rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList]) usedNames = [] # make a new dict to hold TSS signal by max per geneName geneNameSigDict = defaultdict(list) print('MAKING GENE TABLE') for i in rankOrder: refID = overallGeneList[i] geneName = startDict[refID]['name'] if usedNames.count(geneName) > 0 and uniqueGenes == True: continue else: usedNames.append(geneName) proxEnhancers = geneDict['overlapping'][ refID] + geneDict['proximal'][refID] superStatus = max(superDict[refID]) enhancerRanks = join([str(x) for x in rankDict[refID]], ',') enhancerSignal = signalDict[refID] geneNameSigDict[geneName].append(enhancerSignal) newLine = [geneName, refID, join( proxEnhancers, ','), enhancerRanks, superStatus, enhancerSignal] geneToEnhancerTable.append(newLine) #utils.unParseTable(geneToEnhancerTable,'/grail/projects/newRose/geneMapper/foo.txt','\t') print('MAKING ENHANCER TO TOP GENE TABLE') if noFormatTable: enhancerToTopGeneTable = [ enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']] else: enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [ 'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]] for line in 
enhancerToGeneTable[1:]: geneList = [] if noFormatTable: geneList += line[-3].split(',') geneList += line[-2].split(',') else: geneList += line[10].split(',') geneList += line[11].split(',') geneList = utils.uniquify([x for x in geneList if len(x) > 0]) if len(geneList) > 0: try: sigVector = [max(geneNameSigDict[x]) for x in geneList] maxIndex = sigVector.index(max(sigVector)) maxGene = geneList[maxIndex] maxSig = sigVector[maxIndex] if maxSig == 0.0: maxGene = 'NONE' maxSig = 'NONE' except ValueError: if len(geneList) == 1: maxGene = geneList[0] maxSig = 'NONE' else: maxGene = 'NONE' maxSig = 'NONE' else: maxGene = 'NONE' maxSig = 'NONE' if noFormatTable: newLine = line + [maxGene, maxSig] else: newLine = line[0:12] + [maxGene, maxSig] + line[-2:] enhancerToTopGeneTable.append(newLine) # resort enhancerToGeneTable if noFormatTable: return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable else: enhancerOrder = utils.order([int(line[-2]) for line in enhancerToGeneTable[1:]]) sortedTable = [enhancerToGeneTable[0]] sortedTopGeneTable = [enhancerToTopGeneTable[0]] for i in enhancerOrder: sortedTable.append(enhancerToGeneTable[(i + 1)]) sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)]) return sortedTable, sortedTopGeneTable, geneToEnhancerTable
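# Hedged usage sketch for mapEnhancerToGeneTop above (not from the original
# code); every file path is a hypothetical placeholder. The call maps TSS
# signal with bamliquidator, so the bam files must exist and be indexed.
def example_mapEnhancerToGeneTop_usage():
    annotFile = './annotation/hg19_refseq.ucsc'                 # hypothetical
    enhancerFile = './rose/NB_H3K27AC_AllEnhancers.table.txt'   # hypothetical
    rankByBam = './bams/NB_H3K27AC.bam'                         # hypothetical
    controlBam = './bams/NB_WCE.bam'                            # hypothetical

    enhancerToGene, enhancerToTopGene, geneToEnhancer = mapEnhancerToGeneTop(
        rankByBam, controlBam, 'HG19', annotFile, enhancerFile)

    utils.unParseTable(enhancerToGene, './NB_H3K27AC_ENHANCER_TO_GENE.txt', '\t')
    utils.unParseTable(enhancerToTopGene, './NB_H3K27AC_ENHANCER_TO_TOP_GENE.txt', '\t')
    utils.unParseTable(geneToEnhancer, './NB_H3K27AC_GENE_TO_ENHANCER.txt', '\t')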
def rank_eboxes(nb_all_chip_dataFile,mycn_gff_path,macsFolder,genomeDirectory,window = 100): ''' uses the conserved MYCN sites and ranks eboxes within them by average background subtracted signal searches 100bp (window variable) from mycn summits ''' window = int(window) #bring in the conserved mycn region print('making gff of nb mycn summits') nb_mycn_gff = utils.parseTable(mycn_gff_path,'\t') nb_mycn_collection = utils.gffToLocusCollection(nb_mycn_gff,50) dataDict =pipeline_dfci.loadDataTable(nb_all_chip_dataFile) names_list = [name for name in dataDict.keys() if name.count('MYCN') == 1] names_list.sort() summit_loci = [] #first makes a gff of all summits +/- 100bp for all nb mycn datasets for name in names_list: summit_bed_path = '%s%s/%s_summits.bed' % (macsFolder,name,name) summit_bed = utils.parseTable(summit_bed_path,'\t') for line in summit_bed: summit_locus = utils.Locus(line[0],int(line[1])-window,int(line[2])+window,'.',line[3]) if len(nb_mycn_collection.getOverlap(summit_locus)) > 0: summit_loci.append(summit_locus) summit_collection =utils.LocusCollection(summit_loci,50) summit_merged_collection = summit_collection.stitchCollection() summit_gff = utils.locusCollectionToGFF(summit_merged_collection) summit_gff_path = '%sHG19_NB_MYCN_SUMMITS_-%s_+%s.gff' % (gffFolder,window,window) utils.unParseTable(summit_gff,summit_gff_path,'\t') #this is borrowed from above and maps chip-seq signal to the gff print('mapping to nb mycn summits and making signal dict') gffList = [summit_gff_path] summit_signal_path = pipeline_dfci.map_regions(nb_all_chip_dataFile,gffList) mycnSignalTable = utils.parseTable(summit_signal_path,'\t') #making a signal dictionary for MYCN binding names_list = ['BE2C_MYCN','KELLY_MYCN','NGP_MYCN','SHEP21_0HR_MYCN_NOSPIKE'] background_list = [dataDict[name]['background'] for name in names_list] header = mycnSignalTable[0] chip_columns = [header.index(name) for name in names_list] background_columns = [header.index(background_name) for background_name in background_list] mycn_sig_dict = {} for line in mycnSignalTable[1:]: line_sig = [] for i in range(len(names_list)): line_sig.append(float(line[chip_columns[i]]) - float(line[background_columns[i]])) region_id = line[1] coords = [int(x) for x in line[1].split(':')[-1].split('-')] line_length = coords[1]-coords[0] mycn_sig_dict[region_id] = numpy.mean(line_sig)*line_length #now for each region find the eboxes and then add up the signal print('making ebox ranking') ebox_list = ['CACGTG','CAGTTG','CAAGTG','CAGGTG','CAATTG','CAAATG','CATCTG','CAGCTG','CATGTG','CATATG'] eboxDict = {} for ebox in ebox_list: eboxDict[ebox] = [] ticker = 0 for line in summit_gff: if ticker % 1000 == 0: print(ticker) ticker+=1 chrom = line[0] sense = '.' 
start = int(line[3]) end = int(line[4]) region_id = '%s(%s):%s-%s' % (line[0],line[6],line[3],line[4]) signal = mycn_sig_dict[region_id] sequenceLine = utils.fetchSeq(genomeDirectory,chrom,start,end,True) motifVector = [] matches = re.finditer('CA..TG',str.upper(sequenceLine)) if matches: for match in matches: motifVector.append(match.group()) #count only 1 of each motif type per line #motifVector = utils.uniquify(motifVector) for motif in motifVector: if ebox_list.count(motif) > 0: eboxDict[motif].append(signal) else: eboxDict[utils.revComp(motif)].append(signal) eboxTable =[] eboxTableOrdered =[['EBOX','OCCURENCES','AVG_HEIGHT']] for ebox in eboxDict.keys(): newLine = [ebox,len(eboxDict[ebox]),numpy.mean(eboxDict[ebox])] eboxTable.append(newLine) occurenceOrder = utils.order([line[2] for line in eboxTable],decreasing=True) for x in occurenceOrder: eboxTableOrdered.append(eboxTable[x]) print(eboxTableOrdered) ebox_outfile = '%sHG19_NB_MYCN_CONSERVED_SUMMITS_-%s_+%s_EBOX_RANK.txt' % (tableFolder,window,window) utils.unParseTable(eboxTableOrdered,ebox_outfile,'\t') return ebox_outfile
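# Illustrative sketch (not from the original code) of the E-box matching used in
# rank_eboxes above: scan a sequence for CA..TG hexamers and fold hits on the
# reverse strand onto the canonical orientation with utils.revComp.
def example_count_eboxes(sequence):
    ebox_list = ['CACGTG', 'CAGTTG', 'CAAGTG', 'CAGGTG', 'CAATTG',
                 'CAAATG', 'CATCTG', 'CAGCTG', 'CATGTG', 'CATATG']
    counts = dict((ebox, 0) for ebox in ebox_list)
    for match in re.finditer('CA..TG', str.upper(sequence)):
        motif = match.group()
        if motif in counts:
            counts[motif] += 1
        else:
            flipped = utils.revComp(motif)
            if flipped in counts:
                counts[flipped] += 1
    return counts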
def make_mycn_stats_table(nb_all_chip_dataFile,outFile): ''' making a table of conserved mycn peaks w/ some additional stats mycn and h3k27ac signal is avg. background normalized across 4 samples active tss defined as the union of all H3K27ac occupied promoters in NB active enhancers defined as the union of all H3K27ac sites outside of promoters ''' dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile) print('SETTING UP OUTPUT TABLE') outTable = [['PEAK_ID','CHROM','START','STOP','LENGTH','ACTIVE_TSS_OVERLAP','ENHANCER_OVERLAP','CPG_ISLAND_OVERLAP','CPG_ISLAND_FRACTION','GC_FREQ','MYCN_RANK','AVG_MYCN_SIGNAL','AVG_H3K27AC_SIGNAL','CANON_EBOX_COUNT','NONCANON_EBOX_COUNT','TOTAL_EBOX_COUNT','CANON_EXP','NON_CANON_EXP','GABPA_COUNT','GABPA_EXP','GATA_COUNT','GATA_EXP']] dinuc = nmers(2,['A','T','G','C']) #input files mycnSignalFile = '%sHG19_NB_MYCN_CONSERVED_-0_+0_NB_ALL_SIGNAL.txt' % (signalFolder) h3k27acSignalFile = '%sHG19_NB_MYCN_CONSERVED_-500_+500_NB_ALL_SIGNAL.txt' % (signalFolder) mycnRankFile = '%smeta_rose/NB_MYCN/NB_MYCN_0KB_STITCHED_ENHANCER_REGION_RANK_CONSERVED.txt' % (projectFolder) activeGeneFile = '%sHG19_NB_H3K27AC_ACTIVE_UNION.txt' % (geneListFolder) #note, this is the ucsc hg19 cpg islands extended file #to download and format run ./beds/download_cpg.sh cpgFile = '%sbeds/hg19_cpg_islands.bed' % (projectFolder) enhancerFile = '%smeta_rose/NB_H3K27AC/NB_H3K27AC_AllEnhancers.table.txt' % (projectFolder) print('LOADING MYCN BINDING DATA') mycnSignalTable = utils.parseTable(mycnSignalFile,'\t') #making a signal dictionary for MYCN binding names_list = ['BE2C_MYCN','KELLY_MYCN','NGP_MYCN','SHEP21_0HR_MYCN_NOSPIKE'] background_list = [dataDict[name]['background'] for name in names_list] header = mycnSignalTable[0] chip_columns = [header.index(name) for name in names_list] background_columns = [header.index(background_name) for background_name in background_list] mycn_sig_dict = {} #this only works if the first column are unique identifiers if len(mycnSignalTable) != len(utils.uniquify([line[0] for line in mycnSignalTable])): print('Error: Column 1 of must contain unique identifiers.' % (mycnSignalFile)) sys.exit() for line in mycnSignalTable[1:]: line_sig = [] for i in range(len(names_list)): line_sig.append(float(line[chip_columns[i]]) - float(line[background_columns[i]])) mycn_sig_dict[line[0]] = numpy.mean(line_sig) print('LOADING MYCN RANK DATA') mycnRankTable = utils.parseTable(mycnRankFile,'\t') print('LOADING H3K27AC BINDING DATA') h3k27acSignalTable = utils.parseTable(h3k27acSignalFile,'\t') #making a signal dictionary for background subtracted H3K27ac binding names_list = ['BE2C_H3K27AC','KELLY_H3K27AC','NGP_H3K27AC','SHEP21_0HR_H3K27AC_NOSPIKE'] background_list = [dataDict[name]['background'] for name in names_list] header = h3k27acSignalTable[0] chip_columns = [header.index(name) for name in names_list] background_columns = [header.index(background_name) for background_name in background_list] h3k27ac_sig_dict = {} #this only works if the first column are unique identifiers if len(h3k27acSignalTable) != len(utils.uniquify([line[0] for line in h3k27acSignalTable])): print('Error: Column 1 of must contain unique identifiers.' 
% (h3k27acSignalFile)) sys.exit() for line in h3k27acSignalTable[1:]: line_sig = [] for i in range(len(names_list)): line_sig.append(float(line[chip_columns[i]]) - float(line[background_columns[i]])) h3k27ac_sig_dict[line[0]] = numpy.mean(line_sig) #making the cpg collection print('LOADING CPGS ISLANDS') cpgBed = utils.parseTable(cpgFile,'\t') cpgLoci = [] for line in cpgBed: cpgLoci.append(utils.Locus(line[0],line[1],line[2],'.',line[-1])) cpgCollection = utils.LocusCollection(cpgLoci,50) #next make the tss collection of active promoters print('LOADING ACTIVE PROMOTERS') startDict = utils.makeStartDict(annotFile) activeTable = utils.parseTable(activeGeneFile,'\t') tss_1kb_loci = [] for line in activeTable: tss_1kb_loci.append(utils.makeTSSLocus(line[1],startDict,1000,1000)) tss_1kb_collection = utils.LocusCollection(tss_1kb_loci,50) #enhancer file print("LOADING ACTIVE ENHANCERS") enhancerTable = utils.parseTable(enhancerFile,'\t') print('STARTING WITH THE FOLLOWING NUMBER OF ENHANCERS IN NB') print(len(enhancerTable) - 6) enhancerLoci = [] for line in enhancerTable: if line[0][0] != '#' and line[0][0] != 'R': try: lineLocus = utils.Locus(line[1],int(line[2]),int(line[3]),'.',line[0]) enhancerLoci.append(lineLocus) except IndexError: print(line) sys.exit() enhancerCollection = utils.LocusCollection(enhancerLoci,50) print('CLASSIFYING MYCN PEAKS') ticker = 0 for i in range(1,len(mycnSignalTable)): if ticker%100 == 0: print(ticker) ticker +=1 line = mycnSignalTable[i] mycn_signal = round(mycn_sig_dict[line[0]],4) h3k27ac_signal = round(h3k27ac_sig_dict[line[0]],4) peakID = line[0] locusString = line[1] chrom = locusString.split('(')[0] [start,stop] = [int(x) for x in line[1].split(':')[-1].split('-')] lineLocus = utils.Locus(chrom,start,stop,'.',peakID) tssOverlap = 0 if tss_1kb_collection.getOverlap(lineLocus,'both'): tssOverlap = 1 enhancerOverlap = 0 if enhancerCollection.getOverlap(lineLocus,'both') and tssOverlap == 0: enhancerOverlap = 1 cpgIslandOverlap = 0 if cpgCollection.getOverlap(lineLocus,'both'): cpgIslandOverlap = 1 #now do fractional cpgOverlap overlappingCpGLoci = cpgCollection.getOverlap(lineLocus,'both') overlappingBases = 0 for locus in overlappingCpGLoci: cpgStart = max(locus.start(),lineLocus.start()) cpgEnd = min(locus.end(),lineLocus.end()) overlappingBases += (cpgEnd-cpgStart) overlapFraction = round(float(overlappingBases)/lineLocus.len(),2) #now get the seq lineSeq = string.upper(utils.fetchSeq(genomeDirectory,chrom,start,stop,True)) gcFreq = round(float(lineSeq.count('GC') + lineSeq.count('CG'))/len(lineSeq),2) dinuc_dict = {} for nmer in dinuc: dinuc_dict[nmer] = float(lineSeq.count('GC'))/len(lineSeq) mycnRankLine = mycnRankTable[i] mycnRank = numpy.mean([float(x) for x in mycnRankLine[6:]]) canonMatchList = re.findall('CACGTG',lineSeq) canon_count = len(canonMatchList) eboxMatchList = re.findall('CA..TG',lineSeq) ebox_count = len(eboxMatchList) non_canon_count = ebox_count-canon_count #get the expected values canon_exp = dinuc_dict['CA']*dinuc_dict['CG']*dinuc_dict['TG']*(len(lineSeq) - 5) canon_exp = round(canon_exp,2) notCG = 1- dinuc_dict['CG'] non_exp = dinuc_dict['CA']*notCG*dinuc_dict['TG']*(len(lineSeq) - 5) non_exp = round(non_exp,2) #for gata and GABPA gabpaMatchList = re.findall('CGGAAG',lineSeq) + re.findall('CTTCCG',lineSeq) gabpa_count = len(gabpaMatchList) gabpa_exp_f = dinuc_dict['CG'] * dinuc_dict['GA'] * dinuc_dict['AG']*(len(lineSeq) - 5) gabpa_exp_r = dinuc_dict['CT'] * dinuc_dict['TC'] * dinuc_dict['CG']*(len(lineSeq) - 5) gabpa_exp = 
round(gabpa_exp_f,2) + round(gabpa_exp_r,2) gataMatchList = re.findall('GATAA',lineSeq) + re.findall('TTATC',lineSeq) gata_count = len(gataMatchList) an_freq = 1 - dinuc_dict['AA'] - dinuc_dict['AT'] - dinuc_dict['AG'] -dinuc_dict['AC'] cn_freq = 1 - dinuc_dict['CA'] - dinuc_dict['CT'] - dinuc_dict['CG'] -dinuc_dict['CC'] gata_exp_f = dinuc_dict['GA'] * dinuc_dict['TA'] * an_freq*(len(lineSeq) - 5) gata_exp_r = dinuc_dict['TT'] * dinuc_dict['AT'] * cn_freq*(len(lineSeq) - 5) gata_exp = round(gata_exp_f,2) + round(gata_exp_r,2) newLine = [peakID,chrom,start,stop,lineLocus.len(),tssOverlap,enhancerOverlap,cpgIslandOverlap,overlapFraction,gcFreq,mycnRank,mycn_signal,h3k27ac_signal,canon_count,non_canon_count,ebox_count,canon_exp,non_exp,gabpa_count,gabpa_exp,gata_count,gata_exp] outTable.append(newLine) utils.unParseTable(outTable,outFile,'\t') return outFile
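# Sketch (illustrative, written against the intent of the formulas above) of the
# per-dinucleotide frequency table the expected-motif counts draw on: the
# expected canonical E-box count is freq(CA)*freq(CG)*freq(TG)*(len-5), which
# needs each dinucleotide counted separately, whereas the loop in
# make_mycn_stats_table assigns lineSeq.count('GC') to every key.
def example_dinuc_frequencies(lineSeq):
    dinuc = nmers(2, ['A', 'T', 'G', 'C'])
    dinuc_dict = {}
    for nmer in dinuc:
        dinuc_dict[nmer] = float(lineSeq.count(nmer)) / len(lineSeq)
    canon_exp = dinuc_dict['CA'] * dinuc_dict['CG'] * dinuc_dict['TG'] * (len(lineSeq) - 5)
    return dinuc_dict, round(canon_exp, 2)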
def main(): print('main analysis for MYCN project') print('changing directory to project folder') os.chdir(projectFolder) print('\n\n') print( '#======================================================================' ) print( '#======================I, LOADING DATA ANNOTATION======================' ) print( '#======================================================================' ) print('\n\n') #This section sanity checks each data table and makes sure both bam and .bai files are accessible #for ChIP-Seq #these are the datasets we will use pipeline_dfci.summary(shep21_dataFile) print('\n\n') print( '#======================================================================' ) print( '#================II. RUNNING DIFFERENTIAL ROSE ANALYSIS================' ) print( '#======================================================================' ) print('\n\n') #use the dynamic rose tools to first map twist1 binding sites #and then quantify name1 = 'SHEP21_0HR_TWIST' name2 = 'SHEP21_24HR_B_TWIST' analysis_name = 'SHEP21_TWIST1' rank_gff_path = wrapDRose(shep21_dataFile, name1, name2, analysis_name) print('\n\n') print( '#======================================================================' ) print( '#=================III. MAPPING MYCN DATA TO RANK GFF===================' ) print( '#======================================================================' ) print('\n\n') #for shep21 nospike gffList = [rank_gff_path] dataDict = pipeline_dfci.loadDataTable(shep21_dataFile) names_list = [ name for name in dataDict.keys() if name.count('MYCN') == 1 or name.count('INPUT') == 1 or name.count('TWIST') == 1 and name.count('rep2') == 0 ] print(names_list) #map_regions(shep21_dataFile,gffList,names_list) gffList = ['%smacsEnriched/SHEP21_0HR_TWIST_peaks.bed' % (projectFolder)] #map_regions(shep21_dataFile,gffList,names_list) #make a gff of twist and mycn sites at 0hr twist_collection = utils.importBoundRegion( '%smacsEnriched/SHEP21_0HR_TWIST_peaks.bed' % (projectFolder), 'SHEP21_0HR_TWIST') mycn_collection = utils.importBoundRegion( '%smacsEnriched/SHEP21_0HR_MYCN_NOSPIKE_peaks.bed' % (projectFolder), 'SHEP21_0HR_MYCN_NOSPIKE') all_loci = twist_collection.getLoci() + mycn_collection.getLoci() all_collection = utils.LocusCollection(all_loci, 50) stitched_collection = all_collection.stitchCollection() stitched_loci = stitched_collection.getLoci() overlap_loci = [] for locus in stitched_loci: if len(twist_collection.getOverlap(locus, 'both')) > 0 and len( mycn_collection.getOverlap(locus, 'both')) > 0: overlap_loci.append(locus) overlap_collection = utils.LocusCollection(overlap_loci, 50) overlap_gff = utils.locusCollectionToGFF(overlap_collection) overlap_gff_path = '%sHG19_SHEP21_0HR_TWIST_MYCN_INTERSECTION_-0_+0.gff' % ( gffFolder) utils.unParseTable(overlap_gff, overlap_gff_path, '\t') gffList = [overlap_gff_path] map_regions(shep21_dataFile, gffList, names_list)
def main(): ''' main run call ''' debug = False from optparse import OptionParser usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]" parser = OptionParser(usage=usage) # required flags parser.add_option( "-i", "--i", dest="input", nargs=1, default=None, help="Enter a .gff or .bed file of binding sites used to make enhancers" ) parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None, help="bamfile to rank enhancer by") parser.add_option("-o", "--out", dest="out", nargs=1, default=None, help="Enter an output folder") parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None, help="Enter the genome build (MM9,MM8,HG18,HG19)") # optional flags parser.add_option( "-b", "--bams", dest="bams", nargs=1, default=None, help="Enter a comma separated list of additional bam files to map to") parser.add_option("-c", "--control", dest="control", nargs=1, default=None, help="bamfile to rank enhancer by") parser.add_option( "-s", "--stitch", dest="stitch", nargs=1, default='', help= "Enter a max linking distance for stitching. Default will determine optimal stitching parameter" ) parser.add_option( "-t", "--tss", dest="tss", nargs=1, default=0, help="Enter a distance from TSS to exclude. 0 = no TSS exclusion") parser.add_option( "--mask", dest="mask", nargs=1, default=None, help= "Mask a set of regions from analysis. Provide a .bed or .gff of masking regions" ) # RETRIEVING FLAGS (options, args) = parser.parse_args() if not options.input or not options.rankby or not options.out or not options.genome: print('hi there') parser.print_help() exit() # making the out folder if it doesn't exist outFolder = utils.formatFolder(options.out, True) # figuring out folder schema gffFolder = utils.formatFolder(outFolder + 'gff/', True) mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True) # GETTING INPUT FILE if options.input.split('.')[-1] == 'bed': # CONVERTING A BED TO GFF inputGFFName = options.input.split('/')[-1][0:-4] inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName) utils.bedToGFF(options.input, inputGFFFile) elif options.input.split('.')[-1] == 'gff': # COPY THE INPUT GFF TO THE GFF FOLDER inputGFFFile = options.input os.system('cp %s %s' % (inputGFFFile, gffFolder)) else: print( 'WARNING: INPUT FILE DOES NOT END IN .gff or .bed. 
ASSUMING .gff FILE FORMAT' ) # COPY THE INPUT GFF TO THE GFF FOLDER inputGFFFile = options.input os.system('cp %s %s' % (inputGFFFile, gffFolder)) # GETTING THE LIST OF BAMFILES TO PROCESS if options.control: bamFileList = [options.rankby, options.control] else: bamFileList = [options.rankby] if options.bams: bamFileList += options.bams.split(',') #bamFileList = utils.uniquify(bamFileList) # makes sad when you have the same control bam over and over again # optional args # Stitch parameter if options.stitch == '': stitchWindow = '' else: stitchWindow = int(options.stitch) # tss options tssWindow = int(options.tss) if tssWindow != 0: removeTSS = True else: removeTSS = False # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS print('USING %s AS THE INPUT GFF' % (inputGFFFile)) inputName = inputGFFFile.split('/')[-1].split('.')[0] # GETTING THE GENOME genome = options.genome print('USING %s AS THE GENOME' % genome) # GETTING THE CORRECT ANNOT FILE cwd = os.getcwd() genomeDict = { 'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd), 'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd), 'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd), 'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd), 'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd), 'RN4': '%s/annotation/rn4_refseq.ucsc' % (cwd), 'RN6': '%s/annotation/rn6_refseq.ucsc' % (cwd), } annotFile = genomeDict[genome.upper()] # MAKING THE START DICT print('MAKING START DICT') startDict = utils.makeStartDict(annotFile) #GET CHROMS FOUND IN THE BAMS print('GETTING CHROMS IN BAMFILES') bamChromList = getBamChromList(bamFileList) print("USING THE FOLLOWING CHROMS") print(bamChromList) #LOADING IN THE GFF AND FILTERING BY CHROM print('LOADING AND FILTERING THE GFF') inputGFF = filterGFF(inputGFFFile, bamChromList) # LOADING IN THE BOUND REGION REFERENCE COLLECTION print('LOADING IN GFF REGIONS') referenceCollection = utils.gffToLocusCollection(inputGFF) print('CHECKING REFERENCE COLLECTION:') checkRefCollection(referenceCollection) # MASKING REFERENCE COLLECTION # see if there's a mask if options.mask: maskFile = options.mask # if it's a bed file if maskFile.split('.')[-1].upper() == 'BED': maskGFF = utils.bedToGFF(maskFile) elif maskFile.split('.')[-1].upper() == 'GFF': maskGFF = utils.parseTable(maskFile, '\t') else: print("MASK MUST BE A .gff or .bed FILE") sys.exit() maskCollection = utils.gffToLocusCollection(maskGFF) # now mask the reference loci referenceLoci = referenceCollection.getLoci() filteredLoci = [ locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0 ] print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile)) referenceCollection = utils.LocusCollection(filteredLoci, 50) # NOW STITCH REGIONS print('STITCHING REGIONS TOGETHER') stitchedCollection, debugOutput, stitchWindow = regionStitching( referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS) # NOW MAKE A STITCHED COLLECTION GFF print('MAKING GFF FROM STITCHED COLLECTION') stitchedGFF = utils.locusCollectionToGFF(stitchedCollection) # making sure start/stop ordering are correct for i in range(len(stitchedGFF)): line = stitchedGFF[i] start = int(line[3]) stop = int(line[4]) if start > stop: line[3] = stop line[4] = start print(stitchWindow) print(type(stitchWindow)) if not removeTSS: stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, str(stitchWindow / 1000)) stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow / 1000)) debugOutFile = 
'%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, str(stitchWindow / 1000)) else: stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % ( gffFolder, inputName, str(stitchWindow / 1000)) stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % ( inputName, str(stitchWindow / 1000)) debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % ( gffFolder, inputName, str(stitchWindow / 1000)) # WRITING DEBUG OUTPUT TO DISK if debug: print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile)) utils.unParseTable(debugOutput, debugOutFile, '\t') # WRITE THE GFF TO DISK print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile)) utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t') # SETTING UP THE OVERALL OUTPUT FILE outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt' print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1)) # MAPPING TO THE NON STITCHED (ORIGINAL GFF) # MAPPING TO THE STITCHED GFF # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail. bamliquidator_path = 'bamliquidator_batch.py' bamFileListUnique = list(bamFileList) bamFileListUnique = utils.uniquify(bamFileListUnique) #prevent redundant mapping print("MAPPING TO THE FOLLOWING BAMS:") print(bamFileListUnique) for bamFile in bamFileListUnique: bamFileName = bamFile.split('/')[-1] # MAPPING TO THE STITCHED GFF mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName) mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % ( mappedFolder, stitchedGFFName, bamFileName) if utils.checkOutput(mappedOut1File, 0.2, 0.2): print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File)) else: cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % ( stitchedGFFFile, mappedOut1Folder, bamFile) print(cmd1) os.system(cmd1) if utils.checkOutput(mappedOut1File, 0.2, 5): print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName)) else: print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName)) sys.exit() print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS') # CALCULATE DENSITY BY REGION # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName) print('CALLING AND PLOTTING SUPER-ENHANCERS') if options.control: rankbyName = options.rankby.split('/')[-1] controlName = options.control.split('/')[-1] cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % ( outFolder, outputFile1, inputName, controlName) else: rankbyName = options.rankby.split('/')[-1] controlName = 'NONE' cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % ( outFolder, outputFile1, inputName, controlName) print(cmd) os.system(cmd) # calling the gene mapper time.sleep(20) superTableFile = "%s_SuperEnhancers.table.txt" % (inputName) if options.control: cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s &" % ( genome, options.rankby, options.control, outFolder, superTableFile) else: cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s &" % ( genome, options.rankby, outFolder, superTableFile) os.system(cmd) stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName) if options.control: cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s &" % ( genome, options.rankby, options.control, outFolder, stretchTableFile) else: cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s &" % ( genome, options.rankby, outFolder, stretchTableFile) os.system(cmd) 
superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName) if options.control: cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s &" % ( genome, options.rankby, options.control, outFolder, superStretchTableFile) else: cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s &" % ( genome, options.rankby, outFolder, superStretchTableFile) os.system(cmd)
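# A minimal sketch of the .bed -> .gff conversion step used when the input file is a bed.
# The exact column handling of utils.bedToGFF is an assumption here; this just illustrates the
# usual mapping (BED is 0-based half-open, the GFF rows in this pipeline are 1-based inclusive
# with the region ID repeated in columns 2, 3, and 9).
def bed_to_gff_sketch(bed_lines):
    """bed_lines: list of [chrom, start, stop, (name), ...] rows."""
    gff = []
    for i, line in enumerate(bed_lines):
        chrom = line[0]
        start = int(line[1]) + 1          # shift to 1-based
        stop = int(line[2])
        name = line[3] if len(line) > 3 else 'region_%s' % (i + 1)
        gff.append([chrom, name, name, start, stop, '', '.', '', name])
    return gff

# bed_to_gff_sketch([['chr1', '999', '2000', 'peak_1']])
# -> [['chr1', 'peak_1', 'peak_1', 1000, 2000, '', '.', '', 'peak_1']]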
# From networks already constructed from CRC2.py node_file = '/crusader/projects/cll/final/network/lines/zinba/' + projectName + '/' + projectName + '_NODELIST.txt' node_table = utils.parseTable(node_file, '\t') nodelist = [x[0] for x in node_table] print nodelist super_enhancer_file = '/crusader/projects/cll/final/rose/' + projectName + '_H3K27ac/' + projectName + '_H3K27ac_peaks_SuperEnhancers.table.txt' se_table = utils.parseTable(super_enhancer_file, '\t') subpeak_file = '/crusader/projects/cll/final/zinba/lines/MEC1_ATAC/MEC1_ATAC.peaks.bed' subpeak_table = utils.parseTable(subpeak_file, '\t') subpeak_loci = [] for line in subpeak_table: subpeak_loci.append(utils.Locus(line[0], line[1], line[2], '.')) subpeak_collection = utils.LocusCollection(subpeak_loci, 100) subpeak_dict = {} # key is enhancer ID, points to a list of loci # assign subpeak Loci to each super enhancer fasta = [] se_namelist = [] for line in se_table[6:]: se_id = line[0] se_namelist.append(se_id) subpeak_dict[se_id] = [] se_locus = utils.Locus(line[1], line[2], line[3], '.') overlaps = subpeak_collection.getOverlap(se_locus) for overlap in overlaps: subpeak_dict[se_id].append(overlap) # collect the overlapping subpeak loci under this enhancer ID
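# Sketch of the subpeak-to-enhancer assignment above, with a brute-force overlap test
# standing in for utils.LocusCollection.getOverlap. Region tuples are (chrom, start, stop);
# the coordinates in the usage example are made up.
from collections import defaultdict

def loci_overlap(a, b):
    return a[0] == b[0] and int(a[1]) < int(b[2]) and int(b[1]) < int(a[2])

def assign_subpeaks(se_regions, subpeaks):
    """se_regions: dict of enhancer ID -> (chrom, start, stop)."""
    assignment = defaultdict(list)
    for se_id, se_region in se_regions.items():
        for subpeak in subpeaks:
            if loci_overlap(se_region, subpeak):
                assignment[se_id].append(subpeak)
    return assignment

# assign_subpeaks({'SE_1': ('chr1', 1000, 5000)},
#                 [('chr1', 1200, 1500), ('chr1', 9000, 9500)])
# -> {'SE_1': [('chr1', 1200, 1500)]}  (the second peak does not overlap)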
def regionStitching(referenceCollection, name, outFolder, stitchWindow, tssWindow, annotFile, removeTSS=True): print('PERFORMING REGION STITCHING') # first have to turn bound region file into a locus collection # need to make sure this names correctly... each region should have a unique name #referenceCollection debugOutput = [] # filter out all bound regions that overlap the TSS of an ACTIVE GENE if removeTSS: print('REMOVING TSS FROM REGIONS USING AN EXCLUSION WINDOW OF %sBP' % (tssWindow)) # first make a locus collection of TSS startDict = utils.makeStartDict(annotFile) # now makeTSS loci for active genes removeTicker = 0 # this loop makes a locus centered around +/- tssWindow of transcribed genes # then adds it to the list tssLoci tssLoci = [] for geneID in startDict.keys(): tssLoci.append( utils.makeTSSLocus(geneID, startDict, tssWindow, tssWindow)) # this turns the tssLoci list into a LocusCollection # 50 is the internal parameter for LocusCollection and doesn't really matter tssCollection = utils.LocusCollection(tssLoci, 50) # gives all the loci in referenceCollection boundLoci = referenceCollection.getLoci() # this loop will check if each bound region is contained by the TSS exclusion zone # this will drop out a lot of the promoter only regions that are tiny # typical exclusion window is around 2kb for locus in boundLoci: if len(tssCollection.getContainers(locus, 'both')) > 0: # if true, the bound locus overlaps an active gene referenceCollection.remove(locus) debugOutput.append([locus.__str__(), locus.ID(), 'CONTAINED']) removeTicker += 1 print('REMOVED %s LOCI BECAUSE THEY WERE CONTAINED BY A TSS' % (removeTicker)) # referenceCollection is now all enriched region loci that don't overlap an active TSS if stitchWindow == '': print('DETERMINING OPTIMUM STITCHING PARAMTER') optCollection = copy.deepcopy(referenceCollection) stitchWindow = optimizeStitching(optCollection, name, outFolder, stepSize=500) print('USING A STITCHING PARAMETER OF %s' % stitchWindow) stitchedCollection = referenceCollection.stitchCollection( stitchWindow, 'both') if removeTSS: # now replace any stitched region that overlap 2 distinct genes # with the original loci that were there fixedLoci = [] tssLoci = [] for geneID in startDict.keys(): tssLoci.append(utils.makeTSSLocus(geneID, startDict, 50, 50)) # this turns the tssLoci list into a LocusCollection # 50 is the internal parameter for LocusCollection and doesn't really matter tssCollection = utils.LocusCollection(tssLoci, 50) removeTicker = 0 originalTicker = 0 for stitchedLocus in stitchedCollection.getLoci(): overlappingTSSLoci = tssCollection.getOverlap( stitchedLocus, 'both') tssNames = [ startDict[tssLocus.ID()]['name'] for tssLocus in overlappingTSSLoci ] tssNames = utils.uniquify(tssNames) if len(tssNames) > 2: # stitchedCollection.remove(stitchedLocus) originalLoci = referenceCollection.getOverlap( stitchedLocus, 'both') originalTicker += len(originalLoci) fixedLoci += originalLoci debugOutput.append([ stitchedLocus.__str__(), stitchedLocus.ID(), 'MULTIPLE_TSS' ]) removeTicker += 1 else: fixedLoci.append(stitchedLocus) print( 'REMOVED %s STITCHED LOCI BECAUSE THEY OVERLAPPED MULTIPLE TSSs' % (removeTicker)) print('ADDED BACK %s ORIGINAL LOCI' % (originalTicker)) fixedCollection = utils.LocusCollection(fixedLoci, 50) return fixedCollection, debugOutput, stitchWindow else: return stitchedCollection, debugOutput, stitchWindow
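# Standalone sketch of the stitching step performed by
# referenceCollection.stitchCollection(stitchWindow, 'both'): sort the regions and merge any
# neighbors separated by <= stitchWindow. This approximates the utils behavior for
# illustration; it is not the pipeline's implementation.
def stitch_regions(regions, stitch_window):
    """regions: list of (chrom, start, stop) tuples."""
    stitched = []
    for chrom, start, stop in sorted(regions):
        if stitched and stitched[-1][0] == chrom and start - stitched[-1][2] <= stitch_window:
            prev = stitched[-1]
            stitched[-1] = (chrom, prev[1], max(prev[2], stop))  # extend the previous region
        else:
            stitched.append((chrom, start, stop))
    return stitched

# stitch_regions([('chr1', 100, 200), ('chr1', 250, 400)], 12500)
# -> [('chr1', 100, 400)]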
def makePeakTable(paramDict, splitGFFPath, averageTablePath, startDict, geneList, genomeDirectory, tads_path=''): ''' makes the final peak table with ebox info ''' peakTable = [[ 'REGION_ID', 'CHROM', 'START', 'STOP', 'LENGTH', 'TSS', 'CPG', 'CPG_FRACTION', 'GC_FREQ', 'SIGNAL', 'CANON_EBOX_COUNT', 'NON_CANON_EBOX_COUNT', 'TOTAL_EBOX_COUNT', 'OVERLAPPING_GENES', 'PROXIMAL_GENES' ]] print('LOADING PEAK REGIONS') peakGFF = utils.parseTable(splitGFFPath, '\t') print('LOADING BINDING DATA') signalTable = utils.parseTable(averageTablePath, '\t') print('LOADING CPGS ISLANDS') cpgBed = utils.parseTable(paramDict['cpgPath'], '\t') cpgLoci = [] for line in cpgBed: cpgLoci.append(utils.Locus(line[0], line[1], line[2], '.', line[-1])) cpgCollection = utils.LocusCollection(cpgLoci, 50) print("MAKING TSS COLLECTIONS") if len(geneList) == 0: geneList = startDict.keys() tss_1kb_loci = [] tss_50kb_loci = [] for refID in geneList: tss_1kb_loci.append(utils.makeTSSLocus(refID, startDict, 1000, 1000)) tss_50kb_loci.append(utils.makeTSSLocus(refID, startDict, 50000, 50000)) #make a 1kb flanking and 50kb flanking collection tss_1kb_collection = utils.LocusCollection(tss_1kb_loci, 50) tss_50kb_collection = utils.LocusCollection(tss_50kb_loci, 50) if len(tads_path) > 0: print('LOADING TADS FROM %s' % (tads_path)) tad_collection = utils.importBoundRegion(tads_path, 'tad') use_tads = True #building a tad dict keyed by tad ID w/ genes in that tad provided tad_dict = defaultdict(list) for tss_locus in tss_1kb_loci: overlapping_tads = tad_collection.getOverlap(tss_locus, 'both') for tad_locus in overlapping_tads: tad_dict[tad_locus.ID()].append(tss_locus.ID()) else: use_tads = False print('CLASSIFYING PEAKS') ticker = 0 no_tad_count = 0 for i in range(len(peakGFF)): if ticker % 1000 == 0: print(ticker) ticker += 1 #getting the particulars of the region gffLine = peakGFF[i] peakID = gffLine[1] chrom = gffLine[0] start = int(gffLine[3]) stop = int(gffLine[4]) lineLocus = utils.Locus(chrom, start, stop, '.', peakID) #getting the mapped signal signalLine = signalTable[(i + 1)] signalVector = [float(x) for x in signalLine[2:]] #setting up the new line newLine = [peakID, chrom, start, stop, lineLocus.len()] #get the tss status from the gff itself (we are able to do this nicely from the split gff code earlier newLine.append(gffLine[7]) #check cpg status if cpgCollection.getOverlap(lineLocus, 'both'): newLine.append(1) else: newLine.append(0) #now do fractional cpgOverlap overlappingCpGLoci = cpgCollection.getOverlap(lineLocus, 'both') overlappingBases = 0 for locus in overlappingCpGLoci: cpgStart = max(locus.start(), lineLocus.start()) cpgEnd = min(locus.end(), lineLocus.end()) overlappingBases += (cpgEnd - cpgStart) overlapFraction = float(overlappingBases) / lineLocus.len() newLine.append(round(overlapFraction, 2)) #now get the seq lineSeq = string.upper( utils.fetchSeq(genomeDirectory, chrom, start, stop, True)) if len(lineSeq) == 0: print('UH OH') print(lineSeq) print(gffLine) print(i) print(chrom) print(start) print(stop) sys.exit() gcFreq = float(lineSeq.count('GC') + lineSeq.count('CG')) / len(lineSeq) newLine.append(gcFreq) #this is where we add the ChIP-Seq signal newLine += signalVector eboxMatchList = re.findall('CA..TG', lineSeq) if len(eboxMatchList) == 0: newLine += [0] * 3 else: totalCount = len(eboxMatchList) canonCount = eboxMatchList.count('CACGTG') otherCount = totalCount - canonCount newLine += [canonCount, otherCount, totalCount] #now find the overlapping and proximal genes #here each overlapping gene 
has a tss 1kb locus that overlaps the peak if use_tads: tad_loci = tad_collection.getOverlap(lineLocus, 'both') tad_id_list = [tad_locus.ID() for tad_locus in tad_loci] tad_genes = [] for tad_id in tad_id_list: tad_genes += tad_dict[tad_id] if len(tad_genes) == 0: #print('no tad for this region') #print(gffLine) no_tad_count += 1 else: tad_genes = [] if len(tad_genes) > 0: overlappingGenes = [ startDict[locus.ID()]['name'] for locus in tss_1kb_collection.getOverlap(lineLocus, 'both') if tad_genes.count(locus.ID()) > 0 ] proximalGenes = [ startDict[locus.ID()]['name'] for locus in tss_50kb_collection.getOverlap(lineLocus, 'both') if tad_genes.count(locus.ID()) > 0 ] # print('linked peak to tad genes') # print([startDict[x]['name'] for x in tad_genes]) # print(tad_id_list) # print(gffLine) # print(overlappingGenes) # print(proximalGenes) else: overlappingGenes = [ startDict[locus.ID()]['name'] for locus in tss_1kb_collection.getOverlap(lineLocus, 'both') ] proximalGenes = [ startDict[locus.ID()]['name'] for locus in tss_50kb_collection.getOverlap(lineLocus, 'both') ] overlappingGenes = utils.uniquify(overlappingGenes) #here the tss 50kb locus overlaps the peak #overlap takes priority over proximal proximalGenes = [ gene for gene in proximalGenes if overlappingGenes.count(gene) == 0 ] proximalGenes = utils.uniquify(proximalGenes) overlappingString = string.join(overlappingGenes, ',') proximalString = string.join(proximalGenes, ',') newLine += [overlappingString, proximalString] peakTable.append(newLine) print('Out of %s regions, %s could not be assigned to a tad' % (len(peakTable) - 1, no_tad_count)) return peakTable
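# Sketch of the E-box tallying used in the peak table above: 'CA..TG' matches any E-box and
# 'CACGTG' is counted as canonical; everything else is non-canonical. Note that re.findall
# returns non-overlapping matches only, matching the behavior of the code above.
import re

def count_eboxes(seq):
    matches = re.findall('CA..TG', seq.upper())
    canon = matches.count('CACGTG')
    return canon, len(matches) - canon, len(matches)

# count_eboxes('ttCACGTGaaCATTTGgg') -> (1, 1, 2)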
def main(): ''' main run call ''' debug = False from optparse import OptionParser usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]" parser = OptionParser(usage=usage) # required flags parser.add_option( "-i", "--i", dest="input", nargs=1, default=None, help= "Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers" ) parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None, help="Enter a comma separated list of bams to rank by") parser.add_option("-o", "--out", dest="out", nargs=1, default=None, help="Enter an output folder") parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None, help="Enter the genome build (MM9,MM8,HG18,HG19)") # optional flags parser.add_option( "-n", "--name", dest="name", nargs=1, default=None, help="Provide a name for the analysis otherwise ROSE will guess") parser.add_option( "-c", "--control", dest="control", nargs=1, default=None, help= "Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam" ) parser.add_option( "-s", "--stitch", dest="stitch", nargs=1, default='', help= "Enter a max linking distance for stitching. Default will determine optimal stitching parameter" ) parser.add_option( "-t", "--tss", dest="tss", nargs=1, default=0, help="Enter a distance from TSS to exclude. 0 = no TSS exclusion") parser.add_option( "--mask", dest="mask", nargs=1, default=None, help= "Mask a set of regions from analysis. Provide a .bed or .gff of masking regions" ) # RETRIEVING FLAGS (options, args) = parser.parse_args() if not options.input or not options.rankby or not options.out or not options.genome: print('hi there') parser.print_help() exit() # making the out folder if it doesn't exist outFolder = utils.formatFolder(options.out, True) # figuring out folder schema gffFolder = utils.formatFolder(outFolder + 'gff/', True) mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True) # GETTING INPUT FILE(s) inputList = [ inputFile for inputFile in options.input.split(',') if len(inputFile) > 1 ] #converting all input files into GFFs and moving into the GFF folder inputGFFList = [] for inputFile in inputList: if inputFile.split('.')[-1] == 'bed': # CONVERTING A BED TO GFF inputGFFName = inputFile.split('/')[-1][ 0:-4] #strips the last 4 characters i.e. '.bed' inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName) utils.bedToGFF(inputFile, inputGFFFile) elif options.input.split('.')[-1] == 'gff': # COPY THE INPUT GFF TO THE GFF FOLDER os.system('cp %s %s' % (inputFile, gffFolder)) inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1]) else: print( 'WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT' ) # COPY THE INPUT GFF TO THE GFF FOLDER os.system('cp %s %s' % (inputFile, gffFolder)) inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1]) inputGFFList.append(inputGFFFile) # GETTING THE LIST OF BAMFILES TO PROCESS #either same number of bams for rankby and control #or only 1 control #or none! 
#bamlist should be all rankby bams followed by control bams bamFileList = [] if options.control: controlBamList = [ bam for bam in options.control.split(',') if len(bam) > 0 ] rankbyBamList = [ bam for bam in options.rankby.split(',') if len(bam) > 0 ] if len(controlBamList) == len(rankbyBamList): #case where an equal number of backgrounds are given bamFileList = rankbyBamList + controlBamList elif len(controlBamList) == 1: #case where a universal background is applied bamFileList = rankbyBamList + controlBamList * len(rankbyBamList) else: print( 'ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE' ) sys.exit() else: bamFileList = [ bam for bam in options.rankby.split(',') if len(bam) > 0 ] # Stitch parameter if options.stitch == '': stitchWindow = '' else: stitchWindow = int(options.stitch) # tss options tssWindow = int(options.tss) if tssWindow != 0: removeTSS = True else: removeTSS = False # GETTING THE GENOME genome = string.upper(options.genome) print('USING %s AS THE GENOME' % (genome)) # GETTING THE CORRECT ANNOT FILE genomeDict = { 'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir), 'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir), 'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir), 'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir), 'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir), 'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir), } try: annotFile = genomeDict[genome.upper()] except KeyError: print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome)) sys.exit() #FINDING THE ANALYSIS NAME if options.name: inputName = options.name else: inputName = inputGFFList[0].split('/')[-1].split('.')[0] print('USING %s AS THE ANALYSIS NAME' % (inputName)) print('FORMATTING INPUT REGIONS') # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs #use a simpler unique region naming system if len(inputGFFList) == 1: inputGFF = utils.parseTable(inputGFFList[0], '\t') else: inputLoci = [] for gffFile in inputGFFList: print('\tprocessing %s' % (gffFile)) gff = utils.parseTable(gffFile, '\t') gffCollection = utils.gffToLocusCollection(gff, 50) inputLoci += gffCollection.getLoci() inputCollection = utils.LocusCollection(inputLoci, 50) inputCollection = inputCollection.stitchCollection( ) # stitches to produce unique regions inputGFF = utils.locusCollectionToGFF(inputCollection) formattedGFF = [] #now number things appropriately for i, line in enumerate(inputGFF): #use the coordinates to make a new id inputname_chr_sense_start_stop chrom = line[0] coords = [int(line[3]), int(line[4])] sense = line[6] lineID = '%s_%s' % (inputName, str(i + 1)) #1 indexing newLine = [ chrom, lineID, lineID, min(coords), max(coords), '', sense, '', lineID ] formattedGFF.append(newLine) #name of the master input gff file masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder, string.upper(genome), inputName) utils.unParseTable(formattedGFF, masterGFFFile, '\t') print('USING %s AS THE INPUT GFF' % (masterGFFFile)) # MAKING THE START DICT print('MAKING START DICT') startDict = utils.makeStartDict(annotFile) #GET CHROMS FOUND IN THE BAMS print('GETTING CHROMS IN BAMFILES') bamChromList = getBamChromList(bamFileList) print("USING THE FOLLOWING CHROMS") print(bamChromList) #LOADING IN THE GFF AND FILTERING BY CHROM print('LOADING AND FILTERING THE GFF') inputGFF = filterGFF(masterGFFFile, bamChromList) # LOADING IN THE BOUND REGION REFERENCE COLLECTION print('LOADING IN GFF REGIONS') referenceCollection = utils.gffToLocusCollection(inputGFF) print('CHECKING 
REFERENCE COLLECTION:') checkRefCollection(referenceCollection) # MASKING REFERENCE COLLECTION # see if there's a mask if options.mask: maskFile = options.mask # if it's a bed file if maskFile.split('.')[-1].upper() == 'BED': maskGFF = utils.bedToGFF(maskFile) elif maskFile.split('.')[-1].upper() == 'GFF': maskGFF = utils.parseTable(maskFile, '\t') else: print("MASK MUST BE A .gff or .bed FILE") sys.exit() maskCollection = utils.gffToLocusCollection(maskGFF) # now mask the reference loci referenceLoci = referenceCollection.getLoci() filteredLoci = [ locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0 ] print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile)) referenceCollection = utils.LocusCollection(filteredLoci, 50) # NOW STITCH REGIONS print('STITCHING REGIONS TOGETHER') stitchedCollection, debugOutput, stitchWindow = regionStitching( referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS) # NOW MAKE A STITCHED COLLECTION GFF print('MAKING GFF FROM STITCHED COLLECTION') stitchedGFF = utils.locusCollectionToGFF(stitchedCollection) print(stitchWindow) print(type(stitchWindow)) if not removeTSS: stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, str(stitchWindow / 1000)) stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow / 1000)) debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, str(stitchWindow / 1000)) else: stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % ( gffFolder, inputName, str(stitchWindow / 1000)) stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % ( inputName, str(stitchWindow / 1000)) debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % ( gffFolder, inputName, str(stitchWindow / 1000)) # WRITING DEBUG OUTPUT TO DISK if debug: print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile)) utils.unParseTable(debugOutput, debugOutFile, '\t') # WRITE THE GFF TO DISK print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile)) utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t') # SETTING UP THE OVERALL OUTPUT FILE outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt' print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1)) # MAPPING TO THE NON STITCHED (ORIGINAL GFF) # MAPPING TO THE STITCHED GFF # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail. bamFileListUnique = list(bamFileList) bamFileListUnique = utils.uniquify(bamFileListUnique) #prevent redundant mapping print("MAPPING TO THE FOLLOWING BAMS:") print(bamFileListUnique) for bamFile in bamFileListUnique: bamFileName = bamFile.split('/')[-1] # MAPPING TO THE STITCHED GFF mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName) mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % ( mappedFolder, stitchedGFFName, bamFileName) if utils.checkOutput(mappedOut1File, 0.2, 0.2): print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File)) else: cmd1 = bamliquidator_path + " --sense . 
-e 200 --match_bamToGFF -r %s -o %s %s" % ( stitchedGFFFile, mappedOut1Folder, bamFile) print(cmd1) os.system(cmd1) if utils.checkOutput(mappedOut1File, 0.2, 5): print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName)) else: print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName)) sys.exit() print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS') # CALCULATE DENSITY BY REGION # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName) print('FINDING AVERAGE SIGNAL AMONGST BAMS') metaOutputFile = collapseRegionMap(outputFile1, inputName + '_MERGED_SIGNAL', controlBams=options.control) #now try the merging print('CALLING AND PLOTTING SUPER-ENHANCERS') rankbyName = inputName + '_MERGED_SIGNAL' controlName = 'NONE' cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % ( pipeline_dir, outFolder, metaOutputFile, inputName, controlName) print(cmd) os.system(cmd) # calling the gene mapper print('CALLING GENE MAPPING') superTableFile = "%s_SuperEnhancers.table.txt" % (inputName) #for now don't use ranking bam to call top genes cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % ( pipeline_dir, genome, outFolder, superTableFile) print(cmd) os.system(cmd) stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName) cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % ( pipeline_dir, genome, outFolder, stretchTableFile) print(cmd) os.system(cmd) superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName) cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % ( pipeline_dir, genome, outFolder, superStretchTableFile) os.system(cmd)
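# Sketch of the rankby/control pairing rule used in the meta main() above: equal-length lists
# pair one to one, a single control bam is reused for every rankby bam, and any other
# combination is rejected. This is a standalone restatement, not the pipeline's code.
def pair_bams(rankby_bams, control_bams):
    if not control_bams:
        return [(bam, None) for bam in rankby_bams]
    if len(control_bams) == 1:
        control_bams = control_bams * len(rankby_bams)
    if len(control_bams) != len(rankby_bams):
        raise ValueError('provide one control per rankby bam, or a single control for all')
    return list(zip(rankby_bams, control_bams))

# pair_bams(['a.bam', 'b.bam'], ['ctl.bam'])
# -> [('a.bam', 'ctl.bam'), ('b.bam', 'ctl.bam')]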
def main(): print('main analysis for MYCN project') print('changing directory to project folder') os.chdir(projectFolder) print('\n\n') print( '#======================================================================' ) print( '#======================I, LOADING DATA ANNOTATION======================' ) print( '#======================================================================' ) print('\n\n') #This section sanity checks each data table and makes sure both bam and .bai files are accessible #for ChIP-Seq pipeline_dfci.summary(mouse_dataFile) print('\n\n') print( '#======================================================================' ) print( '#==========================II. CALLING MACS============================' ) print( '#======================================================================' ) print('\n\n') #running peak finding using macs 1.4.2 on all chip datasets #this usually takes ~2-3 hours on a reasonably fast machine #a 3 hour time out on this entire operation is set #if peak calling takes longer than 3 hours, simply run the script again after completion #run_macs(mouse_dataFile) print('\n\n') print( '#======================================================================' ) print( '#=================II. DEFINING ACTIVE GENES IN MOUSE===================' ) print( '#======================================================================' ) print('\n\n') #here we will identify active promoters in various contexts as those with #an H3K27AC peak in the +/- 1kb tss region #UCSC refseq annotations are used for all genes #make_active_gene_lists(mouse_dataFile) print('\n\n') print( '#======================================================================' ) print( '#==================III. CALLING ROSE TO MAP ENHANCERS==================' ) print( '#======================================================================' ) print('\n\n') # #for SCG_H3K27AC # analysisName = 'SCG_H3K27AC' # namesList = ['SCG_H3K27Ac'] # bashFileName,region_map_path,namesList=define_enhancer_landscape(mouse_dataFile,analysisName,namesList) # #for CG_H3K27AC # analysisName = 'CG_H3K27AC' # namesList = ['CG_H3K27Ac'] # bashFileName,region_map_path,namesList=define_enhancer_landscape(mouse_dataFile,analysisName,namesList) # #for GANGLIA_H3K27AC # analysisName = 'GANGLIA_H3K27AC' # namesList = ['CG_H3K27Ac','SCG_H3K27Ac'] # bashFileName,region_map_path,namesList=define_enhancer_landscape(mouse_dataFile,analysisName,namesList) # #for THMYCN # analysisName = 'THMYCN_H3K27AC' # namesList = ['THMYCN_139076_H3K27Ac','THMYCN_139423_H3K27Ac','THMYCN1_H3K27Ac'] # bashFileName,region_map_path,namesList=define_enhancer_landscape(mouse_dataFile,analysisName,namesList) print('\n\n') print( '#======================================================================' ) print( '#=================IV. 
LIFTING OVER NB CONSERVED REGIONS================' ) print( '#======================================================================' ) print('\n\n') # #liftover a pair of gffs # #first convert to bed # nb_promoter_gff_path = '%sgff/HG19_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.gff' % (hg19_projectFolder) # nb_enhancer_gff_path = '%sgff/HG19_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.gff' % (hg19_projectFolder) # nb_promoter_bed_path ='%sbeds/HG19_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.bed' % (hg19_projectFolder) # nb_enhancer_bed_path ='%sbeds/HG19_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.bed' % (hg19_projectFolder) # nb_promoter_gff = utils.parseTable(nb_promoter_gff_path,'\t') # nb_enhancer_gff = utils.parseTable(nb_enhancer_gff_path,'\t') # utils.gffToBed(nb_promoter_gff,nb_promoter_bed_path) # utils.gffToBed(nb_enhancer_gff,nb_enhancer_bed_path) # print('converted NB conserved gffs to beds at %s and %s' % (nb_promoter_bed_path,nb_enhancer_bed_path)) # #note, now you have to liftover manually to create beds # mm9_promoter_bed_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.bed' % (bedFolder) # mm9_enhancer_bed_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.bed' % (bedFolder) # mm9_promoter_gff_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.gff' % (gffFolder) # mm9_enhancer_gff_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.gff' % (gffFolder) # utils.bedToGFF(mm9_promoter_bed_path,mm9_promoter_gff_path) # utils.bedToGFF(mm9_enhancer_bed_path,mm9_enhancer_gff_path) # print('writing mm9 nb mycn sites to %s and %s' % (mm9_promoter_gff_path,mm9_enhancer_gff_path)) print('\n\n') print( '#======================================================================' ) print( '#======================V. MAPPING ENRICHED TO GFFS=====================' ) print( '#======================================================================' ) print('\n\n') # setName = 'THMYCN' # gffList = [mm9_promoter_gff_path,mm9_enhancer_gff_path] # cellTypeList = ['THMYCN1','THMYCN2','THMYCN','CG','SCG'] # mapList = ['CG_H3K27Ac', # 'SCG_H3K27Ac', # 'THMYCN1_H3K27Ac', # 'THMYCN_139423_H3K27Ac', # 'THMYCN_139076_H3K27Ac', # ] # #pipeline_dfci.mapEnrichedToGFF(mouse_dataFile,setName,gffList,cellTypeList,macsEnrichedFolder,mappedEnrichedFolder,macs=True,namesList=mapList,useBackground=True) # #summarize info for venn diagrams for each # promoter_mapped_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000/MM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000_THMYCN.txt' % (mappedEnrichedFolder) # promoter_venn_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000_VENN.txt' % (tableFolder) # summarizeVenn(promoter_mapped_path,group_list = ['CG','THMYCN'],output=promoter_venn_path) # enhancer_mapped_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000/MM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000_THMYCN.txt' % (mappedEnrichedFolder) # enhancer_venn_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000_VENN.txt' % (tableFolder) # summarizeVenn(enhancer_mapped_path,group_list = ['CG','THMYCN'],output=enhancer_venn_path) print('\n\n') print( '#======================================================================' ) print( '#=====================VI. 
MAKING MYCN REGIONS GFF======================' ) print( '#======================================================================' ) print('\n\n') dataDict = pipeline_dfci.loadDataTable(mouse_dataFile) names_list = [ 'THMYCN2_MYCN', 'THMYCN_139076_MYCN', 'THMYCN_139423_MYCN', ] mycn_loci = [] for name in names_list: mycn_collection = utils.importBoundRegion( '%s%s' % (macsEnrichedFolder, dataDict[name]['enrichedMacs']), name) mycn_loci += mycn_collection.getLoci() mycn_collection = utils.LocusCollection(mycn_loci, 50) mycn_collection.stitchCollection() mycn_gff = utils.locusCollectionToGFF(mycn_collection) mycn_gff_path = '%sMM9_THMYCN_MYCN_-0_+0.gff' % (gffFolder) utils.unParseTable(mycn_gff, mycn_gff_path, '\t') #make collections promoter_collection = utils.gffToLocusCollection( '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.gff' % (gffFolder)) enhancer_collection = utils.gffToLocusCollection( '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.gff' % (gffFolder)) #make the overlap table overlap_table = [['PROMOTER', 'ENHANCER', 'NONE']] promoter_count = 0 enhancer_count = 0 none_count = 0 for line in mycn_gff: locus = utils.Locus(line[0], int(line[3]) - 10000, int(line[4]) + 10000, '.') if enhancer_collection.getOverlap(locus, 'both'): enhancer_count += 1 continue if promoter_collection.getOverlap(locus, 'both'): promoter_count += 1 else: none_count += 1 overlap_table.append([promoter_count, enhancer_count, none_count]) overlap_table_path = '%sMM9_THMYCN_OVERLAP.txt' % (tableFolder) utils.unParseTable(overlap_table, overlap_table_path, '\t') print('\n\n') print( '#======================================================================' ) print( '#=====================VI. MAPPING GFFS FOR HEATMAP=====================' ) print( '#======================================================================' ) print('\n\n') #map_for_heatmap(mouse_dataFile) print('\n\n') print( '#======================================================================' ) print( '#=====================VII. AVERAGING MAPPED SIGNAL=====================' ) print( '#======================================================================' ) print('\n\n') # set_list = ['GANGLIA_H3K27AC','THMYCN_H3K27AC','THMYCN_MYCN'] # set_names = [ # ['CG_H3K27Ac','SCG_H3K27Ac'], # ['THMYCN1_H3K27Ac','THMYCN_139423_H3K27Ac','THMYCN_139076_H3K27Ac'], # ['THMYCN2_MYCN','THMYCN_139076_MYCN','THMYCN_139423_MYCN'] # ] # for i in range(len(set_list)): # setName = set_list[i] # names_list =set_names[i] # print(setName) # print(names_list) # #for promoters # mapped_list = ['%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000/MM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000_%s.gff' % (mappedFolder,name) for name in names_list] # output_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000/MM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000_%s.gff' % (mappedFolder,setName) # print(output_path) # averagingMappedSignal(mapped_list,output_path,setName) # #for enhancers # mapped_list = ['%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000/MM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000_%s.gff' % (mappedFolder,name) for name in names_list] # output_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000/MM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000_%s.gff' % (mappedFolder,setName) # print(output_path) # averagingMappedSignal(mapped_list,output_path,setName) print('\n\n') print( '#======================================================================' ) print( '#=====================VIII. 
MAKING HEATMAPS/METAS======================' ) print( '#======================================================================' ) print('\n\n')
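# Brute-force sketch of the promoter/enhancer/other tally built in the MYCN-regions section
# above: each MYCN region is padded by 10 kb, enhancer overlap takes priority over promoter,
# and the remainder is counted as 'none'. Plain (chrom, start, stop) tuples stand in for the
# utils loci and collections here.
def regions_overlap(a, b):
    return a[0] == b[0] and a[1] < b[2] and b[1] < a[2]

def classify_regions(regions, enhancers, promoters, pad=10000):
    counts = {'PROMOTER': 0, 'ENHANCER': 0, 'NONE': 0}
    for chrom, start, stop in regions:
        padded = (chrom, start - pad, stop + pad)
        if any(regions_overlap(padded, e) for e in enhancers):
            counts['ENHANCER'] += 1
        elif any(regions_overlap(padded, p) for p in promoters):
            counts['PROMOTER'] += 1
        else:
            counts['NONE'] += 1
    return counts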
def mapGFFLineToAnnot(gffLine, outFolder, nBins, geneDict, txCollection, sense='both', header=''): ''' for every line produces a file with all of the rectangles to draw ''' if len(header) == 0: gffString = '%s_%s_%s_%s' % (gffLine[0], gffLine[6], gffLine[3], gffLine[4]) else: gffString = header diagramTable = [[0, 0, 0, 0]] nameTable = [['', 0, 0]] gffLocus = utils.Locus(gffLine[0], int(gffLine[3]), int(gffLine[4]), gffLine[6], gffLine[1]) scaleFactor = float(nBins) / gffLocus.len() # plotting buffer for diagrams plotBuffer = int(gffLocus.len() / float(nBins) * 20) overlapLoci = txCollection.getOverlap(gffLocus, sense='both') geneList = [locus.ID() for locus in overlapLoci] if gffLine[6] == '-': refPoint = int(gffLine[4]) else: refPoint = int(gffLine[3]) offsetCollection = utils.LocusCollection([], 500) for geneID in geneList: gene = geneDict[geneID] print(gene.commonName()) if len(gene.commonName()) > 1: name = gene.commonName() else: name = geneID offset = 4 * len(offsetCollection.getOverlap(gene.txLocus())) offsetCollection.append( utils.makeSearchLocus(gene.txLocus(), plotBuffer, plotBuffer)) # write the name of the gene down if gene.sense() == '+': geneStart = gene.txLocus().start() else: geneStart = gene.txLocus().end() geneStart = abs(geneStart - refPoint) * scaleFactor nameTable.append([name, geneStart, -2 - offset]) # draw a line across the entire txLocus [start, stop] = [ abs(x - refPoint) * scaleFactor for x in gene.txLocus().coords() ] diagramTable.append([start, -0.01 - offset, stop, 0.01 - offset]) # now draw thin boxes for all txExons if len(gene.txExons()) > 0: for txExon in gene.txExons(): [start, stop] = [ abs(x - refPoint) * scaleFactor for x in txExon.coords() ] diagramTable.append([start, -0.5 - offset, stop, 0.5 - offset]) # now draw fatty boxes for the coding exons if any if len(gene.cdExons()) > 0: for cdExon in gene.cdExons(): [start, stop] = [ abs(x - refPoint) * scaleFactor for x in cdExon.coords() ] diagramTable.append([start, -1 - offset, stop, 1 - offset]) utils.unParseTable(diagramTable, outFolder + gffString + '_diagramTemp.txt', '\t') utils.unParseTable(nameTable, outFolder + gffString + '_nameTemp.txt', '\t')
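# Sketch of the coordinate scaling used for the gene diagrams above: genomic positions are
# converted to bin units as |pos - refPoint| * nBins / locusLength, where refPoint is the
# right edge for '-' strand regions and the left edge otherwise. Function and argument names
# here are illustrative only.
def to_bin_coords(start, stop, region_start, region_stop, strand, n_bins):
    length = region_stop - region_start
    ref_point = region_stop if strand == '-' else region_start
    scale = float(n_bins) / length
    return abs(start - ref_point) * scale, abs(stop - ref_point) * scale

# to_bin_coords(1500, 2500, 1000, 11000, '+', 200) -> (10.0, 30.0)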
def findCanidateTFs(genome, enhancer_gff, expressedNM, expressionDictNM, bamFile, TFlist, refseqToNameDict, projectFolder, projectName, promoter): ''' Assign each Super-Enhancer to the closest active TSS to its center Return a dictionary keyed by TF that points to a list of loci ''' #loading in the enhancer gff regions enhancer_collection = utils.gffToLocusCollection(enhancer_gff) enhancer_loci = enhancer_collection.getLoci() #loading in the genome and TF info annot_file = genome.returnFeature('annot_file') startDict = utils.makeStartDict(annot_file) tf_table = utils.parseTable(genome.returnFeature('tf_file'),'\t') refID_list = [line[0] for line in tf_table] #creates a list of all NM IDs for TFs #make a collection of all TF TSSs tssLoci = [] for refID in refID_list: tssLoci.append(utils.makeTSSLocus(refID,startDict,0,0)) #this is a precise 1 coordinate TSS locus tssCollection = utils.LocusCollection(tssLoci,50) enhancerTable = [['ENHANCER_ID','CHROM','START','STOP','GENE_LIST']] gene_to_enhancer_dict = defaultdict(list) # Loop through enhancers #all gene nnames stored by refID for enhancer in enhancer_loci: # If the enhancer overlaps a TSS, save it overlapping_loci = tssCollection.getOverlap(enhancer, 'both') overlapping_refIDs =[locus.ID() for locus in overlapping_loci] # Find all gene TSS within 100 kb proximal_loci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer,100000,100000),'both') proximal_refIDs =[locus.ID() for locus in proximal_loci] # If no genes are within 100 kb, find the closest active gene within 1 million bp closest_refID = [] if len(overlapping_refIDs) == 0 and len(proximal_refIDs) == 0: distal_loci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer,1000000,1000000),'both') distal_refIDs =[locus.ID() for locus in distal_loci] enhancerCenter = (int(enhancer.start()) + int(enhancer.end())) / 2 distance_list = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in distal_refIDs] if len(distance_list) > 0: closest_refID = [distalGenes[distance_list.index(min(distance_list))]] #now we have all potential gene cases all_refIDs = overlappingGenes + proximalGenes + closest_refID #now we get all names and refIDs all_refIDs = utils.uniquify([refID for refID in all_refIDs if len(refID) > 0 ]) all_names = utils.uniquify([startDict[refID]['name'] for refID in all_refIDs]) #first do enhancer level assignment names_string = ','.join(all_names) enhancer_table.append([enhancer.ID(),enhancer.chr(),enhancer.start(),enhancer.end(),names_string]) #now do gene level assignment for refID in all_refIDs: gene_to_enhancer_dict[refID].append(enhancer.ID()) #an enhancer can be assigned to multiple genes #a promoter can only be assigned to 1 gene #promoters don't have enhancerIDs so don't add them yet #this should just be an enhancer level table #followed by a gene level table overlappingGenes = utils.uniquify(overlappingGenes) proximalGenes = utils.uniquify(proximalGenes) for refID in overlappingGenes: if proximalGenes.count(refID) == 1: proximalGenes.remove(refID) # If a TSS overlaps an enhancer, assign them together if overlappingGenes: for gene in overlappingGenes: if gene in tf_list: TFtoEnhancerDict[gene].append(enhancer) enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()]) # Otherwise, assign the enhancer to the most active gene in 100 kb elif not overlappingGenes and proximalGenes: highestGene = '' highestActivity = 0 for gene in proximalGenes: if expressionDictNM[gene] > highestActivity: highestActivity = 
expressionDictNM[gene] highestGene = gene if highestGene in TFlist: TFtoEnhancerDict[highestGene].append(enhancer) enhancerAssignment.append([highestGene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()]) elif not overlappingGenes and not proximalGenes and closestGene: if closestGene in TFlist: gene = closestGene TFtoEnhancerDict[gene].append(enhancer) enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()]) # Add the promoter if it's not contained in the super if promoter: for gene in TFtoEnhancerDict.keys(): promoter = utils.Locus(startDict[gene]['chr'], int(startDict[gene]['start'][0]) - 2000, int(startDict[gene]['start'][0]) + 2000, startDict[gene]['sense']) overlapBool = False for enhancer in TFtoEnhancerDict[gene]: if promoter.overlaps(enhancer): overlapBool = True if not overlapBool: TFtoEnhancerDict[gene].append(promoter) seAssignmentFile = projectFolder + projectName + '_ENHANCER_ASSIGNMENT.txt' utils.unParseTable(enhancerAssignment, seAssignmentFile, '\t') return TFtoEnhancerDict
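# Compact sketch of the 'most active gene within 100 kb' selection used above: an argmax over
# the expression dictionary, restricted to the proximal genes. The dict plays the role of
# expressionDictNM; names here are illustrative.
def most_active_gene(proximal_genes, expression_dict):
    best_gene, best_activity = '', 0
    for gene in proximal_genes:
        if expression_dict.get(gene, 0) > best_activity:
            best_activity = expression_dict.get(gene, 0)
            best_gene = gene
    return best_gene

# most_active_gene(['NM_1', 'NM_2'], {'NM_1': 5.0, 'NM_2': 12.0}) -> 'NM_2'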
def make_shep_on_mycn_landscape(shep_on_dataFile): ''' finds mycn peaks in shep21 that are conserved in nb and segregates them into promoter or enhancer ''' dataDict = pipeline_dfci.loadDataTable(shep_on_dataFile) print('LOADING SHEP ON MYCN SITES') #load all of the shep_on sites # shep_on_gff_path = '%smeta_rose/SHEP_ON_MYC/gff/HG19_SHEP_ON_MYC_ALL_-0_+0.gff' % (projectFolder) # shep_on_gff = utils.parseTable(shep_on_gff_path,'\t') shep_on_bed_path = '%sSHEP_6HR_MYCN_peaks.bed' % (macsEnrichedFolder) shep_on_bed = utils.parseTable(shep_on_bed_path,'\t') shep_on_gff = utils.bedToGFF(shep_on_bed) #now get the conserved NB MYCN regions nb_conserved_mycn_gff_file = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder) nb_conserved_mycn_collection = utils.gffToLocusCollection(nb_conserved_mycn_gff_file) print('LOADING SHEP ACTIVE ENHANCERS') #make a collection of enhancers shep_enhancer_file = '%smeta_rose/SHEP_ON_H3K27AC/SHEP_ON_H3K27AC_AllEnhancers.table.txt' % (projectFolder) shep_enhancer_collection = utils.makeSECollection(shep_enhancer_file,'SHEP_H3K27AC') #now get the active promoters print('LOADING SHEP ACTIVE PROMOTERS') startDict = utils.makeStartDict(annotFile) shep_transcribed_file = '%sHG19_SHEP_ON_H3K27AC_ACTIVE.txt' % (geneListFolder) shep_transcribed_table = utils.parseTable(shep_transcribed_file,'\t') transcribedList = [line[1] for line in shep_transcribed_table] tssLoci = [] for refID in transcribedList: tssLoci.append(utils.makeTSSLocus(refID,startDict,1000,1000)) shep_tss_collection = utils.LocusCollection(tssLoci,50) #now initialize the 6 gffs we will need shep_mycn_gff = [] shep_mycn_gff_5kb = [] shep_mycn_gff_1kb = [] shep_mycn_promoter_gff = [] shep_mycn_promoter_gff_1kb = [] shep_mycn_promoter_gff_5kb = [] shep_mycn_enhancer_gff = [] shep_mycn_enhancer_gff_1kb = [] shep_mycn_enhancer_gff_5kb = [] #and their respective file names shep_mycn_gff_file = '%sHG19_SHEP_MYCN_CONSERVED_-0_+0.gff' % (gffFolder) shep_mycn_gff_5kb_file = '%sHG19_SHEP_MYCN_CONSERVED_-5kb_+5kb.gff' % (gffFolder) shep_mycn_gff_1kb_file = '%sHG19_SHEP_MYCN_CONSERVED_-1kb_+1kb.gff' % (gffFolder) shep_mycn_promoter_gff_file = '%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-0_+0.gff' % (gffFolder) shep_mycn_promoter_gff_5kb_file = '%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-5kb_+5kb.gff' % (gffFolder) shep_mycn_promoter_gff_1kb_file = '%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-1kb_+1kb.gff' % (gffFolder) shep_mycn_enhancer_gff_file = '%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-0_+0.gff' % (gffFolder) shep_mycn_enhancer_gff_5kb_file = '%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-5kb_+5kb.gff' % (gffFolder) shep_mycn_enhancer_gff_1kb_file = '%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-1kb_+1kb.gff' % (gffFolder) print('ITERATING THROUGH SHEP MYCN PEAKS') ticker = 0 enhancer = 0 promoter = 0 other = 0 for line in shep_on_gff: if ticker % 1000 == 0: print ticker ticker+=1 peakID = '%s_%s' % ('SHEP_MYCN',str(ticker)) lineLocus = utils.Locus(line[0],line[3],line[4],'.',peakID) if nb_conserved_mycn_collection.getOverlap(lineLocus): gffLine = [line[0],peakID,peakID,line[3],line[4],'','.','',peakID] peakCenter = (int(line[3]) + int(line[4]))/2 gffLine_5kb = [line[0],peakID,peakID,peakCenter - 5000,peakCenter + 5000,'','.','',peakID] #the 1kb is not a center +/- but a flank gffLine_1kb = [line[0],peakID,peakID,int(line[3]) - 1000,int(line[4]) + 1000,'','.','',peakID] shep_mycn_gff.append(gffLine) shep_mycn_gff_5kb.append(gffLine_5kb) shep_mycn_gff_1kb.append(gffLine_1kb) #tss overlap should take precedence over enhancer overlap if 
shep_tss_collection.getOverlap(lineLocus,'both'): shep_mycn_promoter_gff.append(gffLine) shep_mycn_promoter_gff_5kb.append(gffLine_5kb) shep_mycn_promoter_gff_1kb.append(gffLine_1kb) promoter+=1 #now check for enhancer overlap elif shep_enhancer_collection.getOverlap(lineLocus,'both'): shep_mycn_enhancer_gff.append(gffLine) shep_mycn_enhancer_gff_5kb.append(gffLine_5kb) shep_mycn_enhancer_gff_1kb.append(gffLine_1kb) enhancer+=1 else: other+=1 print('Of %s shep on mycn peaks' % (len(shep_on_gff))) print('%s are promoter' % (promoter)) print('%s are enhancer' % (enhancer)) print('%s are other' % (other)) #now write out the gffs utils.unParseTable(shep_mycn_gff,shep_mycn_gff_file,'\t') utils.unParseTable(shep_mycn_gff_5kb,shep_mycn_gff_5kb_file,'\t') utils.unParseTable(shep_mycn_gff_1kb,shep_mycn_gff_1kb_file,'\t') utils.unParseTable(shep_mycn_promoter_gff,shep_mycn_promoter_gff_file,'\t') utils.unParseTable(shep_mycn_promoter_gff_5kb,shep_mycn_promoter_gff_5kb_file,'\t') utils.unParseTable(shep_mycn_promoter_gff_1kb,shep_mycn_promoter_gff_1kb_file,'\t') utils.unParseTable(shep_mycn_enhancer_gff,shep_mycn_enhancer_gff_file,'\t') utils.unParseTable(shep_mycn_enhancer_gff_5kb,shep_mycn_enhancer_gff_5kb_file,'\t') utils.unParseTable(shep_mycn_enhancer_gff_1kb,shep_mycn_enhancer_gff_1kb_file,'\t')
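# Sketch of the three region flavors built per SHEP MYCN peak above: the peak itself, a fixed
# +/-5 kb window around the peak center, and the peak edges extended by +/-1 kb. Names are
# illustrative; this only restates the window arithmetic.
def peak_windows(start, stop, center_flank=5000, edge_flank=1000):
    center = (start + stop) // 2
    return {
        'peak': (start, stop),
        'center_window': (center - center_flank, center + center_flank),
        'edge_flank': (start - edge_flank, stop + edge_flank),
    }

# peak_windows(10000, 10400) ->
# {'peak': (10000, 10400), 'center_window': (5200, 15200), 'edge_flank': (9000, 11400)}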
def mapEnhancerToGene(annotFile,enhancerFile,transcribedFile='',uniqueGenes=True,searchWindow =50000,noFormatTable = False): ''' maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq ''' startDict = utils.makeStartDict(annotFile) enhancerTable = utils.parseTable(enhancerFile,'\t') #internal parameter for debugging byRefseq = False if len(transcribedFile) > 0: transcribedTable = utils.parseTable(transcribedFile,'\t') transcribedGenes = [line[1] for line in transcribedTable] else: transcribedGenes = startDict.keys() print('MAKING TRANSCRIPT COLLECTION') transcribedCollection = utils.makeTranscriptCollection(annotFile,0,0,500,transcribedGenes) print('MAKING TSS COLLECTION') tssLoci = [] for geneID in transcribedGenes: tssLoci.append(utils.makeTSSLocus(geneID,startDict,0,0)) #this turns the tssLoci list into a LocusCollection #50 is the internal parameter for LocusCollection and doesn't really matter tssCollection = utils.LocusCollection(tssLoci,50) geneDict = {'overlapping':defaultdict(list),'proximal':defaultdict(list)} #dictionaries to hold ranks and superstatus of gene nearby enhancers rankDict = defaultdict(list) superDict= defaultdict(list) #list of all genes that appear in this analysis overallGeneList = [] if noFormatTable: #set up the output tables #first by enhancer enhancerToGeneTable = [enhancerTable[0]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE']] else: #set up the output tables #first by enhancer enhancerToGeneTable = [enhancerTable[0][0:9]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE'] + enhancerTable[5][-2:]] #next by gene geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS']] #next make the gene to enhancer table geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS','ENHANCER_RANKS','IS_SUPER']] for line in enhancerTable: if line[0][0] =='#' or line[0][0] == 'R': continue enhancerString = '%s:%s-%s' % (line[1],line[2],line[3]) enhancerLocus = utils.Locus(line[1],line[2],line[3],'.',line[0]) #overlapping genes are transcribed genes whose transcript is directly in the stitchedLocus overlappingLoci = transcribedCollection.getOverlap(enhancerLocus,'both') overlappingGenes =[] for overlapLocus in overlappingLoci: overlappingGenes.append(overlapLocus.ID()) #proximalGenes are transcribed genes where the tss is within 50kb of the boundary of the stitched loci proximalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus,searchWindow,searchWindow),'both') proximalGenes =[] for proxLocus in proximalLoci: proximalGenes.append(proxLocus.ID()) distalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus,1000000,1000000),'both') distalGenes =[] for proxLocus in distalLoci: distalGenes.append(proxLocus.ID()) overlappingGenes = utils.uniquify(overlappingGenes) proximalGenes = utils.uniquify(proximalGenes) distalGenes = utils.uniquify(distalGenes) allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes #these checks make sure each gene list is unique. 
#technically it is possible for a gene to be overlapping, but not proximal since the #gene could be longer than the 50kb window, but we'll let that slide here for refID in overlappingGenes: if proximalGenes.count(refID) == 1: proximalGenes.remove(refID) for refID in proximalGenes: if distalGenes.count(refID) == 1: distalGenes.remove(refID) #Now find the closest gene if len(allEnhancerGenes) == 0: closestGene = '' else: #get enhancerCenter enhancerCenter = (int(line[2]) + int(line[3]))/2 #get absolute distance to enhancer center distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes] #get the ID and convert to name closestGene = startDict[allEnhancerGenes[distList.index(min(distList))]]['name'] #NOW WRITE THE ROW FOR THE ENHANCER TABLE if noFormatTable: newEnhancerLine = list(line) newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]),',')) newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]),',')) newEnhancerLine.append(closestGene) else: newEnhancerLine = line[0:9] newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]),',')) newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]),',')) newEnhancerLine.append(closestGene) newEnhancerLine += line[-2:] enhancerToGeneTable.append(newEnhancerLine) #Now grab all overlapping and proximal genes for the gene ordered table overallGeneList +=overlappingGenes for refID in overlappingGenes: geneDict['overlapping'][refID].append(enhancerString) rankDict[refID].append(int(line[-2])) superDict[refID].append(int(line[-1])) overallGeneList+=proximalGenes for refID in proximalGenes: geneDict['proximal'][refID].append(enhancerString) rankDict[refID].append(int(line[-2])) superDict[refID].append(int(line[-1])) #End loop through #Make table by gene overallGeneList = utils.uniquify(overallGeneList) #use enhancer rank to order rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList]) usedNames = [] for i in rankOrder: refID = overallGeneList[i] geneName = startDict[refID]['name'] if usedNames.count(geneName) > 0 and uniqueGenes == True: continue else: usedNames.append(geneName) proxEnhancers = geneDict['overlapping'][refID]+geneDict['proximal'][refID] superStatus = max(superDict[refID]) enhancerRanks = join([str(x) for x in rankDict[refID]],',') newLine = [geneName,refID,join(proxEnhancers,','),enhancerRanks,superStatus] geneToEnhancerTable.append(newLine) #resort enhancerToGeneTable if noFormatTable: return enhancerToGeneTable,geneToEnhancerTable else: enhancerOrder = utils.order([int(line[-2]) for line in enhancerToGeneTable[1:]]) sortedTable = [enhancerToGeneTable[0]] for i in enhancerOrder: sortedTable.append(enhancerToGeneTable[(i+1)]) return sortedTable,geneToEnhancerTable
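# Sketch of the closest-gene step in mapEnhancerToGene above: measure from the enhancer
# center to each candidate TSS and take the minimum. tss_dict maps a refseq ID to its TSS
# coordinate and is an assumption standing in for startDict[refID]['start'][0].
def closest_gene(enhancer_start, enhancer_stop, candidate_ids, tss_dict):
    if not candidate_ids:
        return ''
    center = (enhancer_start + enhancer_stop) / 2
    distances = [abs(center - tss_dict[gene_id]) for gene_id in candidate_ids]
    return candidate_ids[distances.index(min(distances))]

# closest_gene(1000, 3000, ['NM_1', 'NM_2'], {'NM_1': 500, 'NM_2': 2500}) -> 'NM_2'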
def loadAnnotFile(genome, window, geneList=[], skip_cache=False): """ load in the annotation and create a startDict and tss collection for a set of refseq IDs a given genome """ genomeDict = { 'HG18': 'annotation/hg18_refseq.ucsc', 'MM9': 'annotation/mm9_refseq.ucsc', 'MM10': 'annotation/mm10_refseq.ucsc', 'HG19': 'annotation/hg19_refseq.ucsc', 'HG19_RIBO': 'annotation/hg19_refseq.ucsc', 'RN4': 'annotation/rn4_refseq.ucsc', 'RN6': 'annotation/rn6_refseq.ucsc', 'HG38': 'annotation/hg38_refseq.ucsc', } genomeDirectoryDict = { 'HG19': '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/', 'RN6': '/storage/cylin/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Chromosomes/', 'MM9': '/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm9/Sequence/Chromosomes/', 'MM10': '/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm10/Sequence/Chromosomes/', 'HG38': '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg38/Sequence/Chromosomes/', } mouse_convert_file = '%s/annotation/HMD_HumanPhenotype.rpt' % (whereAmI) #making a dictionary for mouse to human conversion mouse_convert_dict = defaultdict(str) mouse_convert_table = utils.parseTable(mouse_convert_file, '\t') for line in mouse_convert_table: mouse_convert_dict[line[4]] = line[0] genomeDirectory = genomeDirectoryDict[string.upper(genome)] #making a chrom_dict that is a list of all chroms with sequence chrom_list = utils.uniquify([ name.split('.')[0] for name in os.listdir(genomeDirectory) if len(name) > 0 ]) annotFile = whereAmI + '/' + genomeDict[string.upper(genome)] if not skip_cache: # Try loading from a cache, if the crc32 matches annotPathHash = zlib.crc32( annotFile) & 0xFFFFFFFF # hash the entire location of this script annotFileHash = zlib.crc32(open(annotFile, "rb").read()) & 0xFFFFFFFF cache_file_name = "%s.%s.%s.cache" % (genome, annotPathHash, annotFileHash) cache_file_path = '%s/%s' % (tempfile.gettempdir(), cache_file_name) if os.path.isfile(cache_file_path): # Cache exists! Load it! try: print('\tLoading genome data from cache.') with open(cache_file_path, 'rb') as cache_fh: cached_data = cPickle.load(cache_fh) print('\tCache loaded.') return cached_data except (IOError, cPickle.UnpicklingError): # Pickle corrupt? Let's get rid of it. print('\tWARNING: Cache corrupt or unreadable. Ignoring.') else: print('\tNo cache exists: Loading annotation (slow).') # We're still here, so either caching was disabled, or the cache doesn't exist startDict = utils.makeStartDict(annotFile, geneList) tssLoci = [] if geneList == []: geneList = startDict.keys() for gene in geneList: tssLoci.append(utils.makeTSSLocus(gene, startDict, window, window)) tssCollection = utils.LocusCollection(tssLoci, 50) if not skip_cache: print('Writing cache for the first time.') with open(cache_file_path, 'wb') as cache_fh: cPickle.dump((startDict, tssCollection), cache_fh, cPickle.HIGHEST_PROTOCOL) return startDict, tssCollection, genomeDirectory, chrom_list, mouse_convert_dict
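# Generic sketch of the caching pattern used in loadAnnotFile above: key a pickle file on
# crc32 hashes of the annotation path and its contents, load it when present, and rebuild and
# write it otherwise. build_fn stands in for the slow annotation parsing; this is not the
# pipeline's exact code.
import os
import pickle
import tempfile
import zlib

def load_with_cache(annot_path, build_fn, skip_cache=False):
    path_hash = zlib.crc32(annot_path.encode()) & 0xFFFFFFFF
    with open(annot_path, 'rb') as fh:
        content_hash = zlib.crc32(fh.read()) & 0xFFFFFFFF
    cache_path = os.path.join(tempfile.gettempdir(),
                              'annot.%s.%s.cache' % (path_hash, content_hash))
    if not skip_cache and os.path.isfile(cache_path):
        try:
            with open(cache_path, 'rb') as fh:
                return pickle.load(fh)
        except Exception:
            pass  # corrupt or unreadable cache: fall through and rebuild
    data = build_fn(annot_path)
    if not skip_cache:
        with open(cache_path, 'wb') as fh:
            pickle.dump(data, fh, pickle.HIGHEST_PROTOCOL)
    return data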
def findCanidateTFs(annotationFile, enhancerLoci, expressedNM, expressionDictNM, bamFile, TFlist, refseqToNameDict, projectFolder, projectName, promoter): ''' Assign each Super-Enhancer to the closest active TSS to its center Return a dictionary keyed by TF that points to a list of loci ''' print 'FINDING CANIDATE TFs' enhancerAssignment = [] TFtoEnhancerDict = defaultdict(list) startDict = utils.makeStartDict(annotationFile) tssLoci = [] for gene in expressedNM: tssLoci.append(utils.makeTSSLocus(gene,startDict,1000,1000)) tssCollection = utils.LocusCollection(tssLoci,50) # Loop through enhancers for enhancer in enhancerLoci: # If the enhancer overlaps a TSS, save it overlappingLoci = tssCollection.getOverlap(enhancer, 'both') overlappingGenes =[] for overlapLocus in overlappingLoci: overlappingGenes.append(overlapLocus.ID()) # Find all gene TSS within 100 kb proximalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer,100000,100000),'both') proximalGenes =[] for proxLocus in proximalLoci: proximalGenes.append(proxLocus.ID()) # If no genes are within 100 kb, find the closest active gene closestGene = '' if len(overlappingGenes) == 0 and len(proximalGenes) == 0: distalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer,1000000,1000000),'both') distalGenes =[] for distalLocus in distalLoci: distalGenes.append(distalLocus.ID()) enhancerCenter = (int(enhancer.start()) + int(enhancer.end())) / 2 distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in distalGenes] if distList: closestGene = distalGenes[distList.index(min(distList))] overlappingGenes = utils.uniquify(overlappingGenes) proximalGenes = utils.uniquify(proximalGenes) for refID in overlappingGenes: if proximalGenes.count(refID) == 1: proximalGenes.remove(refID) # If a TSS overlaps an enhancer, assign them together if overlappingGenes: for gene in overlappingGenes: if gene in TFlist: TFtoEnhancerDict[gene].append(enhancer) enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()]) # Otherwise, assign the enhancer to the most active gene in 100 kb elif not overlappingGenes and proximalGenes: highestGene = '' highestActivity = 0 for gene in proximalGenes: if expressionDictNM[gene] > highestActivity: highestActivity = expressionDictNM[gene] highestGene = gene if highestGene in TFlist: TFtoEnhancerDict[gene].append(enhancer) enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()]) elif not overlappingGenes and not proximalGenes and closestGene: if closestGene in TFlist: gene = closestGene TFtoEnhancerDict[gene].append(enhancer) enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()]) # Add promoter is it's not contained in the super if promoter: for gene in TFtoEnhancerDict.keys(): promoter = utils.Locus(startDict[gene]['chr'], int(startDict[gene]['start'][0]) - 2000, int(startDict[gene]['start'][0]) + 2000, startDict[gene]['sense']) overlapBool = False for enhancer in TFtoEnhancerDict[gene]: if promoter.overlaps(enhancer): overlapBool = True if not overlapBool: TFtoEnhancerDict[gene].append(promoter) seAssignmentFile = projectFolder + projectName + '_ENHANCER_ASSIGNMENT.txt' utils.unParseTable(enhancerAssignment, seAssignmentFile, '\t') return TFtoEnhancerDict
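# Sketch of the promoter fallback at the end of findCanidateTFs: a +/-2 kb promoter interval
# is added for a TF only if none of its assigned enhancers already covers it. Intervals are
# (start, stop) pairs on the same chromosome; names are illustrative.
def add_promoter_if_missing(tss, assigned_enhancers, flank=2000):
    promoter = (tss - flank, tss + flank)
    for start, stop in assigned_enhancers:
        if start < promoter[1] and promoter[0] < stop:
            return assigned_enhancers  # promoter already covered by an enhancer
    return assigned_enhancers + [promoter]

# add_promoter_if_missing(50000, [(80000, 95000)])
# -> [(80000, 95000), (48000, 52000)]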