def makePeakGFFs(peak_path_list):
    '''
    makes a stitched gff for all MYC bound TSS and Distal regions across all datasets

    peak_path_list is an iterable of paths to tab-delimited peak tables.
    The first row of each table is skipped. For every other row, columns
    1-3 are handed to utils.Locus (presumably chrom, start, stop -- confirm
    against the peak table format) and column 5 is read as an integer flag:
    0 puts the peak in the distal set, any other value in the TSS set.

    Output paths are built from the module-level gffFolder. If both output
    files are already present (per utils.checkOutput) nothing is recomputed.

    returns the tuple (tss_gff_path, distal_gff_path)
    '''

    #setting the output
    tss_gff_path = '%sHG19_MYC_TSS_REGIONS_-0_+0.gff' % (gffFolder)
    distal_gff_path = '%sHG19_MYC_DISTAL_REGIONS_-0_+0.gff' % (gffFolder)

    #check to see if already done
    if utils.checkOutput(tss_gff_path, 0.1, 0.1) and utils.checkOutput(distal_gff_path, 0.1, 0.1):
        print('OUTPUT FOUND AT %s and %s' % (tss_gff_path, distal_gff_path))
        return tss_gff_path, distal_gff_path

    #empty loci lists to hold everything
    tss_loci = []
    distal_loci = []

    for peak_path in peak_path_list:
        print('processing %s' % (peak_path))

        peak_table = utils.parseTable(peak_path, '\t')

        #first row is skipped (treated as a header)
        for line in peak_table[1:]:
            peak_locus = utils.Locus(line[1], line[2], line[3], '.')
            if int(line[5]) == 0:
                distal_loci.append(peak_locus)
            else:
                tss_loci.append(peak_locus)

    #now combine the loci
    print('stitching loci')
    distal_collection = utils.LocusCollection(distal_loci, 50)
    tss_collection = utils.LocusCollection(tss_loci, 50)

    stitched_distal_collection = distal_collection.stitchCollection()
    stitched_tss_collection = tss_collection.stitchCollection()

    #now make the gffs
    #the gffs must come from the STITCHED collections; previously the
    #unstitched collections were written and the stitching result was discarded
    distal_gff = utils.locusCollectionToGFF(stitched_distal_collection)
    tss_gff = utils.locusCollectionToGFF(stitched_tss_collection)

    #now write to disk
    utils.unParseTable(distal_gff, distal_gff_path, '\t')
    utils.unParseTable(tss_gff, tss_gff_path, '\t')

    return tss_gff_path, distal_gff_path
def mergeCollections(superFile1,superFile2,name1,name2,output=''):
    '''
    merges two super-enhancer collections into a single stitched gff

    superFile1/superFile2 and name1/name2 are passed straight to
    makeSECollection to build the two input collections. Their loci are
    pooled and stitched. A stitched locus that overlaps at least one locus
    in BOTH inputs is renamed CONSERVED_<n> (n counts up from 1); any other
    locus keeps its existing ID with the first two characters dropped.

    returns the gff table itself when output is empty; otherwise writes the
    gff to the output path and returns that path
    '''

    conSuperCollection = makeSECollection(superFile1, name1)
    tnfSuperCollection = makeSECollection(superFile2, name2)

    #now merge them
    mergedLoci = conSuperCollection.getLoci() + tnfSuperCollection.getLoci()
    mergedCollection = utils.LocusCollection(mergedLoci, 50)

    #stitch the collection together
    stitchedCollection = mergedCollection.stitchCollection()
    stitchedLoci = stitchedCollection.getLoci()

    #loci that are in both get renamed with a new unique identifier
    renamedLoci = []
    ticker = 1
    for locus in stitchedLoci:
        inFirst = len(conSuperCollection.getOverlap(locus)) > 0
        inSecond = len(tnfSuperCollection.getOverlap(locus)) > 0
        if inFirst and inSecond:
            locus._ID = 'CONSERVED_%s' % (str(ticker))
            ticker += 1
        else:
            #drop the first two characters of the stitched ID
            #NOTE(review): presumably a count prefix added by stitchCollection -- confirm
            locus._ID = locus.ID()[2:]
        renamedLoci.append(locus)

    #now we turn this into a gff and write it out
    gff = utils.locusCollectionToGFF(utils.LocusCollection(renamedLoci, 50))

    #an empty (or None) output means the caller wants the table back
    if not output:
        return gff

    print("writing merged gff to %s" % (output))
    utils.unParseTable(gff, output, '\t')
    return output
def calculatePromoterActivity(annotationFile, bamFile, projectName, projectFolder, refseqToNameDict, background=False):
    '''
    calculates the level of acetylation at each TF promoter

    Builds a TSS +/- 2500 locus for every gene in the start dict made from
    annotationFile, writes them to <projectFolder><projectName>_TSS.gff and
    runs bamliquidator_batch on bamFile over that gff, with output directed
    to <projectFolder>bamliquidator.

    refseqToNameDict is accepted but not used here. background, when truthy,
    is only used to construct a utils.Bam object.

    returns nothing; the results are the files written to disk
    '''

    print('GENERATING AN ACTIVITY TABLE USING CHIP DATA')

    #the Bam objects are never read below; construction is kept because it
    #may validate the paths -- NOTE(review): confirm and drop if it does not
    utils.Bam(bamFile)
    if background:
        background = utils.Bam(background)

    startDict = utils.makeStartDict(annotationFile)

    #one promoter window per annotated gene
    tssLoci = [utils.makeTSSLocus(gene, startDict, 2500, 2500) for gene in startDict]
    tssCollection = utils.LocusCollection(tssLoci, 50)

    gff = utils.locusCollectionToGFF(tssCollection)

    outputname = projectFolder + projectName + '_TSS.gff'
    utils.unParseTable(gff, outputname, '\t')

    mappingCmd = 'bamliquidator_batch'
    mappingCmd += ' -r ' + outputname
    mappingCmd += ' -o ' + projectFolder + 'bamliquidator'
    mappingCmd += ' -m -e 200 '
    mappingCmd += bamFile

    #NOTE(review): the command is shell-interpreted; paths containing spaces
    #or shell metacharacters will break it
    subprocess.call(mappingCmd, shell=True)
    print(mappingCmd)
def calculatePromoterActivity(annotationFile, bamFile, projectName, projectFolder, refseqToNameDict):
    '''
    calculates the level of H3K27ac at each promoter from a H3K27ac bam file

    Builds a TSS +/- 1000 locus for every gene in the start dict made from
    annotationFile, writes them to <projectFolder><projectName>_TSS.gff and
    runs ./bamToGFF.py on bamFile over that gff, with output directed to
    <projectFolder>matrix.gff.

    refseqToNameDict is accepted but not used here.

    returns nothing; the results are the files written to disk
    '''

    print('IDENTIFY EXPRESSED GENES')

    #the Bam object is never read below; construction is kept because it
    #may validate the path -- NOTE(review): confirm and drop if it does not
    utils.Bam(bamFile)

    startDict = utils.makeStartDict(annotationFile)

    #one promoter window per annotated gene
    tssLoci = [utils.makeTSSLocus(gene, startDict, 1000, 1000) for gene in startDict]
    tssCollection = utils.LocusCollection(tssLoci, 50)

    gff = utils.locusCollectionToGFF(tssCollection)

    outputname = projectFolder + projectName + '_TSS.gff'
    utils.unParseTable(gff, outputname, '\t')

    # run bamToGFF.py to quantify signal at each TSS +/- 1kb
    mappingCmd = 'python ./bamToGFF.py'
    mappingCmd += ' -r '
    mappingCmd += ' -d '
    mappingCmd += ' -o ' + projectFolder + 'matrix.gff'
    mappingCmd += ' -m 1 -f 0 -e 200 '
    mappingCmd += ' -i ' + outputname  # the gff written just above
    mappingCmd += ' -b ' + bamFile

    #NOTE(review): the command is shell-interpreted; paths containing spaces
    #or shell metacharacters will break it
    call(mappingCmd, shell=True)
    print(mappingCmd)
def merge_regions():
    '''
    merges ha peaks to identify all overlapping peaks
    filters out anything overlapping the HA controls

    Reads the four HK_*_HA_* peak beds from macsEnrichedFolder, stitches the
    two DOX replicates together, keeps stitched loci that overlap a peak in
    both DOX replicates and no peak in either control, and writes the result
    to a gff in gffFolder. Three counts are printed along the way: raw DOX
    loci, stitched loci, and loci surviving the filter.
    '''

    def load_peaks(dataset_name):
        #peak beds follow the <name>_peaks.bed naming convention
        bed_path = '%s%s_peaks.bed' % (macsEnrichedFolder, dataset_name)
        return utils.importBoundRegion(bed_path, dataset_name)

    #dox datasets
    dox_rep1 = load_peaks('HK_DOX_HA_1')
    dox_rep2 = load_peaks('HK_DOX_HA_2')
    dox_loci = dox_rep1.getLoci() + dox_rep2.getLoci()

    #control datasets
    ctl_rep1 = load_peaks('HK_CTL_HA_1')
    ctl_rep2 = load_peaks('HK_CTL_HA_2')
    ctl_collection = utils.LocusCollection(ctl_rep1.getLoci() + ctl_rep2.getLoci())

    print(len(dox_loci))
    stitched = utils.LocusCollection(dox_loci).stitchCollection()
    print(len(stitched))

    #keep loci seen in both dox replicates and absent from the controls
    kept_loci = [
        candidate for candidate in stitched.getLoci()
        if len(dox_rep1.getOverlap(candidate)) > 0
        and len(dox_rep2.getOverlap(candidate)) > 0
        and len(ctl_collection.getOverlap(candidate)) == 0
    ]
    print(len(kept_loci))

    kept_collection = utils.LocusCollection(kept_loci)
    out_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_0_+0.gff' % (gffFolder)
    utils.unParseTable(utils.locusCollectionToGFF(kept_collection), out_path, '\t')
def main():
    '''
    main run call

    Command line driver for the multi-input workflow:
      1. parse the flags (-i, -r, -o, -g are required)
      2. convert/copy every input .bed/.gff into <out>/gff/ and merge them
         into one master gff with simple sequential region ids
      3. optionally drop regions overlapping a mask file
      4. stitch regions, write the stitched gff, map every bam to it with
         bamliquidator
      5. build the region map, collapse it into a merged signal table, then
         call ROSE2_callSuper.R and ROSE2_geneMapper.py

    Relies on the module-level names utils, os, sys, pipeline_dir,
    bamliquidator_path and the helper functions called below.
    Exits via sys.exit() on missing flags, mismatched control bams,
    unsupported genome, bad mask extension or a failed bam mapping.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")
    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        sys.exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)
    inputList = [inputFile for inputFile in options.input.split(',') if len(inputFile) > 1]

    #converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        #the extension must be read from THIS file, not from the whole
        #comma separated -i string (which only reflects the last file)
        extension = inputFile.split('.')[-1]
        if extension == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][0:-4]  #strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        else:
            if extension != 'gff':
                print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)

    # GETTING THE LIST OF BAMFILES TO PROCESS
    #either same number of bams for rankby and control
    #or only 1 control
    #or none!
    #bamlist should be all rankby bams followed by control bams
    rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) > 0]
    if options.control:
        controlBamList = [bam for bam in options.control.split(',') if len(bam) > 0]
        if len(controlBamList) == len(rankbyBamList):
            #case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            #case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList * len(rankbyBamList)
        else:
            print('ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE')
            sys.exit()
    else:
        bamFileList = rankbyBamList

    # Stitch parameter
    #'' tells regionStitching that no window was supplied
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE GENOME
    #str.upper() replaces string.upper(), which no longer exists in python 3
    genome = options.genome.upper()
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit()

    #FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))

    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    #use a simpler unique region naming system
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0], '\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile, '\t')
            gffCollection = utils.gffToLocusCollection(gff, 50)
            inputLoci += gffCollection.getLoci()

        inputCollection = utils.LocusCollection(inputLoci, 50)
        inputCollection = inputCollection.stitchCollection()  # stitches to produce unique regions

        inputGFF = utils.locusCollectionToGFF(inputCollection)

    formattedGFF = []
    #now number things appropriately
    for i, line in enumerate(inputGFF):
        #ids are <inputName>_<n>, n counting from 1
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]

        lineID = '%s_%s' % (inputName, str(i + 1))  #1 indexing

        #min/max guarantees start <= stop in the written gff
        newLine = [chrom, lineID, lineID, min(coords), max(coords), '', sense, '', lineID]
        formattedGFF.append(newLine)

    #name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder, genome, inputName)
    utils.unParseTable(formattedGFF, masterGFFFile, '\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))

    # MAKING THE START DICT
    print('MAKING START DICT')
    #NOTE(review): startDict is not used again in this function
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile, bamChromList)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        # if it's a bed file
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci
                        if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" %
              (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        referenceCollection, inputName, outFolder, stitchWindow, tssWindow,
        annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    print(stitchWindow)
    print(type(stitchWindow))

    #NOTE(review): '/' is integer division only under python 2 with an int
    #stitchWindow; under python 3 the KB label would become a float string
    stitchKB = str(stitchWindow / 1000)
    stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchKB)
    if removeTSS:
        stitchedGFFName += '_TSS_DISTAL'
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1))

    # MAPPING TO THE STITCHED GFF
    #a universal control appears several times in bamFileList; map it once
    bamFileListUnique = utils.uniquify(list(bamFileList))  #prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList,
                  mappedFolder, outputFile1, refName=stitchedGFFName)

    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1, inputName + '_MERGED_SIGNAL',
                                       controlBams=options.control)

    #now try the merging
    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    #the control signal has already been folded into the merged table
    #NOTE(review): presumably why 'NONE' is always passed -- confirm against collapseRegionMap
    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (
        pipeline_dir, outFolder, metaOutputFile, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    print('CALLING GENE MAPPING')

    #for now don't use ranking bam to call top genes
    tableSuffixes = ('_SuperEnhancers.table.txt',
                     '_StretchEnhancers.table.txt',
                     '_SuperStretchEnhancers.table.txt')
    for tableSuffix in tableSuffixes:
        tableFile = '%s%s' % (inputName, tableSuffix)
        cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (
            pipeline_dir, genome, outFolder, tableFile)
        print(cmd)
        os.system(cmd)
def main():
    '''
    main run call

    Command line driver for the single-input workflow:
      1. parse the flags (-i, -r, -o, -g are required)
      2. convert a .bed input to gff, or copy a gff input into <out>/gff/
      3. optionally drop regions overlapping a mask file
      4. stitch regions, write the stitched gff, map every bam to it with
         bamliquidator_batch.py
      5. build the region map, then call ROSE2_callSuper.R and launch
         ROSE2_geneMapper.py in the background for each enhancer table

    Annotation files and the R / gene mapper scripts are looked up relative
    to the current working directory.
    Exits via sys.exit() on missing flags, unsupported genome, bad mask
    extension or a failed bam mapping.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-b", "--bams", dest="bams", nargs=1, default=None,
                      help="Enter a comma separated list of additional bam files to map to")
    #the help text used to be a copy of the -r text
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="bamfile to use as the control for the rankby bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")
    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        sys.exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE
    inputExtension = options.input.split('.')[-1]
    if inputExtension == 'bed':
        # CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]  #strips '.bed'
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        utils.bedToGFF(options.input, inputGFFFile)
    else:
        if inputExtension != 'gff':
            print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
        # COPY THE INPUT GFF TO THE GFF FOLDER
        #the original path (not the copy) is what gets used downstream
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    # GETTING THE LIST OF BAMFILES TO PROCESS
    if options.control:
        bamFileList = [options.rankby, options.control]
    else:
        bamFileList = [options.rankby]

    if options.bams:
        bamFileList += options.bams.split(',')
        #bamFileList = utils.uniquify(bamFileList) # makes sad when you have the same control bam over and over again

    # optional args

    # Stitch parameter
    #'' tells regionStitching that no window was supplied
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (cwd),
        'RN6': '%s/annotation/rn6_refseq.ucsc' % (cwd),
    }

    #fail with a readable message instead of a raw KeyError traceback
    try:
        annotFile = genomeDict[genome.upper()]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit()

    # MAKING THE START DICT
    print('MAKING START DICT')
    #NOTE(review): startDict is not used again in this function
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(inputGFFFile, bamChromList)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        # if it's a bed file
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci
                        if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" %
              (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        referenceCollection, inputName, outFolder, stitchWindow, tssWindow,
        annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)
    # making sure start/stop ordering are correct
    for line in stitchedGFF:
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitchWindow)
    print(type(stitchWindow))

    #NOTE(review): '/' is integer division only under python 2 with an int
    #stitchWindow; under python 3 the KB label would become a float string
    stitchKB = str(stitchWindow / 1000)
    stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchKB)
    if removeTSS:
        stitchedGFFName += '_TSS_DISTAL'
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1))

    # MAPPING TO THE STITCHED GFF
    bamliquidator_path = 'bamliquidator_batch.py'

    bamFileListUnique = utils.uniquify(list(bamFileList))  #prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList,
                  mappedFolder, outputFile1, refName=stitchedGFFName)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    #the R call differs only in the control name
    if options.control:
        controlName = options.control.split('/')[-1]
    else:
        controlName = 'NONE'
    cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (
        outFolder, outputFile1, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    #NOTE(review): fixed pause before the gene mapper calls; presumably
    #gives the R output time to reach disk -- confirm
    time.sleep(20)

    tableSuffixes = ('_SuperEnhancers.table.txt',
                     '_StretchEnhancers.table.txt',
                     '_SuperStretchEnhancers.table.txt')
    for tableSuffix in tableSuffixes:
        tableFile = '%s%s' % (inputName, tableSuffix)
        #the trailing & backgrounds each gene mapper run in the shell
        if options.control:
            cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s &" % (
                genome, options.rankby, options.control, outFolder, tableFile)
        else:
            cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s &" % (
                genome, options.rankby, outFolder, tableFile)
        os.system(cmd)
def main():
    '''
    main run call

    command line entry point for a ROSE2 run that accepts several input
    region files and several rankby/control bams.

    steps, in order:
      1. parse flags and create the output, gff/ and mappedGFF/ folders
      2. convert/copy every input file into the gff folder and merge them
         into one master gff with simple sequential region ids
      3. filter the master gff by the chroms returned by getBamChromList
         and optionally drop regions overlapping a mask file
      4. stitch the regions (regionStitching) and write the stitched gff
      5. map every unique bam to the stitched gff with bamliquidator
      6. build the region map (mapCollection), collapse it across bams
         (collapseRegionMap) and shell out to ROSE2_callSuper.R and
         ROSE2_geneMapper.py

    relies on module level names: utils, os, sys, pipeline_dir,
    bamliquidator_path, getBamChromList, filterGFF, checkRefCollection,
    regionStitching, mapCollection and collapseRegionMap.

    calls sys.exit() when required flags are missing, the control bam count
    is inconsistent, the genome is unsupported, the mask has an unknown
    extension or a bam fails to map.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)

    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")
    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        sys.exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)
    inputList = [inputFile for inputFile in options.input.split(',') if len(inputFile) > 1]

    # converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        # the extension is taken from THIS file, not from the whole
        # comma separated option string
        extension = inputFile.split('.')[-1]
        if extension == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][0:-4]  # strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        else:
            if extension != 'gff':
                print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)

    # GETTING THE LIST OF BAMFILES TO PROCESS
    # either same number of bams for rankby and control
    # or only 1 control
    # or none!
    # bamlist should be all rankby bams followed by control bams
    rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) > 0]
    if options.control:
        controlBamList = [bam for bam in options.control.split(',') if len(bam) > 0]
        if len(controlBamList) == len(rankbyBamList):
            # case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            # case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList * len(rankbyBamList)
        else:
            print('ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE')
            sys.exit()
    else:
        bamFileList = rankbyBamList

    # Stitch parameter
    # '' is passed through untouched to regionStitching
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE GENOME
    genome = options.genome.upper()
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit()

    # FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))

    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    # use a simpler unique region naming system
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0], '\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile, '\t')
            gffCollection = utils.gffToLocusCollection(gff, 50)
            inputLoci += gffCollection.getLoci()

        inputCollection = utils.LocusCollection(inputLoci, 50)
        inputCollection = inputCollection.stitchCollection()  # stitches to produce unique regions
        inputGFF = utils.locusCollectionToGFF(inputCollection)

    formattedGFF = []
    # now number things appropriately
    for i, line in enumerate(inputGFF):
        # use the coordinates to make a new id inputname_chr_sense_start_stop
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]

        lineID = '%s_%s' % (inputName, str(i + 1))  # 1 indexing

        newLine = [chrom, lineID, lineID, min(coords), max(coords), '', sense, '', lineID]
        formattedGFF.append(newLine)

    # name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder, genome, inputName)
    utils.unParseTable(formattedGFF, masterGFFFile, '\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))

    # MAKING THE START DICT
    # the result is not used below; the call is kept so the run still
    # fails here if the annotation file cannot be loaded
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    # GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile, bamChromList)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        # if it's a bed file
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci
                        if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    print(stitchWindow)
    print(type(stitchWindow))

    # all three output names share the same root
    stitchKB = str(stitchWindow / 1000)
    if not removeTSS:
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchKB)
    else:
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, stitchKB)
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1))

    # MAPPING TO THE STITCHED GFF
    # a universal control appears several times in bamFileList
    bamFileListUnique = utils.uniquify(list(bamFileList))  # prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:
        bamFileName = bamFile.split('/')[-1]

        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList,
                  mappedFolder, outputFile1, refName=stitchedGFFName)

    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1, inputName + '_MERGED_SIGNAL', controlBams=options.control)

    # now try the merging
    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (pipeline_dir, outFolder, metaOutputFile, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    print('CALLING GENE MAPPING')

    # for now don't use ranking bam to call top genes
    tableFileList = [
        "%s_SuperEnhancers.table.txt" % (inputName),
        "%s_StretchEnhancers.table.txt" % (inputName),
        "%s_SuperStretchEnhancers.table.txt" % (inputName),
    ]
    for tableFile in tableFileList:
        cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir, genome, outFolder, tableFile)
        print(cmd)
        os.system(cmd)
def main():
    '''
    main run call

    command line entry point for a single input ROSE2 run.

    steps, in order:
      1. parse flags and create the output, gff/ and mappedGFF/ folders
      2. convert a .bed input to gff, or copy a gff input to the gff folder
      3. load the reference collection and optionally drop regions that
         overlap a mask file
      4. stitch the regions (regionStitching) and write the stitched gff
      5. map every bam to both the stitched gff and the original gff with
         bamliquidator_batch.py
      6. build the region map (mapCollection) and shell out to
         ROSE2_callSuper.R and ROSE2_geneMapper.py

    relies on module level names: utils, os, sys, subprocess, time,
    regionStitching and mapCollection.

    calls sys.exit() when required flags are missing, the genome is
    unsupported, the mask has an unknown extension or a bam fails to map.
    raises ValueError when bamliquidator_batch.py cannot be located.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)

    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-b", "--bams", dest="bams", nargs=1, default=None,
                      help="Enter a comma separated list of additional bam files to map to")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a control bamfile to use as background")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")
    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        sys.exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE
    inputExtension = options.input.split('.')[-1]
    if inputExtension == 'bed':
        # CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        utils.bedToGFF(options.input, inputGFFFile)
    else:
        if inputExtension != 'gff':
            print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
        # COPY THE INPUT GFF TO THE GFF FOLDER
        # the original path (not the copy) is what gets used downstream
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    # GETTING THE LIST OF BAMFILES TO PROCESS
    if options.control:
        bamFileList = [options.rankby, options.control]
    else:
        bamFileList = [options.rankby]

    if options.bams:
        bamFileList += options.bams.split(',')
        bamFileList = utils.uniquify(bamFileList)

    # optional args

    # Stitch parameter
    # '' is passed through untouched to regionStitching
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    # annotation files are looked up relative to the current working directory
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
    }

    try:
        annotFile = genomeDict[genome.upper()]
    except KeyError:
        # report unsupported builds instead of dying with a bare KeyError
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit()

    # MAKING THE START DICT
    # the result is not used below; the call is kept so the run still
    # fails here if the annotation file cannot be loaded
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFFFile)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        # if it's a bed file
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci
                        if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    # NOTE(review): stitching is given the input gff path, not the masked
    # referenceCollection - confirm against regionStitching whether the mask
    # is meant to affect the stitched regions
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        inputGFFFile, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)
    # making sure start/stop ordering are correct
    for line in stitchedGFF:
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitchWindow)
    print(type(stitchWindow))

    # all three output names share the same root
    stitchKB = str(stitchWindow / 1000)
    if not removeTSS:
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchKB)
    else:
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, stitchKB)
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1))

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    # NOTE(review): the fallback is checked with os.path.isfile, i.e. relative
    # to the current working directory rather than searched on $PATH
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    # every bam is mapped twice: to the stitched gff and to the original gff
    mappingTargets = [
        (stitchedGFFFile, stitchedGFFName),
        (inputGFFFile, inputName),
    ]
    for bamFile in bamFileList:
        bamFileName = bamFile.split('/')[-1]

        for targetGFFFile, targetName in mappingTargets:
            mappedOutFolder = '%s%s_%s_MAPPED' % (mappedFolder, targetName, bamFileName)
            mappedOutFile = '%s%s_%s_MAPPED/matrix.gff' % (mappedFolder, targetName, bamFileName)
            if utils.checkOutput(mappedOutFile, 0.2, 0.2):
                print("FOUND %s MAPPING DATA FOR BAM: %s" % (targetGFFFile, mappedOutFile))
                continue

            cmd = "python " + bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                targetGFFFile, mappedOutFolder, bamFile)
            print(cmd)

            output = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
            output = output.communicate()
            # anything on stdout is taken as the sign that mapping worked
            if len(output[0]) > 0:
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (targetGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (targetGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList,
                  mappedFolder, outputFile1, refName=stitchedGFFName)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    if options.control:
        controlName = options.control.split('/')[-1]
    else:
        controlName = 'NONE'
    cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (outFolder, outputFile1, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    time.sleep(20)
    tableFileList = [
        "%s_SuperEnhancers.table.txt" % (inputName),
        "%s_StretchEnhancers.table.txt" % (inputName),
        "%s_SuperStretchEnhancers.table.txt" % (inputName),
    ]
    for tableFile in tableFileList:
        if options.control:
            cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (
                genome, options.rankby, options.control, outFolder, tableFile)
        else:
            cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (
                genome, options.rankby, outFolder, tableFile)
        os.system(cmd)
def rank_eboxes(nb_all_chip_dataFile,mycn_gff_path,macsFolder,genomeDirectory,window = 100):
    '''
    uses the conserved MYCN sites and ranks eboxes within them by average
    background subtracted signal
    searches 100bp (window variable) from mycn summits

    nb_all_chip_dataFile: data table loaded with pipeline_dfci.loadDataTable
    mycn_gff_path: tab delimited gff of the conserved mycn regions
    macsFolder: folder holding <name>/<name>_summits.bed for each dataset
    genomeDirectory: passed straight to utils.fetchSeq
    window: bp added on each side of every summit (coerced to int)

    writes a summit gff into gffFolder and the ebox ranking table into
    tableFolder (both module level names) and returns the path of the
    ranking table
    '''
    window = int(window)

    #bring in the conserved mycn region
    print('making gff of nb mycn summits')
    nb_mycn_gff = utils.parseTable(mycn_gff_path,'\t')
    nb_mycn_collection = utils.gffToLocusCollection(nb_mycn_gff,50)

    dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)
    mycn_names = sorted(name for name in dataDict.keys() if name.count('MYCN') == 1)

    #first makes a gff of all summits +/- window for all nb mycn datasets
    #only summits overlapping a conserved mycn region are kept
    summit_loci = []
    for name in mycn_names:
        summit_bed_path = '%s%s/%s_summits.bed' % (macsFolder,name,name)
        summit_bed = utils.parseTable(summit_bed_path,'\t')
        for line in summit_bed:
            summit_locus = utils.Locus(line[0],int(line[1])-window,int(line[2])+window,'.',line[3])
            if len(nb_mycn_collection.getOverlap(summit_locus)) > 0:
                summit_loci.append(summit_locus)

    summit_collection = utils.LocusCollection(summit_loci,50)
    summit_merged_collection = summit_collection.stitchCollection()
    summit_gff = utils.locusCollectionToGFF(summit_merged_collection)
    summit_gff_path = '%sHG19_NB_MYCN_SUMMITS_-%s_+%s.gff' % (gffFolder,window,window)
    utils.unParseTable(summit_gff,summit_gff_path,'\t')

    #this is borrowed from above and maps chip-seq signal to the gff
    print('mapping to nb mycn summits and making signal dict')
    gffList = [summit_gff_path]
    summit_signal_path = pipeline_dfci.map_regions(nb_all_chip_dataFile,gffList)
    mycnSignalTable = utils.parseTable(summit_signal_path,'\t')

    #making a signal dictionary for MYCN binding
    #only these four datasets contribute to the signal
    signal_names = ['BE2C_MYCN','KELLY_MYCN','NGP_MYCN','SHEP21_0HR_MYCN_NOSPIKE']
    background_list = [dataDict[name]['background'] for name in signal_names]
    header = mycnSignalTable[0]
    chip_columns = [header.index(name) for name in signal_names]
    background_columns = [header.index(background_name) for background_name in background_list]

    mycn_sig_dict = {}
    for line in mycnSignalTable[1:]:
        #background subtracted signal for each dataset
        line_sig = [float(line[chip_col]) - float(line[bg_col])
                    for chip_col,bg_col in zip(chip_columns,background_columns)]
        region_id = line[1]
        #region ids end in ':start-stop'
        coords = [int(x) for x in line[1].split(':')[-1].split('-')]
        line_length = coords[1]-coords[0]
        mycn_sig_dict[region_id] = numpy.mean(line_sig)*line_length

    #now for each region find the eboxes and then add up the signal
    print('making ebox ranking')
    ebox_list = ['CACGTG','CAGTTG','CAAGTG','CAGGTG','CAATTG','CAAATG','CATCTG','CAGCTG','CATGTG','CATATG']
    eboxDict = {}
    for ebox in ebox_list:
        eboxDict[ebox] = []

    #compiled once, reused for every region
    ebox_pattern = re.compile('CA..TG')

    for ticker,line in enumerate(summit_gff):
        if ticker % 1000 == 0:
            print(ticker)

        chrom = line[0]
        start = int(line[3])
        end = int(line[4])
        region_id = '%s(%s):%s-%s' % (line[0],line[6],line[3],line[4])
        signal = mycn_sig_dict[region_id]

        sequenceLine = utils.fetchSeq(genomeDirectory,chrom,start,end,True)

        #every hit counts, including repeats of the same motif in one region
        for match in ebox_pattern.finditer(sequenceLine.upper()):
            motif = match.group()
            if motif in eboxDict:
                eboxDict[motif].append(signal)
                continue
            #each ebox is tracked under one orientation only
            rc_motif = utils.revComp(motif)
            if rc_motif in eboxDict:
                eboxDict[rc_motif].append(signal)
            #'.' in the pattern also matches non ACGT characters (e.g. N)
            #such hits are in neither orientation of the table and are skipped

    eboxTable = []
    eboxTableOrdered = [['EBOX','OCCURENCES','AVG_HEIGHT']]
    for ebox in eboxDict.keys():
        newLine = [ebox,len(eboxDict[ebox]),numpy.mean(eboxDict[ebox])]
        eboxTable.append(newLine)

    #rows are ordered by average height (column 2), highest first
    heightOrder = utils.order([line[2] for line in eboxTable],decreasing=True)

    for x in heightOrder:
        eboxTableOrdered.append(eboxTable[x])
    print(eboxTableOrdered)
    ebox_outfile = '%sHG19_NB_MYCN_CONSERVED_SUMMITS_-%s_+%s_EBOX_RANK.txt' % (tableFolder,window,window)
    utils.unParseTable(eboxTableOrdered,ebox_outfile,'\t')
    return ebox_outfile
def main():
    '''
    main analysis for the MYCN project

    changes into projectFolder, prints a summary of the shep21 data table,
    runs the dynamic rose wrapper on the two SHEP21 TWIST datasets and then
    maps signal to the regions bound by both TWIST and MYCN at 0hr

    relies on module level names: os, utils, pipeline_dfci, projectFolder,
    gffFolder, shep21_dataFile, wrapDRose and map_regions
    '''
    divider = '#======================================================================'

    def announce(title):
        # section banner: blank lines, divider, title, divider, blank lines
        print('\n\n')
        print(divider)
        print(title)
        print(divider)
        print('\n\n')

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    announce('#======================I, LOADING DATA ANNOTATION======================')

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible

    #for ChIP-Seq
    #these are the datasets we will use
    pipeline_dfci.summary(shep21_dataFile)

    announce('#================II. RUNNING DIFFERENTIAL ROSE ANALYSIS================')

    #use the dynamic rose tools to first map twist1 binding sites
    #and then quantify
    rank_gff_path = wrapDRose(shep21_dataFile,
                              'SHEP21_0HR_TWIST',
                              'SHEP21_24HR_B_TWIST',
                              'SHEP21_TWIST1')

    announce('#=================III. MAPPING MYCN DATA TO RANK GFF===================')

    #for shep21 nospike
    dataDict = pipeline_dfci.loadDataTable(shep21_dataFile)

    # NOTE(review): 'and' binds tighter than 'or', so the rep2 exclusion only
    # applies to TWIST names - confirm that MYCN/INPUT rep2 are meant to stay
    names_list = []
    for name in dataDict.keys():
        twist_without_rep2 = (name.count('TWIST') == 1 and name.count('rep2') == 0)
        if name.count('MYCN') == 1 or name.count('INPUT') == 1 or twist_without_rep2:
            names_list.append(name)
    print(names_list)

    #mapping to the rank gff and to the raw twist peaks is currently switched off
    #only the intersection gff built below is mapped

    #make a gff of twist and mycn sites at 0hr
    twist_peak_path = '%smacsEnriched/SHEP21_0HR_TWIST_peaks.bed' % (projectFolder)
    mycn_peak_path = '%smacsEnriched/SHEP21_0HR_MYCN_NOSPIKE_peaks.bed' % (projectFolder)
    twist_collection = utils.importBoundRegion(twist_peak_path, 'SHEP21_0HR_TWIST')
    mycn_collection = utils.importBoundRegion(mycn_peak_path, 'SHEP21_0HR_MYCN_NOSPIKE')

    #pool and stitch the peaks from both factors
    pooled_loci = twist_collection.getLoci() + mycn_collection.getLoci()
    stitched_collection = utils.LocusCollection(pooled_loci, 50).stitchCollection()

    #keep the stitched regions that overlap both a twist and a mycn peak
    overlap_loci = [
        locus for locus in stitched_collection.getLoci()
        if len(twist_collection.getOverlap(locus, 'both')) > 0
        and len(mycn_collection.getOverlap(locus, 'both')) > 0
    ]

    overlap_gff = utils.locusCollectionToGFF(utils.LocusCollection(overlap_loci, 50))
    overlap_gff_path = '%sHG19_SHEP21_0HR_TWIST_MYCN_INTERSECTION_-0_+0.gff' % (gffFolder)
    utils.unParseTable(overlap_gff, overlap_gff_path, '\t')

    gffList = [overlap_gff_path]
    map_regions(shep21_dataFile, gffList, names_list)
def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only.
    Otherwise, gives for each refseq

    rankByBamFile: bam mapped to the +/- 5kb TSS regions of associated genes
    controlBamFile: control bam, '' (empty) to skip the control mapping
    genome: genome build string, only used in the TSS gff file name
    annotFile: annotation file passed to utils.makeStartDict
    enhancerFile: tab delimited enhancer table; lines starting with '#'
        are comments, the first other line is the header and lines starting
        with 'R' are skipped as header lines. columns 1-3 are read as
        chrom/start/stop and the last two columns are read as int rank and
        int super status
    transcribedFile: optional table whose second column lists the gene ids
        to consider; all genes in the annotation are used when empty
    uniqueGenes: report each gene name only once in the gene table
    searchWindow: bp around the enhancer searched for proximal TSSs
    noFormatTable: keep all input columns instead of the 9 + 3 + 2 layout

    returns (enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable);
    unless noFormatTable is set the first two are sorted by enhancer rank

    side effects: writes a TSS gff next to enhancerFile and runs
    bamliquidator_batch.py on it. calls sys.exit() when mapping fails and
    raises ValueError when bamliquidator_batch.py is not found or the
    enhancer table has no header line
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerName = enhancerFile.split('/')[-1].split('.')[0]
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = list(startDict.keys())

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(
        annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = [utils.makeTSSLocus(geneID, startDict, 0, 0) for geneID in transcribedGenes]

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really
    # matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    geneDict = {'overlapping': defaultdict(list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # find the header: the first line that is not a '#' comment
    header = next((line for line in enhancerTable if line[0][0] != '#'), None)
    if header is None:
        raise ValueError('no header line found in %s' % (enhancerFile))

    # set up the output tables
    # first by enhancer
    geneColumns = ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']
    if noFormatTable:
        enhancerToGeneTable = [header + geneColumns]
    else:
        enhancerToGeneTable = [header[0:9] + geneColumns + header[-2:]]

    # next make the gene to enhancer table
    geneToEnhancerTable = [
        ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]

    for line in enhancerTable:
        # skip comment lines and the header line
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])

        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly
        # in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(enhancerLocus, 'both')
        overlappingGenes = utils.uniquify([locus.ID() for locus in overlappingLoci])

        # proximalGenes are transcribed genes where the tss is within
        # searchWindow of the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = utils.uniquify([locus.ID() for locus in proximalLoci])

        # distal genes (tss within 1mb) only feed the closest gene search
        distalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = utils.uniquify([locus.ID() for locus in distalLoci])

        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes

        # make the proximal list exclusive of the overlapping list.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the search window, but we'll let that slide
        # here
        overlappingSet = set(overlappingGenes)
        proximalGenes = [refID for refID in proximalGenes if refID not in overlappingSet]

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3])) / 2

            # smallest absolute distance from the gene start to the enhancer
            # center; the first gene wins on ties
            closestID = min(
                allEnhancerGenes,
                key=lambda geneID: abs(enhancerCenter - startDict[geneID]['start'][0]))
            # convert the ID to a name
            closestGene = startDict[closestID]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        overlapNames = ','.join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]))
        proximalNames = ','.join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]))
        if noFormatTable:
            newEnhancerLine = list(line) + [overlapNames, proximalNames, closestGene]
        else:
            newEnhancerLine = line[0:9] + [overlapNames, proximalNames, closestGene] + line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)

        # Now grab all overlapping and proximal genes for the gene ordered
        # table
        enhancerRank = int(line[-2])
        enhancerIsSuper = int(line[-1])
        for category, geneIDs in (('overlapping', overlappingGenes), ('proximal', proximalGenes)):
            overallGeneList += geneIDs
            for refID in geneIDs:
                geneDict[category][refID].append(enhancerString)
                rankDict[refID].append(enhancerRank)
                superDict[refID].append(enhancerIsSuper)

    # End loop through

    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overallGeneList = utils.uniquify(overallGeneList)

    enhancerGeneCollection = utils.makeTranscriptCollection(
        annotFile, 5000, 5000, 500, overallGeneList)

    enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection)

    # dump the gff to file
    enhancerFolder = utils.getParentFolder(enhancerFile)
    gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome)
    enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName, gffRootName)
    utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t')

    # now we need to run bamToGFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    # NOTE(review): the fallback is checked with os.path.isfile, i.e. relative
    # to the current working directory rather than searched on $PATH
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    def mapBamToGeneGFF(bamFile, label):
        # maps one bam to the gene TSS gff and returns the expected matrix path
        bamName = bamFile.split('/')[-1]
        mappedFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName, gffRootName, bamName)
        mappedFile = "%s%s_%s_%s/matrix.gff" % (enhancerFolder, enhancerName, gffRootName, bamName)
        cmd = 'python ' + bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (
            enhancerGeneGFFFile, mappedFolder, bamFile)
        print("Mapping %s bam %s" % (label, bamFile))
        print(cmd)

        output = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
        output = output.communicate()
        # anything on stdout is taken as the sign that mapping worked
        if len(output[0]) > 0:
            print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, bamFile))
        else:
            print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, bamFile))
            sys.exit()
        return mappedFile

    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # map density at genes in the +/- 5kb tss region
    # first on the rankBy bam
    mappedRankByFile = mapBamToGeneGFF(rankByBamFile, 'rankby')

    # next on the control bam if it exists
    hasControl = len(controlBamFile) > 0
    if hasControl:
        mappedControlFile = mapBamToGeneGFF(controlBamFile, 'control')

    # now get the appropriate output files
    if hasControl:
        print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" %
              (mappedRankByFile, mappedControlFile))
        if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile, mappedControlFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()
    else:
        print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile))
        if utils.checkOutput(mappedRankByFile, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()

    # use enhancer rank to order
    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = set()

    # make a new dict to hold TSS signal by max per geneName
    geneNameSigDict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        # with uniqueGenes only the best ranked refseq of each gene is kept
        if uniqueGenes and geneName in usedNames:
            continue
        usedNames.add(geneName)

        proxEnhancers = geneDict['overlapping'][refID] + geneDict['proximal'][refID]

        superStatus = max(superDict[refID])
        enhancerRanks = ','.join([str(x) for x in rankDict[refID]])

        enhancerSignal = signalDict[refID]
        geneNameSigDict[geneName].append(enhancerSignal)

        newLine = [geneName, refID, ','.join(proxEnhancers), enhancerRanks, superStatus, enhancerSignal]
        geneToEnhancerTable.append(newLine)

    print('MAKING ENHANCER TO TOP GENE TABLE')

    if noFormatTable:
        enhancerToTopGeneTable = [
            enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']]
    else:
        enhancerToTopGeneTable = [
            enhancerToGeneTable[0][0:12] + ['TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]]

    for line in enhancerToGeneTable[1:]:
        # NOTE(review): with noFormatTable the candidates come from the
        # overlap and proximal columns, while the formatted branch reads
        # fixed columns 10 and 11 - confirm both branches are meant to look
        # at the same pair of gene columns
        if noFormatTable:
            geneList = line[-3].split(',') + line[-2].split(',')
        else:
            geneList = line[10].split(',') + line[11].split(',')

        geneList = utils.uniquify([x for x in geneList if len(x) > 0])

        maxGene = 'NONE'
        maxSig = 'NONE'
        if len(geneList) > 0:
            try:
                sigVector = [max(geneNameSigDict[x]) for x in geneList]
                maxIndex = sigVector.index(max(sigVector))
                # a top signal of zero is reported as no top gene
                if sigVector[maxIndex] != 0.0:
                    maxGene = geneList[maxIndex]
                    maxSig = sigVector[maxIndex]
            except ValueError:
                # max() of an empty list: a candidate has no TSS signal recorded
                if len(geneList) == 1:
                    maxGene = geneList[0]

        if noFormatTable:
            newLine = line + [maxGene, maxSig]
        else:
            newLine = line[0:12] + [maxGene, maxSig] + line[-2:]
        enhancerToTopGeneTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable

    enhancerOrder = utils.order([int(line[-2]) for line in enhancerToGeneTable[1:]])
    sortedTable = [enhancerToGeneTable[0]]
    sortedTopGeneTable = [enhancerToTopGeneTable[0]]
    for i in enhancerOrder:
        # + 1 skips the header row
        sortedTable.append(enhancerToGeneTable[(i + 1)])
        sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)])

    return sortedTable, sortedTopGeneTable, geneToEnhancerTable
def _getBamChromList(bamFile):
    '''
    returns the first tab-delimited field of each line that
    'samtools idxstats' prints for bamFile
    '''
    cmd = 'samtools idxstats %s' % (bamFile)
    idxStats = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    idxStats = idxStats.communicate()
    # the last two entries of the newline split are dropped
    # (presumably the '*' unmapped row and the trailing empty string)
    return [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]]


def _mapBamToGFF(bamFile, bamLabel, gffFile, mappedFolder, mappedFile):
    '''
    runs bamliquidator_batch (must be on the PATH) for bamFile over gffFile,
    writing into mappedFolder. exits with a non-zero status if mappedFile
    is not detected afterwards. bamLabel is only used in the log message
    '''
    cmd = 'bamliquidator_batch --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (
        gffFile, mappedFolder, bamFile)
    print("Mapping %s bam %s" % (bamLabel, bamFile))
    print(cmd)
    os.system(cmd)

    # check for completion
    if utils.checkOutput(mappedFile, 0.2, 5):
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (gffFile, bamFile))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (gffFile, bamFile))
        sys.exit(1)


def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only.
    Otherwise, gives for each refseq

    for every enhancer row the overlapping genes, the genes whose tss is
    within searchWindow, and the closest gene (searched out to 1mb) are
    recorded. signal is then mapped at the +/- 5kb tss regions of all
    overlapping/proximal genes with bamliquidator_batch so that each
    enhancer can be assigned the associated gene with the highest tss signal

    rankByBamFile: bam used to map tss signal
    controlBamFile: control bam, '' if there is none
    genome: genome name, only used to name the output gff
    annotFile: annotation file handed to the utils start dict/transcript helpers
    enhancerFile: tab delimited enhancer table. column 0 is used as the id,
        columns 1-3 as the locus coordinates, and the last two columns
        as the enhancer rank and super status
    transcribedFile: optional table whose column 1 lists the genes to consider.
        if empty every gene in the annotation is used
    searchWindow: distance used to call a tss proximal to an enhancer
    noFormatTable: if True the enhancer rows are passed through whole and not
        re-sorted. otherwise the first 9 and last 2 columns are kept and
        the tables are sorted by enhancer rank

    returns enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable
    raises ValueError if the enhancer table has no non-comment line
    exits with status 1 if the bam mapping output cannot be found
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerName = enhancerFile.split('/')[-1].split('.')[0]
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(
        annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = [utils.makeTSSLocus(geneID, startDict, 0, 0)
               for geneID in transcribedGenes]

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really
    # matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    geneDict = {'overlapping': defaultdict(list),
                'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # the header is the first line that is not a '#' comment
    header = None
    for line in enhancerTable:
        if line[0][0] != '#':
            header = line
            break
    if header is None:
        raise ValueError('no header line found in %s' % (enhancerFile))

    # set up the output tables
    # first by enhancer
    geneColumns = ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']
    if noFormatTable:
        enhancerToGeneTable = [header + geneColumns]
    else:
        enhancerToGeneTable = [header[0:9] + geneColumns + header[-2:]]

    # next by gene
    geneToEnhancerTable = [
        ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]

    for line in enhancerTable:
        # skip comment lines and the header (any row whose id starts with R)
        if line[0][0] in ('#', 'R'):
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])
        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly
        # in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(
            enhancerLocus, 'both')
        overlappingGenes = [locus.ID() for locus in overlappingLoci]

        # proximalGenes are transcribed genes where the tss is within
        # searchWindow of the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = [locus.ID() for locus in proximalLoci]

        # distalGenes (tss within 1mb) only feed the closest gene search
        distalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = [locus.ID() for locus in distalLoci]

        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes

        # make sure a gene is not reported as both overlapping and proximal.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the search window, but we'll let that slide
        # here
        proximalGenes = [refID for refID in proximalGenes
                         if refID not in overlappingGenes]

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter (floor division, as for python 2 ints)
            enhancerCenter = (int(line[2]) + int(line[3])) // 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in allEnhancerGenes]
            # get the ID and convert to name
            closestGene = startDict[
                allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        geneFields = [
            join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','),
            join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','),
            closestGene,
        ]
        if noFormatTable:
            newEnhancerLine = list(line) + geneFields
        else:
            newEnhancerLine = line[0:9] + geneFields + line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)

        # Now grab all overlapping and proximal genes for the gene ordered
        # table
        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

    # End loop through

    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overallGeneList = utils.uniquify(overallGeneList)

    # get the chromLists from the various bams here
    # only chroms present in every bam are kept
    bamChromList = _getBamChromList(rankByBamFile)
    if len(controlBamFile) > 0:
        bamChromListControl = _getBamChromList(controlBamFile)
        bamChromList = [chrom for chrom in bamChromList
                        if bamChromListControl.count(chrom) != 0]

    # now make sure no genes have a bad chrom
    overallGeneList = [gene for gene in overallGeneList
                       if bamChromList.count(startDict[gene]['chr']) != 0]

    # now make an enhancer collection of all transcripts
    enhancerGeneCollection = utils.makeTranscriptCollection(
        annotFile, 5000, 5000, 500, overallGeneList)

    enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection)

    # dump the gff to file
    enhancerFolder = utils.getParentFolder(enhancerFile)
    gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome)
    enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName, gffRootName)
    utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t')

    # now we need to run bamliquidator_batch (expected to be on the PATH)
    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # map density at genes in the +/- 5kb tss region
    # first on the rankBy bam
    bamName = rankByBamFile.split('/')[-1]
    mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName, gffRootName, bamName)
    mappedRankByFile = "%s%s_%s_%s/matrix.txt" % (enhancerFolder, enhancerName, gffRootName, bamName)
    _mapBamToGFF(rankByBamFile, 'rankby', enhancerGeneGFFFile,
                 mappedRankByFolder, mappedRankByFile)

    # next on the control bam if it exists
    if len(controlBamFile) > 0:
        controlName = controlBamFile.split('/')[-1]
        mappedControlFolder = "%s%s_%s_%s/" % (
            enhancerFolder, enhancerName, gffRootName, controlName)
        mappedControlFile = "%s%s_%s_%s/matrix.txt" % (
            enhancerFolder, enhancerName, gffRootName, controlName)
        _mapBamToGFF(controlBamFile, 'control', enhancerGeneGFFFile,
                     mappedControlFolder, mappedControlFile)

    # now get the appropriate output files
    if len(controlBamFile) > 0:
        print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" %
              (mappedRankByFile, mappedControlFile))
        if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile, mappedControlFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit(1)
    else:
        print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile))
        if utils.checkOutput(mappedRankByFile, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit(1)

    # use enhancer rank to order
    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    # gene names already written to the gene table
    usedNames = set()

    # make a new dict to hold TSS signal by max per geneName
    geneNameSigDict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if uniqueGenes == True and geneName in usedNames:
            continue
        usedNames.add(geneName)

        proxEnhancers = geneDict['overlapping'][
            refID] + geneDict['proximal'][refID]

        superStatus = max(superDict[refID])
        enhancerRanks = join([str(x) for x in rankDict[refID]], ',')

        enhancerSignal = signalDict[refID]
        geneNameSigDict[geneName].append(enhancerSignal)

        newLine = [geneName, refID, join(
            proxEnhancers, ','), enhancerRanks, superStatus, enhancerSignal]
        geneToEnhancerTable.append(newLine)

    print('MAKING ENHANCER TO TOP GENE TABLE')

    if noFormatTable:
        enhancerToTopGeneTable = [
            enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']]
    else:
        enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [
            'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]]

    for line in enhancerToGeneTable[1:]:

        geneList = []
        if noFormatTable:
            geneList += line[-3].split(',')
            geneList += line[-2].split(',')
        else:
            # NOTE(review): given the header built above, columns 10 and 11
            # are PROXIMAL_GENES and CLOSEST_GENE whereas the noFormatTable
            # branch reads OVERLAP_GENES and PROXIMAL_GENES - confirm which
            # pair is intended
            geneList += line[10].split(',')
            geneList += line[11].split(',')

        geneList = utils.uniquify([x for x in geneList if len(x) > 0])
        if len(geneList) > 0:
            try:
                # max() raises ValueError for a gene with no recorded signal
                sigVector = [max(geneNameSigDict[x]) for x in geneList]
                maxIndex = sigVector.index(max(sigVector))
                maxGene = geneList[maxIndex]
                maxSig = sigVector[maxIndex]
                if maxSig == 0.0:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
            except ValueError:
                if len(geneList) == 1:
                    maxGene = geneList[0]
                    maxSig = 'NONE'
                else:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
        else:
            maxGene = 'NONE'
            maxSig = 'NONE'

        if noFormatTable:
            newLine = line + [maxGene, maxSig]
        else:
            newLine = line[0:12] + [maxGene, maxSig] + line[-2:]
        enhancerToTopGeneTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable

    enhancerOrder = utils.order([int(line[-2])
                                 for line in enhancerToGeneTable[1:]])
    sortedTable = [enhancerToGeneTable[0]]
    sortedTopGeneTable = [enhancerToTopGeneTable[0]]
    for i in enhancerOrder:
        sortedTable.append(enhancerToGeneTable[(i + 1)])
        sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)])

    return sortedTable, sortedTopGeneTable, geneToEnhancerTable
def main():
    '''
    runs the main analysis for the MYCN project from the project folder

    most steps are commented out so that they can be toggled on one at a
    time. the steps that currently run are the data table summary and
    section VI, which builds a stitched gff of the THMYCN MYCN bound regions
    and counts how many of them (+/- 10kb) overlap the conserved NB
    enhancer and promoter regions
    '''

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    print('\n\n')
    print( '#======================================================================' )
    print( '#======================I, LOADING DATA ANNOTATION======================' )
    print( '#======================================================================' )
    print('\n\n')

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible

    #for ChIP-Seq
    pipeline_dfci.summary(mouse_dataFile)

    print('\n\n')
    print( '#======================================================================' )
    print( '#==========================II. CALLING MACS============================' )
    print( '#======================================================================' )
    print('\n\n')

    #running peak finding using macs 1.4.2 on all chip datasets
    #this usually takes ~2-3 hours on a reasonably fast machine
    #a 3 hour time out on this entire operation is set
    #if peak calling takes longer than 3 hours, simply run the script again after completion
    #run_macs(mouse_dataFile)

    print('\n\n')
    print( '#======================================================================' )
    print( '#=================II. DEFINING ACTIVE GENES IN MOUSE===================' )
    print( '#======================================================================' )
    print('\n\n')

    #here we will identify active promoters in various contexts as those with
    #an H3K27AC peak in the +/- 1kb tss region
    #UCSC refseq annotations are used for all genes
    #make_active_gene_lists(mouse_dataFile)

    print('\n\n')
    print( '#======================================================================' )
    print( '#==================III. CALLING ROSE TO MAP ENHANCERS==================' )
    print( '#======================================================================' )
    print('\n\n')

    # #for SCG_H3K27AC
    # analysisName = 'SCG_H3K27AC'
    # namesList = ['SCG_H3K27Ac']
    # bashFileName,region_map_path,namesList=define_enhancer_landscape(mouse_dataFile,analysisName,namesList)

    # #for CG_H3K27AC
    # analysisName = 'CG_H3K27AC'
    # namesList = ['CG_H3K27Ac']
    # bashFileName,region_map_path,namesList=define_enhancer_landscape(mouse_dataFile,analysisName,namesList)

    # #for GANGLIA_H3K27AC
    # analysisName = 'GANGLIA_H3K27AC'
    # namesList = ['CG_H3K27Ac','SCG_H3K27Ac']
    # bashFileName,region_map_path,namesList=define_enhancer_landscape(mouse_dataFile,analysisName,namesList)

    # #for THMYCN
    # analysisName = 'THMYCN_H3K27AC'
    # namesList = ['THMYCN_139076_H3K27Ac','THMYCN_139423_H3K27Ac','THMYCN1_H3K27Ac']
    # bashFileName,region_map_path,namesList=define_enhancer_landscape(mouse_dataFile,analysisName,namesList)

    print('\n\n')
    print( '#======================================================================' )
    print( '#=================IV. LIFTING OVER NB CONSERVED REGIONS================' )
    print( '#======================================================================' )
    print('\n\n')

    # #liftover a pair of gffs
    # #first convert to bed
    # nb_promoter_gff_path = '%sgff/HG19_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.gff' % (hg19_projectFolder)
    # nb_enhancer_gff_path = '%sgff/HG19_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.gff' % (hg19_projectFolder)

    # nb_promoter_bed_path ='%sbeds/HG19_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.bed' % (hg19_projectFolder)
    # nb_enhancer_bed_path ='%sbeds/HG19_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.bed' % (hg19_projectFolder)

    # nb_promoter_gff = utils.parseTable(nb_promoter_gff_path,'\t')
    # nb_enhancer_gff = utils.parseTable(nb_enhancer_gff_path,'\t')

    # utils.gffToBed(nb_promoter_gff,nb_promoter_bed_path)
    # utils.gffToBed(nb_enhancer_gff,nb_enhancer_bed_path)

    # print('converted NB conserved gffs to beds at %s and %s' % (nb_promoter_bed_path,nb_enhancer_bed_path))

    # #note, now you have to liftover manually to create beds
    # mm9_promoter_bed_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.bed' % (bedFolder)
    # mm9_enhancer_bed_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.bed' % (bedFolder)

    # mm9_promoter_gff_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.gff' % (gffFolder)
    # mm9_enhancer_gff_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.gff' % (gffFolder)

    # utils.bedToGFF(mm9_promoter_bed_path,mm9_promoter_gff_path)
    # utils.bedToGFF(mm9_enhancer_bed_path,mm9_enhancer_gff_path)

    # print('writing mm9 nb mycn sites to %s and %s' % (mm9_promoter_gff_path,mm9_enhancer_gff_path))

    print('\n\n')
    print( '#======================================================================' )
    print( '#======================V. MAPPING ENRICHED TO GFFS=====================' )
    print( '#======================================================================' )
    print('\n\n')

    # setName = 'THMYCN'
    # gffList = [mm9_promoter_gff_path,mm9_enhancer_gff_path]
    # cellTypeList = ['THMYCN1','THMYCN2','THMYCN','CG','SCG']
    # mapList = ['CG_H3K27Ac',
    #            'SCG_H3K27Ac',
    #            'THMYCN1_H3K27Ac',
    #            'THMYCN_139423_H3K27Ac',
    #            'THMYCN_139076_H3K27Ac',
    #            ]

    # #pipeline_dfci.mapEnrichedToGFF(mouse_dataFile,setName,gffList,cellTypeList,macsEnrichedFolder,mappedEnrichedFolder,macs=True,namesList=mapList,useBackground=True)

    # #summarize info for venn diagrams for each
    # promoter_mapped_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000/MM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000_THMYCN.txt' % (mappedEnrichedFolder)
    # promoter_venn_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000_VENN.txt' % (tableFolder)
    # summarizeVenn(promoter_mapped_path,group_list = ['CG','THMYCN'],output=promoter_venn_path)

    # enhancer_mapped_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000/MM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000_THMYCN.txt' % (mappedEnrichedFolder)
    # enhancer_venn_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000_VENN.txt' % (tableFolder)
    # summarizeVenn(enhancer_mapped_path,group_list = ['CG','THMYCN'],output=enhancer_venn_path)

    print('\n\n')
    print( '#======================================================================' )
    print( '#=====================VI. MAKING MYCN REGIONS GFF======================' )
    print( '#======================================================================' )
    print('\n\n')

    dataDict = pipeline_dfci.loadDataTable(mouse_dataFile)
    names_list = [
        'THMYCN2_MYCN',
        'THMYCN_139076_MYCN',
        'THMYCN_139423_MYCN',
    ]

    #pool the enriched regions of every MYCN dataset
    mycn_loci = []
    for name in names_list:
        mycn_collection = utils.importBoundRegion(
            '%s%s' % (macsEnrichedFolder, dataDict[name]['enrichedMacs']), name)
        mycn_loci += mycn_collection.getLoci()

    mycn_collection = utils.LocusCollection(mycn_loci, 50)
    #stitchCollection returns the stitched collection, so its result has to be
    #kept. otherwise the gff below is written from the unstitched loci
    stitched_mycn_collection = mycn_collection.stitchCollection()
    mycn_gff = utils.locusCollectionToGFF(stitched_mycn_collection)
    mycn_gff_path = '%sMM9_THMYCN_MYCN_-0_+0.gff' % (gffFolder)
    utils.unParseTable(mycn_gff, mycn_gff_path, '\t')

    #make collections
    promoter_collection = utils.gffToLocusCollection(
        '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.gff' % (gffFolder))
    enhancer_collection = utils.gffToLocusCollection(
        '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.gff' % (gffFolder))

    #make the overlap table
    overlap_table = [['PROMOTER', 'ENHANCER', 'NONE']]
    promoter_count = 0
    enhancer_count = 0
    none_count = 0
    for line in mycn_gff:
        #each region is extended by 10kb on both sides before testing overlap
        locus = utils.Locus(line[0], int(line[3]) - 10000, int(line[4]) + 10000, '.')
        #enhancer overlap takes priority over promoter overlap
        if enhancer_collection.getOverlap(locus, 'both'):
            enhancer_count += 1
            continue

        if promoter_collection.getOverlap(locus, 'both'):
            promoter_count += 1
        else:
            none_count += 1

    overlap_table.append([promoter_count, enhancer_count, none_count])
    overlap_table_path = '%sMM9_THMYCN_OVERLAP.txt' % (tableFolder)
    utils.unParseTable(overlap_table, overlap_table_path, '\t')

    print('\n\n')
    print( '#======================================================================' )
    print( '#=====================VI. MAPPING GFFS FOR HEATMAP=====================' )
    print( '#======================================================================' )
    print('\n\n')

    #map_for_heatmap(mouse_dataFile)

    print('\n\n')
    print( '#======================================================================' )
    print( '#=====================VII. AVERAGING MAPPED SIGNAL=====================' )
    print( '#======================================================================' )
    print('\n\n')

    # set_list = ['GANGLIA_H3K27AC','THMYCN_H3K27AC','THMYCN_MYCN']
    # set_names = [
    #     ['CG_H3K27Ac','SCG_H3K27Ac'],
    #     ['THMYCN1_H3K27Ac','THMYCN_139423_H3K27Ac','THMYCN_139076_H3K27Ac'],
    #     ['THMYCN2_MYCN','THMYCN_139076_MYCN','THMYCN_139423_MYCN']
    #     ]
    # for i in range(len(set_list)):
    #     setName = set_list[i]
    #     names_list =set_names[i]
    #     print(setName)
    #     print(names_list)
    #     #for promoters
    #     mapped_list = ['%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000/MM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000_%s.gff' % (mappedFolder,name) for name in names_list]
    #     output_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000/MM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000_%s.gff' % (mappedFolder,setName)
    #     print(output_path)
    #     averagingMappedSignal(mapped_list,output_path,setName)

    #     #for enhancers
    #     mapped_list = ['%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000/MM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000_%s.gff' % (mappedFolder,name) for name in names_list]
    #     output_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000/MM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000_%s.gff' % (mappedFolder,setName)
    #     print(output_path)
    #     averagingMappedSignal(mapped_list,output_path,setName)

    print('\n\n')
    print( '#======================================================================' )
    print( '#=====================VIII. MAKING HEATMAPS/METAS======================' )
    print( '#======================================================================' )
    print('\n\n')
def mergeCollections(enhancerFile1,enhancerFile2,name1,name2,output='',inputGFF=''):
    '''
    merges two enhancer collections into one set of stitched loci and
    renames each locus by which of the input collections it overlaps

    enhancerFile1, enhancerFile2: enhancer files handed to makeSECollection
    name1, name2: names for the two collections, used as locus ID prefixes
    output: path to write the merged gff to. if empty the gff is returned
    inputGFF: optional gff of regions to stitch and rename in place of the
        pooled loci of the two collections

    loci overlapping both collections are named CONSERVED_<n>, loci
    overlapping only the first are named <name1>_<n>, and all others are
    named <name2>_<n>

    returns the gff table if output is empty, otherwise the output path
    '''
    print(enhancerFile1)
    print(enhancerFile2)
    name1Collection = makeSECollection(enhancerFile1,name1)
    name2Collection = makeSECollection(enhancerFile2,name2)

    print(len(name1Collection))
    print(len(name2Collection))

    if len(inputGFF) == 0:
        #now merge them
        mergedLoci = name1Collection.getLoci() + name2Collection.getLoci()
        mergedCollection = utils.LocusCollection(mergedLoci,50)
    else:
        mergedCollection = utils.gffToLocusCollection(inputGFF)

    #stitch the collection together
    stitchedCollection = mergedCollection.stitchCollection()
    stitchedLoci = stitchedCollection.getLoci()

    #rename loci by presence in group1 or group2
    renamedLoci = []
    conserved_ticker = 1
    name1_ticker = 1
    name2_ticker = 1
    for locus in stitchedLoci:
        #each locus has to be tested against BOTH collections
        inName1 = len(name1Collection.getOverlap(locus)) > 0
        inName2 = len(name2Collection.getOverlap(locus)) > 0

        if inName1 and inName2:
            newID = 'CONSERVED_%s' % (str(conserved_ticker))
            conserved_ticker += 1
        elif inName1:
            newID = '%s_%s' % (name1,str(name1_ticker))
            name1_ticker += 1
        else:
            #NOTE(review): with an inputGFF a locus may overlap neither
            #collection. it is still labeled with name2 here - confirm
            newID = '%s_%s' % (name2,str(name2_ticker))
            name2_ticker += 1

        locus._ID = newID
        renamedLoci.append(locus)

    #now we turn this into a gff and write it out
    gff = utils.locusCollectionToGFF(utils.LocusCollection(renamedLoci,50))

    if len(output) == 0:
        return gff
    else:
        #single-argument call form prints the same under python 2 and 3
        print("writing merged gff to %s" % (output))
        utils.unParseTable(gff,output,'\t')
        return output