def main():
    '''
    main run call for ROSE2

    Parses the command line flags, creates the output/gff/mappedGFF folders,
    converts (.bed) or copies (anything else) every input region file into the
    gff folder, builds a single master GFF with renumbered region ids, filters
    it to the chromosomes found in the bams, optionally masks regions,
    stitches the regions, maps every unique bam to the stitched GFF with
    bamliquidator, and finally calls the R super-enhancer script and the gene
    mapper on the resulting tables.

    Relies on module-level names defined elsewhere in this file: utils, os,
    sys, pipeline_dir, bamliquidator_path, getBamChromList, filterGFF,
    checkRefCollection, regionStitching, mapCollection and collapseRegionMap.

    Calls sys.exit() when a required flag is missing, when the number of
    control bams is neither 1 nor equal to the number of rankby bams, when
    the genome is not in the annotation table, when the mask file is neither
    .bed nor .gff, or when a bam cannot be mapped.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)

    # required flags
    parser.add_option(
        "-i", "--i", dest="input", nargs=1, default=None,
        help="Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers")
    parser.add_option(
        "-r", "--rankby", dest="rankby", nargs=1, default=None,
        help="Enter a comma separated list of bams to rank by")
    parser.add_option(
        "-o", "--out", dest="out", nargs=1, default=None,
        help="Enter an output folder")
    parser.add_option(
        "-g", "--genome", dest="genome", nargs=1, default=None,
        help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option(
        "-n", "--name", dest="name", nargs=1, default=None,
        help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option(
        "-c", "--control", dest="control", nargs=1, default=None,
        help="Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam")
    parser.add_option(
        "-s", "--stitch", dest="stitch", nargs=1, default='',
        help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option(
        "-t", "--tss", dest="tss", nargs=1, default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")
    parser.add_option(
        "--mask", dest="mask", nargs=1, default=None,
        help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        # sys.exit() rather than the interactive-only exit() helper,
        # matching every other exit in this function
        sys.exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)
    inputList = [
        inputFile for inputFile in options.input.split(',')
        if len(inputFile) > 1
    ]

    # converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        # the extension is taken from THIS file; the previous code inspected
        # the whole comma separated options.input string in the .gff branch
        extension = inputFile.split('.')[-1]
        if extension == 'bed':
            # CONVERTING A BED TO GFF
            # strips the last 4 characters i.e. '.bed'
            inputGFFName = inputFile.split('/')[-1][0:-4]
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        else:
            if extension != 'gff':
                print(
                    'WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT'
                )
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)

    # GETTING THE LIST OF BAMFILES TO PROCESS
    # either same number of bams for rankby and control
    # or only 1 control
    # or none!
    # bamlist should be all rankby bams followed by control bams
    rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) > 0]
    if options.control:
        controlBamList = [
            bam for bam in options.control.split(',') if len(bam) > 0
        ]
        if len(controlBamList) == len(rankbyBamList):
            # case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            # case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList * len(rankbyBamList)
        else:
            print(
                'ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE'
            )
            sys.exit()
    else:
        bamFileList = rankbyBamList

    # Stitch parameter: '' is passed through to regionStitching unchanged
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE GENOME
    # str.upper() works on both Python 2 and 3; string.upper() is Python 2 only
    genome = options.genome.upper()
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit()

    # FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        # default to the base name of the first input gff, up to the first '.'
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))

    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    # use a simpler unique region naming system
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0], '\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile, '\t')
            gffCollection = utils.gffToLocusCollection(gff, 50)
            inputLoci += gffCollection.getLoci()

        inputCollection = utils.LocusCollection(inputLoci, 50)
        # stitches to produce unique regions
        inputCollection = inputCollection.stitchCollection()
        inputGFF = utils.locusCollectionToGFF(inputCollection)

    # now number things appropriately
    formattedGFF = []
    for i, line in enumerate(inputGFF):
        # every region gets a new id of the form inputName_N
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]

        lineID = '%s_%s' % (inputName, str(i + 1))  # 1 indexing

        newLine = [
            chrom, lineID, lineID,
            min(coords), max(coords), '', sense, '', lineID
        ]
        formattedGFF.append(newLine)

    # name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder, genome, inputName)
    utils.unParseTable(formattedGFF, masterGFFFile, '\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    # GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile, bamChromList)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci: keep only loci with no mask overlap
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [
            locus for locus in referenceLoci
            if len(maskCollection.getOverlap(locus, 'both')) == 0
        ]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" %
              (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        referenceCollection, inputName, outFolder, stitchWindow, tssWindow,
        annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    print(stitchWindow)
    print(type(stitchWindow))

    # NOTE(review): under Python 3 '/' is true division, so an int window of
    # 12500 is rendered as '12.5' in the file names below; the expression is
    # kept as-is to preserve existing output names - confirm whether '//' is
    # wanted.
    stitchKB = str(stitchWindow / 1000)
    if not removeTSS:
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchKB)
    else:
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, stitchKB)
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1))

    # MAPPING TO THE STITCHED GFF
    # each distinct bam is mapped once, even if it appears several times in
    # bamFileList (e.g. a universal control)
    bamFileListUnique = utils.uniquify(list(bamFileList))
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName,
                                               bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (
            mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            # mapping output is already present, nothing to run
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile,
                                                         mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" %
                      (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" %
                      (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList,
                  mappedFolder, outputFile1, refName=stitchedGFFName)

    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1,
                                       inputName + '_MERGED_SIGNAL',
                                       controlBams=options.control)

    # now try the merging
    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (
        pipeline_dir, outFolder, metaOutputFile, inputName, controlName)
    print(cmd)
    os.system(cmd)

    # calling the gene mapper on each table; each command is echoed before
    # it runs
    print('CALLING GENE MAPPING')

    # for now don't use ranking bam to call top genes
    tableTemplates = (
        "%s_SuperEnhancers.table.txt",
        "%s_StretchEnhancers.table.txt",
        "%s_SuperStretchEnhancers.table.txt",
    )
    for tableTemplate in tableTemplates:
        tableFile = tableTemplate % (inputName)
        cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (
            pipeline_dir, genome, outFolder, tableFile)
        print(cmd)
        os.system(cmd)
def main():
    '''
    main run method for enhancer promoter contribution tool

    Parses the command line, loads the input regions (.bed is converted to
    gff, anything else is parsed as a tab separated table), loads the genome
    annotation via loadAnnotFile, keeps only regions on the chromosomes it
    returns, splits regions with splitRegions, maps the bams (and optional
    control bams) with mapBams, then builds the average, peak and gene tables.
    For genome names containing 'MM' exactly once the gene names are converted
    through mouse_convert_dict. Finally calls the R waterfall plotting, GSEA,
    and the NES plotting helpers.

    Relies on module-level names defined elsewhere in this file: argparse,
    utils, loadAnnotFile, splitRegions, mapBams, makeAverageTable,
    makePeakTable, makeGeneTable, callRWaterfall, callGSEA, detectGSEAOutput
    and callR_GSEA.

    Exits through parser.error() when the activity table is empty or has no
    NM_/NR_ column in its first row.
    '''

    parser = argparse.ArgumentParser(usage='%(prog)s [options]')

    # required flags
    parser.add_argument(
        "-b", "--bam", dest="bam", nargs='*',
        help="Enter a space separated list of .bam files for the main factor",
        required=True)
    parser.add_argument(
        "-i", "--input", dest="input", type=str,
        help="Enter .gff or .bed file of regions to analyze",
        required=True)
    parser.add_argument(
        "-g", "--genome", dest="genome", type=str,
        help="specify a genome, HG18,HG19,HG38,MM8,MM9,MM10,RN6 are currently supported",
        required=True)

    # output flag
    parser.add_argument(
        "-o", "--output", dest="output", type=str,
        help="Enter the output folder.",
        required=True)

    # additional options flags and optional arguments
    parser.add_argument(
        "-a", "--activity", dest="activity", type=str,
        help="specify a table where first column represents a list of active refseq genes",
        required=False)
    parser.add_argument(
        "-c", "--control", dest="control", nargs='*',
        help="Enter a space separated list of .bam files for background. If flagged, will perform background subtraction",
        required=False)
    parser.add_argument(
        "-w", "--window", dest="window", type=int,
        help="Enter a window to define the TSS area +/- the TSS. Default is 1kb",
        required=False, default=1000)
    parser.add_argument(
        "--other-bams", dest="other", nargs='*',
        help="enter a space separated list of other bams to map to",
        required=False)
    parser.add_argument(
        "--name", dest="name", type=str,
        help="enter a root name for the analysis, otherwise will try to find the name from the input file",
        required=False)
    parser.add_argument(
        "--top", dest="top", type=int,
        help="Run the analysis on the top N genes by total signal. Default is 5000",
        required=False, default=5000)
    parser.add_argument(
        "--tads", dest="tads", type=str,
        help="Include a .bed of tad regions to restrict enhancer/gene association",
        required=False, default=None)

    args = parser.parse_args()

    print(args)

    # minimum arguments needed to proceed
    # argparse enforces presence, but e.g. "-b" with no values still yields an
    # empty list; show the help instead of silently doing nothing
    if not (args.bam and args.input and args.genome and args.output):
        parser.print_help()
        return

    #=====================================================================================
    #===============================I. PARSING ARGUMENTS==================================
    #=====================================================================================

    print(
        '\n\n#======================================\n#===========I. DATA SUMMARY============\n#======================================\n'
    )

    # top analysis subset
    top = args.top

    # input genome
    genome = args.genome.upper()
    print('PERFORMING ANALYSIS ON %s GENOME BUILD' % (genome))

    # set of bams
    bamFileList = args.bam

    # bring in the input path
    inputPath = args.input

    # try to get the input name or use the name argument
    if args.name:
        analysisName = args.name
    else:
        analysisName = inputPath.split('/')[-1].split('.')[0]

    print('USING %s AS ANALYSIS NAME' % (analysisName))

    # setting up the output folder
    parentFolder = utils.formatFolder(args.output, True)
    outputFolder = utils.formatFolder(
        '%s%s' % (parentFolder, analysisName), True)

    print('WRITING OUTPUT TO %s' % (outputFolder))

    if inputPath.split('.')[-1] == 'bed':
        # type is bed
        print('input in bed format, converting to gff')
        inputGFF = utils.bedToGFF(inputPath)
    else:
        inputGFF = utils.parseTable(inputPath, '\t')

    # the tss window
    window = int(args.window)

    # activity path
    if args.activity:
        activityPath = args.activity
        activityTable = utils.parseTable(activityPath, '\t')
        if not activityTable:
            parser.error('activity table %s is empty' % (activityPath))

        # try to find the column for refseq id; as before, the LAST column
        # of the first row containing NM_ or NR_ wins
        ref_col = None
        for i, cell in enumerate(activityTable[0]):
            if 'NM_' in str(cell) or 'NR_' in str(cell):
                ref_col = i
        if ref_col is None:
            # previously this fell through to a NameError on ref_col
            parser.error(
                'could not find a column containing NM_ or NR_ ids in the first row of %s'
                % (activityPath))

        # this needs to be REFSEQ NM ID
        geneList = [line[ref_col] for line in activityTable]
        print('IDENTIFIED %s ACTIVE GENES' % (len(geneList)))
    else:
        geneList = []

    # check if tads are being invoked
    if args.tads:
        print('LOADING TAD LOCATIONS FROM %s' % (args.tads))
        tads_path = args.tads
    else:
        tads_path = ''

    print('LOADING ANNOTATION DATA FOR GENOME %s' % (genome))

    # important here to define the window
    startDict, tssCollection, genomeDirectory, chrom_list, mouse_convert_dict = loadAnnotFile(
        genome, window, geneList, True)

    print('FILTERING THE INPUT GFF FOR GOOD CHROMOSOMES')

    print(chrom_list)
    filtered_gff = [line for line in inputGFF if line[0] in chrom_list]

    print('%s of INITIAL %s REGIONS ARE IN GOOD CHROMOSOMES' %
          (len(filtered_gff), len(inputGFF)))

    #=====================================================================================
    #================II. IDENTIFYING TSS PROXIMAL AND DISTAL ELEMENTS=====================
    #=====================================================================================

    print(
        '\n\n#======================================\n#==II. MAPPING TO TSS/DISTAL REGIONS===\n#======================================\n'
    )

    # now we need to split the input region
    print('SPLITTING THE INPUT GFF USING A WINDOW OF %s' % (window))
    splitGFF = splitRegions(filtered_gff, tssCollection)
    print(len(filtered_gff))
    print(len(splitGFF))

    splitGFFPath = '%s%s_SPLIT.gff' % (outputFolder, analysisName)
    utils.unParseTable(splitGFF, splitGFFPath, '\t')
    print('WRITING TSS SPLIT GFF OUT TO %s' % (splitGFFPath))

    # now you have to map the bams to the gff
    print('MAPPING TO THE SPLIT GFF')
    mappedFolder = utils.formatFolder('%sbam_mapping' % (outputFolder), True)

    signalTable = mapBams(bamFileList, splitGFFPath, analysisName,
                          mappedFolder)
    signalTablePath = '%s%s_signal_table.txt' % (outputFolder, analysisName)
    utils.unParseTable(signalTable, signalTablePath, '\t')

    if args.control:
        controlBamFileList = args.control
        controlSignalTable = mapBams(controlBamFileList, splitGFFPath,
                                     analysisName, mappedFolder)
        controlSignalTablePath = '%s%s_control_signal_table.txt' % (
            outputFolder, analysisName)
        utils.unParseTable(controlSignalTable, controlSignalTablePath, '\t')

    # now create the background subtracted summarized average table
    print('CREATING AN AVERAGE SIGNAL TABLE')
    averageTable = makeAverageTable(outputFolder, analysisName,
                                    useBackground=args.control)
    averageTablePath = '%s%s_average_table.txt' % (outputFolder, analysisName)
    utils.unParseTable(averageTable, averageTablePath, '\t')

    # now load up all of the cpg and other parameters to make the actual peak table

    # first check if this has already been done
    peakTablePath = '%s%s_PEAK_TABLE.txt' % (outputFolder, analysisName)
    if utils.checkOutput(peakTablePath, 0.1, 0.1):
        print('PEAK TABLE OUTPUT ALREADY EXISTS')
        peakTable = utils.parseTable(peakTablePath, '\t')
    else:
        # NOTE(review): paramDict is not defined inside this function;
        # presumably a module-level global - confirm it exists at call time
        peakTable = makePeakTable(paramDict, splitGFFPath, averageTablePath,
                                  startDict, geneList, genomeDirectory,
                                  tads_path)
        utils.unParseTable(peakTable, peakTablePath, '\t')

    geneTable = makeGeneTable(peakTable, analysisName)

    geneTablePath = '%s%s_GENE_TABLE.txt' % (outputFolder, analysisName)
    utils.unParseTable(geneTable, geneTablePath, '\t')

    # if mouse, need to convert genes over
    if genome.count('MM') == 1:
        print('CONVERTING MOUSE NAMES TO HUMAN HOMOLOGS FOR GSEA')
        converted_geneTablePath = '%s%s_GENE_TABLE_CONVERTED.txt' % (
            outputFolder, analysisName)

        # row 0 is the header and is carried over unchanged
        converted_geneTable = [geneTable[0]]
        for line in geneTable[1:]:
            converted_name = mouse_convert_dict[line[0]]
            # genes with an empty converted name are dropped
            if len(converted_name) > 0:
                converted_geneTable.append([converted_name] + line[1:])

        utils.unParseTable(converted_geneTable, converted_geneTablePath, '\t')

        geneTablePath = converted_geneTablePath
        geneTable = converted_geneTable

    #=====================================================================================
    #===================================III. PLOTTING ====================================
    #=====================================================================================

    print(
        '\n\n#======================================\n#===III. PLOTTING ENHANCER/PROMOTER===\n#======================================\n'
    )

    # if there are fewer genes in the gene table than the top genes, only run on all
    # the gene table has a header row, so the gene count is len - 1; the old
    # check compared len(geneTable) itself and was off by one at the boundary
    geneCount = len(geneTable) - 1
    if geneCount < int(top):
        print(
            'WARNING: ONLY %s GENES WITH SIGNAL AT EITHER PROMOTERS OR ENHANCERS. NOT ENOUGH TO RUN ANALYSIS ON TOP %s'
            % (geneCount, top))
        top = 0
        use_top = False
    else:
        use_top = True

    # now call the R code
    print('CALLING R PLOTTING SCRIPTS')
    callRWaterfall(geneTablePath, outputFolder, analysisName, top)

    #=====================================================================================
    #==================================IV. RUNNING GSEA===================================
    #=====================================================================================

    print(
        '\n\n#======================================\n#============IV. RUNNING GSEA=========\n#======================================\n'
    )

    # now let's call gsea
    print('RUNNING GSEA ON C2')
    callGSEA(outputFolder, analysisName, top, 'enhancer_vs_promoter', use_top)
    callGSEA(outputFolder, analysisName, top, 'total_contribution', use_top)

    if use_top:
        print('DETECTING GSEA OUTPUT FOR TOP %s GENES' % (top))
        # for top by enhancer v promoter metric
        top_promoterTablePath, top_distalTablePath = detectGSEAOutput(
            analysisName, outputFolder, top, 'enhancer_vs_promoter')
        top_signalTablePath, top_backgroundTablePath = detectGSEAOutput(
            analysisName, outputFolder, top, 'total_contribution')

        print('MAKING NES PLOTS FOR TOP %s GENES' % (top))
        callR_GSEA(top_promoterTablePath, top_distalTablePath, outputFolder,
                   analysisName + '_enhancer_vs_promoter', top)
        callR_GSEA(top_signalTablePath, top_backgroundTablePath, outputFolder,
                   analysisName + '_total_contribution', top)

    print('DETECTING GSEA OUTPUT FOR ALL GENES')
    # for all genes
    all_promoterTablePath, all_distalTablePath = detectGSEAOutput(
        analysisName, outputFolder, 'all')

    print('MAKING NES PLOTS FOR ALL GENES')
    callR_GSEA(all_promoterTablePath, all_distalTablePath, outputFolder,
               analysisName, 'all')

    print('ALL DONE WITH ANALYSIS FOR %s' % (analysisName))
def main():
    '''
    main run method for enhancer promoter contribution tool

    Variant that takes the genome directory, the GSEA jar, the GSEA gmt file
    and the CpG bed file as required command line arguments. Parses the
    command line, loads the input regions (.bed is converted to gff, anything
    else is parsed as a tab separated table), loads the annotation via
    loadAnnotFile, splits regions with splitRegions, maps the bams (and
    optional control bams) with mapBams, builds the average, peak and gene
    tables, then calls the R waterfall plotting, GSEA, and the NES plotting
    helpers.

    Relies on module-level names defined elsewhere in this file: argparse,
    utils, loadAnnotFile, splitRegions, mapBams, makeAverageTable,
    makePeakTable, makeGeneTable, callRWaterfall, callGSEA, detectGSEAOutput
    and callR_GSEA.

    Exits through parser.error() when the activity table is empty or has no
    NM_/NR_ column in its first row.
    '''

    parser = argparse.ArgumentParser(usage='%(prog)s [options]')

    # required flags
    parser.add_argument(
        "-b", "--bam", dest="bam", nargs='*',
        help="Enter a space separated list of .bam files for the main factor",
        required=True)
    parser.add_argument(
        "-i", "--input", dest="input", type=str,
        help="Enter .gff or .bed file of regions to analyze",
        required=True)
    parser.add_argument(
        "-g", "--genome", dest="genome", type=str,
        help="specify a genome, HG18,HG19,MM8,MM9,MM10,RN6 are currently supported",
        required=True)

    # output flag
    parser.add_argument(
        "-o", "--output", dest="output", type=str,
        help="Enter the output folder.",
        required=True)

    # additional options flags and optional arguments
    parser.add_argument(
        "-a", "--activity", dest="activity", type=str,
        help="specify a table where first column represents a list of active refseq genes",
        required=False)
    parser.add_argument(
        "-c", "--control", dest="control", nargs='*',
        help="Enter a space separated list of .bam files for background. If flagged, will perform background subtraction",
        required=False)
    parser.add_argument(
        "-w", "--window", dest="window", type=int,
        help="Enter a window to define the TSS area +/- the TSS. Default is 1kb",
        required=False, default=1000)
    parser.add_argument(
        "--other-bams", dest="other", nargs='*',
        help="enter a space separated list of other bams to map to",
        required=False)
    parser.add_argument(
        "--name", dest="name", type=str,
        help="enter a root name for the analysis, otherwise will try to find the name from the input file",
        required=False)
    parser.add_argument(
        "--top", dest="top", type=int,
        help="Run the analysis on the top N genes by total signal. Default is 5000",
        required=False, default=5000)
    parser.add_argument(
        "--tads", dest="tads", type=str,
        help="Include a .bed of tad regions to restrict enhancer/gene association",
        required=False, default=None)

    # added by Quanhu Sheng: paths that used to be hard-coded, e.g.
    #   gseaPath = '/usr/local/bin/gsea/gsea2-2.2.2.jar'
    #   gmxPath = '/grail/annotations/gsea/c2.all.v5.1.symbols.gmt'  (C2 set)
    parser.add_argument(
        "--genomeDirectory", dest="genomeDirectory", type=str,
        help="Enter the folder contains chromosome sequence in fasta format",
        required=True)
    parser.add_argument(
        "--gseaPath", dest="gseaPath", type=str,
        help="Enter GSEA jar file location",
        required=True)
    parser.add_argument(
        "--gmxPath", dest="gmxPath", type=str,
        help="Enter GSEA gmt file location, such as c2.all.v5.1.symbols.gmt",
        required=True)
    parser.add_argument(
        "--cpgPath", dest="cpgPath", type=str,
        help="Enter cpg coordinates in bed format",
        required=True)

    args = parser.parse_args()

    print(args)

    # minimum arguments needed to proceed
    # argparse enforces presence, but e.g. "-b" with no values still yields an
    # empty list; show the help instead of silently doing nothing
    requiredValues = (args.bam, args.input, args.genome, args.genomeDirectory,
                      args.output, args.gseaPath, args.gmxPath, args.cpgPath)
    if not all(requiredValues):
        parser.print_help()
        return

    # top analysis subset
    top = args.top

    # input genome
    genome = args.genome
    print('PERFORMING ANALYSIS ON %s GENOME BUILD' % (genome))

    # set of bams
    bamFileList = args.bam

    # bring in the input path
    inputPath = args.input

    # try to get the input name or use the name argument
    if args.name:
        analysisName = args.name
    else:
        analysisName = inputPath.split('/')[-1].split('.')[0]

    print('USING %s AS ANALYSIS NAME' % (analysisName))

    # setting up the output folder
    parentFolder = utils.formatFolder(args.output, True)
    outputFolder = utils.formatFolder('%s%s' % (parentFolder, analysisName),
                                      True)

    print('WRITING OUTPUT TO %s' % (outputFolder))

    if inputPath.split('.')[-1] == 'bed':
        # type is bed
        inputGFF = utils.bedToGFF(inputPath)
    else:
        inputGFF = utils.parseTable(inputPath, '\t')

    # the tss window
    window = int(args.window)

    # activity path
    if args.activity:
        activityPath = args.activity
        activityTable = utils.parseTable(activityPath, '\t')
        if not activityTable:
            parser.error('activity table %s is empty' % (activityPath))

        # try to find the column for refseq id; as before, the LAST column
        # of the first row containing NM_ or NR_ wins
        ref_col = None
        for i, cell in enumerate(activityTable[0]):
            if 'NM_' in str(cell) or 'NR_' in str(cell):
                ref_col = i
        if ref_col is None:
            # previously this fell through to a NameError on ref_col
            parser.error(
                'could not find a column containing NM_ or NR_ ids in the first row of %s'
                % (activityPath))

        # this needs to be REFSEQ NM ID
        geneList = [line[ref_col] for line in activityTable]
        print('IDENTIFIED %s ACTIVE GENES' % (len(geneList)))
    else:
        geneList = []

    # check if tads are being invoked
    if args.tads:
        print('LOADING TAD LOCATIONS FROM %s' % (args.tads))
        tads_path = args.tads
    else:
        tads_path = ''

    print('LOADING ANNOTATION DATA FOR GENOME %s' % (genome))

    genomeDirectory = args.genomeDirectory

    # important here to define the window
    # geneList is replaced by the list loadAnnotFile hands back
    startDict, tssCollection, geneList = loadAnnotFile(genome, window,
                                                       geneList, True)
    print('IDENTIFIED %s valid ACTIVE GENES' % (len(geneList)))
    print(len(startDict))

    # now we need to split the input region
    print('SPLITTING THE INPUT GFF USING A WINDOW OF %s' % (window))
    splitGFF = splitRegions(inputGFF, tssCollection)
    print(len(inputGFF))
    print(len(splitGFF))

    splitGFFPath = '%s%s_SPLIT.gff' % (outputFolder, analysisName)
    utils.unParseTable(splitGFF, splitGFFPath, '\t')
    print('WRITING TSS SPLIT GFF OUT TO %s' % (splitGFFPath))

    # now you have to map the bams to the gff
    print('MAPPING TO THE SPLIT GFF')
    mappedFolder = utils.formatFolder('%sbam_mapping' % (outputFolder), True)

    signalTable = mapBams(bamFileList, splitGFFPath, analysisName,
                          mappedFolder)
    signalTablePath = '%s%s_signal_table.txt' % (outputFolder, analysisName)
    utils.unParseTable(signalTable, signalTablePath, '\t')

    if args.control:
        controlBamFileList = args.control
        controlSignalTable = mapBams(controlBamFileList, splitGFFPath,
                                     analysisName, mappedFolder)
        controlSignalTablePath = '%s%s_control_signal_table.txt' % (
            outputFolder, analysisName)
        utils.unParseTable(controlSignalTable, controlSignalTablePath, '\t')

    # now create the background subtracted summarized average table
    print('CREATING AN AVERAGE SIGNAL TABLE')
    averageTable = makeAverageTable(outputFolder, analysisName,
                                    useBackground=args.control)
    averageTablePath = '%s%s_average_table.txt' % (outputFolder, analysisName)
    utils.unParseTable(averageTable, averageTablePath, '\t')

    # now load up all of the cpg and other parameters to make the actual peak table

    # first check if this has already been done
    peakTablePath = '%s%s_PEAK_TABLE.txt' % (outputFolder, analysisName)
    if utils.checkOutput(peakTablePath, 0.1, 0.1):
        print('PEAK TABLE OUTPUT ALREADY EXISTS')
        peakTable = utils.parseTable(peakTablePath, '\t')
    else:
        peakTable = makePeakTable(args.cpgPath, splitGFFPath,
                                  averageTablePath, startDict, geneList,
                                  genomeDirectory, tads_path)
        utils.unParseTable(peakTable, peakTablePath, '\t')

    geneTable = makeGeneTable(peakTable, analysisName)

    geneTablePath = '%s%s_GENE_TABLE.txt' % (outputFolder, analysisName)
    utils.unParseTable(geneTable, geneTablePath, '\t')

    # NOTE(review): if geneTable carries a header row this comparison is off
    # by one at the boundary - confirm against makeGeneTable
    if top > len(geneTable):
        top = 'all'

    # now call the R code
    print('CALLING R PLOTTING SCRIPTS')
    callRWaterfall(geneTablePath, outputFolder, analysisName, top)

    # now let's call gsea
    print('RUNNING GSEA ON C2')
    callGSEA(args.gseaPath, args.gmxPath, outputFolder, analysisName, top)

    if top != 'all':
        print('DETECTING GSEA OUTPUT FOR TOP %s GENES' % (top))
        top_promoterTablePath, top_distalTablePath = detectGSEAOutput(
            analysisName, outputFolder, top)
        print('MAKING NES PLOTS FOR TOP %s GENES' % (top))
        callR_GSEA(top_promoterTablePath, top_distalTablePath, outputFolder,
                   analysisName, top)

    print('DETECTING GSEA OUTPUT FOR ALL GENES')
    all_promoterTablePath, all_distalTablePath = detectGSEAOutput(
        analysisName, outputFolder, 'all')
    print('MAKING NES PLOTS FOR ALL GENES')
    callR_GSEA(all_promoterTablePath, all_distalTablePath, outputFolder,
               analysisName, 'all')

    print('ALL DONE WITH ANALYSIS FOR %s' % (analysisName))
def main():
    '''
    main run call

    Parses the command line, converts/copies the input region file into the
    gff folder, stitches the bound regions, maps every bam to the stitched
    gff with bamliquidator, writes the enhancer region map and finally
    launches the R super-enhancer caller and the gene mapper.

    Required flags: -i (input .gff/.bed), -r (rankby bam), -o (output
    folder), -g (genome build).
    Exits via sys.exit() when required flags are missing, the genome is not
    supported, the mask has an unknown extension, or bam mapping fails.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)

    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-b", "--bams", dest="bams", nargs=1, default=None,
                      help="Enter a comma separated list of additional bam files to map to")
    # the original help text here was a copy of the --rankby help
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a control bamfile for the rankby bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")
    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        print('hi there')
        parser.print_help()
        # sys.exit() rather than the interactive-only exit() helper
        sys.exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE
    inputExtension = options.input.split('.')[-1]
    if inputExtension == 'bed':
        # CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]  # strips '.bed'
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        utils.bedToGFF(options.input, inputGFFFile)
    else:
        if inputExtension != 'gff':
            print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
        # COPY THE INPUT GFF TO THE GFF FOLDER
        # (the original path, not the copy, is what gets used downstream)
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    # GETTING THE LIST OF BAMFILES TO PROCESS
    if options.control:
        bamFileList = [options.rankby, options.control]
    else:
        bamFileList = [options.rankby]

    if options.bams:
        # deliberately NOT uniquified here: the full (possibly redundant)
        # list is handed to mapCollection; a unique copy is made for mapping
        bamFileList += options.bams.split(',')

    # optional args

    # Stitch parameter ('' lets regionStitching pick the window)
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    # annotation files are resolved relative to the current working directory
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (cwd),
        'RN6': '%s/annotation/rn6_refseq.ucsc' % (cwd),
    }

    try:
        annotFile = genomeDict[genome.upper()]
    except KeyError:
        # previously surfaced as a bare KeyError traceback; keep a non-zero
        # exit status but tell the user what went wrong
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit(1)

    # MAKING THE START DICT
    print('MAKING START DICT')
    # NOTE(review): startDict is not read again in this function; the call is
    # kept in case it is relied on to validate the annotation file
    startDict = utils.makeStartDict(annotFile)

    # GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(inputGFFFile, bamChromList)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        # if it's a bed file
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci: keep only loci with no mask overlap
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci
                        if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" %
              (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        referenceCollection, inputName, outFolder, stitchWindow, tssWindow,
        annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    # making sure start/stop ordering are correct
    for line in stitchedGFF:
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitchWindow)
    print(type(stitchWindow))

    # NOTE(review): '/' is kept as-is so existing output file names do not
    # change; under Python 3 it is true division and yields labels like '12.5'
    stitchLabel = str(stitchWindow / 1000)
    if not removeTSS:
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchLabel)
    else:
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, stitchLabel)
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1))

    # MAPPING TO THE STITCHED GFF
    # bamliquidator is expected to be found on the PATH
    bamliquidator_path = 'bamliquidator_batch.py'

    # prevent redundant mapping
    bamFileListUnique = utils.uniquify(list(bamFileList))
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:
        bamFileName = bamFile.split('/')[-1]

        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            # reuse an existing mapping instead of re-running bamliquidator
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)
            os.system(cmd1)
            if utils.checkOutput(mappedOut1File, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList,
                  mappedFolder, outputFile1, refName=stitchedGFFName)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    # the R script receives the control bam's file name, or 'NONE'
    if options.control:
        controlName = options.control.split('/')[-1]
    else:
        controlName = 'NONE'
    cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (
        outFolder, outputFile1, inputName, controlName)
    print(cmd)
    os.system(cmd)

    # calling the gene mapper
    time.sleep(20)

    if options.control:
        mapperFlags = '-g %s -r %s -c %s' % (genome, options.rankby, options.control)
    else:
        mapperFlags = '-g %s -r %s' % (genome, options.rankby)

    # one backgrounded gene-mapper run per enhancer table written by the R script
    for tableType in ('SuperEnhancers', 'StretchEnhancers', 'SuperStretchEnhancers'):
        tableFile = "%s_%s.table.txt" % (inputName, tableType)
        cmd = "python ROSE2_geneMapper.py %s -i %s%s &" % (mapperFlags, outFolder, tableFile)
        os.system(cmd)
def main():
    '''
    main run call

    Multi-input variant: accepts comma separated lists of region files and
    bams. All input regions are merged into one master gff, stitched, mapped
    against every bam with bamliquidator, collapsed into an average signal
    table and then passed to the R super-enhancer caller and the gene mapper.

    Required flags: -i (input .gff/.bed list), -r (rankby bam list),
    -o (output folder), -g (genome build).
    Exits via sys.exit() when required flags are missing, the control list
    does not match the rankby list, the genome is unsupported, the mask has
    an unknown extension, or bam mapping fails.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)

    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")
    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        print('hi there')
        parser.print_help()
        # sys.exit() rather than the interactive-only exit() helper
        sys.exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)
    inputList = [inputFile for inputFile in options.input.split(',') if len(inputFile) > 1]

    # converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        # the extension must be read from THIS file; the original tested
        # options.input (the whole comma separated argument) for '.gff'
        inputExtension = inputFile.split('.')[-1]
        if inputExtension == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][0:-4]  # strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        else:
            if inputExtension != 'gff':
                print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)

    # GETTING THE LIST OF BAMFILES TO PROCESS
    # either same number of bams for rankby and control
    # or only 1 control
    # or none!
    # bamlist should be all rankby bams followed by control bams
    rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) > 0]
    if options.control:
        controlBamList = [bam for bam in options.control.split(',') if len(bam) > 0]
        if len(controlBamList) == len(rankbyBamList):
            # case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            # case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList * len(rankbyBamList)
        else:
            print('ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE')
            sys.exit()
    else:
        bamFileList = rankbyBamList

    # Stitch parameter ('' lets regionStitching pick the window)
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE GENOME
    # str.upper() instead of string.upper(), which no longer exists in Python 3
    genome = options.genome.upper()
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE
    # pipeline_dir is not defined in this function; expected at module level
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit()

    # FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        # fall back to the base name of the first input gff
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))

    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    # use a simpler unique region naming system
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0], '\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile, '\t')
            gffCollection = utils.gffToLocusCollection(gff, 50)
            inputLoci += gffCollection.getLoci()

        inputCollection = utils.LocusCollection(inputLoci, 50)
        inputCollection = inputCollection.stitchCollection()  # stitches to produce unique regions

        inputGFF = utils.locusCollectionToGFF(inputCollection)

    # now number things appropriately: every region is renamed <inputName>_<n>
    formattedGFF = []
    for i, line in enumerate(inputGFF):
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]

        lineID = '%s_%s' % (inputName, str(i + 1))  # 1 indexing

        # min/max guarantee start <= stop in the written gff
        newLine = [chrom, lineID, lineID, min(coords), max(coords), '', sense, '', lineID]
        formattedGFF.append(newLine)

    # name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder, genome, inputName)
    utils.unParseTable(formattedGFF, masterGFFFile, '\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))

    # MAKING THE START DICT
    print('MAKING START DICT')
    # NOTE(review): startDict is not read again in this function; the call is
    # kept in case it is relied on to validate the annotation file
    startDict = utils.makeStartDict(annotFile)

    # GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile, bamChromList)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        # if it's a bed file
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci: keep only loci with no mask overlap
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci
                        if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" %
              (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        referenceCollection, inputName, outFolder, stitchWindow, tssWindow,
        annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    print(stitchWindow)
    print(type(stitchWindow))

    # NOTE(review): '/' is kept as-is so existing output file names do not
    # change; under Python 3 it is true division and yields labels like '12.5'
    stitchLabel = str(stitchWindow / 1000)
    if not removeTSS:
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchLabel)
    else:
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, stitchLabel)
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1))

    # MAPPING TO THE STITCHED GFF
    # prevent redundant mapping (a universal control appears once per sample)
    bamFileListUnique = utils.uniquify(list(bamFileList))
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:
        bamFileName = bamFile.split('/')[-1]

        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            # reuse an existing mapping instead of re-running bamliquidator
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            # bamliquidator_path is not defined in this function; expected at module level
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)
            os.system(cmd1)
            if utils.checkOutput(mappedOut1File, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList,
                  mappedFolder, outputFile1, refName=stitchedGFFName)

    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1, inputName + '_MERGED_SIGNAL',
                                       controlBams=options.control)

    # now try the merging
    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    # the R script is always called with 'NONE' as the control name here
    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (
        pipeline_dir, outFolder, metaOutputFile, inputName, controlName)
    print(cmd)
    os.system(cmd)

    # calling the gene mapper
    print('CALLING GENE MAPPING')

    # for now don't use ranking bam to call top genes
    for tableType in ('SuperEnhancers', 'StretchEnhancers', 'SuperStretchEnhancers'):
        tableFile = "%s_%s.table.txt" % (inputName, tableType)
        cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (
            pipeline_dir, genome, outFolder, tableFile)
        print(cmd)
        os.system(cmd)
def make_shep_on_mycn_landscape(shep_on_dataFile):
    '''
    finds mycn peaks in shep21 that are conserved in nb and segregates them into promoter or enhancer

    For every SHEP MYCN peak that overlaps the conserved NB MYCN collection,
    three gff lines are made (the peak itself, peak center +/- 5kb, and the
    peak flanked by 1kb on each side). Each conserved peak is added to the
    "all" tables and, depending on overlap, to the promoter or enhancer
    tables (promoter overlap takes precedence). Nine gffs are written to
    gffFolder; nothing is returned.

    shep_on_dataFile: path handed to pipeline_dfci.loadDataTable
    '''
    # NOTE(review): the loaded table is not used below; the call is kept in
    # case it is relied on to validate the data file
    pipeline_dfci.loadDataTable(shep_on_dataFile)

    print('LOADING SHEP ON MYCN SITES')
    # load all of the shep_on sites from the macs peak bed
    shep_on_bed_path = '%sSHEP_6HR_MYCN_peaks.bed' % (macsEnrichedFolder)
    shep_on_bed = utils.parseTable(shep_on_bed_path, '\t')
    shep_on_gff = utils.bedToGFF(shep_on_bed)

    # now get the conserved NB MYCN regions
    nb_conserved_mycn_gff_file = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    nb_conserved_mycn_collection = utils.gffToLocusCollection(nb_conserved_mycn_gff_file)

    print('LOADING SHEP ACTIVE ENHANCERS')
    # make a collection of enhancers
    shep_enhancer_file = '%smeta_rose/SHEP_ON_H3K27AC/SHEP_ON_H3K27AC_AllEnhancers.table.txt' % (projectFolder)
    shep_enhancer_collection = utils.makeSECollection(shep_enhancer_file, 'SHEP_H3K27AC')

    # now get the active promoters
    print('LOADING SHEP ACTIVE PROMOTERS')
    startDict = utils.makeStartDict(annotFile)
    shep_transcribed_file = '%sHG19_SHEP_ON_H3K27AC_ACTIVE.txt' % (geneListFolder)
    shep_transcribed_table = utils.parseTable(shep_transcribed_file, '\t')
    transcribedList = [line[1] for line in shep_transcribed_table]
    tssLoci = [utils.makeTSSLocus(refID, startDict, 1000, 1000) for refID in transcribedList]
    shep_tss_collection = utils.LocusCollection(tssLoci, 50)

    # now initialize the 9 gffs we will need: for each of all/promoter/enhancer
    # one table per window, in the order given by gff_windows
    gff_windows = ('-0_+0', '-5kb_+5kb', '-1kb_+1kb')
    shep_mycn_gffs = ([], [], [])
    shep_mycn_promoter_gffs = ([], [], [])
    shep_mycn_enhancer_gffs = ([], [], [])

    print('ITERATING THROUGH SHEP MYCN PEAKS')

    ticker = 0
    enhancer = 0
    promoter = 0
    other = 0
    for line in shep_on_gff:
        if ticker % 1000 == 0:
            print(ticker)
        ticker += 1
        # ticker is incremented first, so peak ids are 1 indexed
        peakID = '%s_%s' % ('SHEP_MYCN', str(ticker))

        lineLocus = utils.Locus(line[0], line[3], line[4], '.', peakID)

        # only peaks conserved in NB are kept
        if not nb_conserved_mycn_collection.getOverlap(lineLocus):
            continue

        gffLine = [line[0], peakID, peakID, line[3], line[4], '', '.', '', peakID]
        # floor division keeps integer coordinates on both Python 2 and 3
        peakCenter = (int(line[3]) + int(line[4])) // 2
        gffLine_5kb = [line[0], peakID, peakID, peakCenter - 5000, peakCenter + 5000, '', '.', '', peakID]
        # the 1kb is not a center +/- but a flank
        gffLine_1kb = [line[0], peakID, peakID, int(line[3]) - 1000, int(line[4]) + 1000, '', '.', '', peakID]
        peakLines = (gffLine, gffLine_5kb, gffLine_1kb)

        for gff, peakLine in zip(shep_mycn_gffs, peakLines):
            gff.append(peakLine)

        # tss overlap should take precedence over enhancer overlap
        if shep_tss_collection.getOverlap(lineLocus, 'both'):
            for gff, peakLine in zip(shep_mycn_promoter_gffs, peakLines):
                gff.append(peakLine)
            promoter += 1
        # now check for enhancer overlap
        elif shep_enhancer_collection.getOverlap(lineLocus, 'both'):
            for gff, peakLine in zip(shep_mycn_enhancer_gffs, peakLines):
                gff.append(peakLine)
            enhancer += 1
        else:
            other += 1

    # the counts below cover conserved peaks only; the total is all input peaks
    print('Of %s shep on mycn peaks' % (len(shep_on_gff)))
    print('%s are promoter' % (promoter))
    print('%s are enhancer' % (enhancer))
    print('%s are other' % (other))

    # now write out the gffs
    gff_groups = (
        ('', shep_mycn_gffs),
        ('PROMOTER_', shep_mycn_promoter_gffs),
        ('ENHANCER_', shep_mycn_enhancer_gffs),
    )
    for label, gffs in gff_groups:
        for window, gff in zip(gff_windows, gffs):
            gff_file = '%sHG19_SHEP_MYCN_CONSERVED_%s%s.gff' % (gffFolder, label, window)
            utils.unParseTable(gff, gff_file, '\t')
new_table = [] for line in bed_table[1:]: chrom = line[0] start = int(line[1]) - 50 stop = int(line[2]) + 50 edge = line[3] strand = line[4] new_line = [chrom, start, stop, edge, strand] new_table.append(new_line) tmp_path = '%s%s' % (temp_dir, bed) if len(new_table) > 0: utils.unParseTable(new_table, tmp_path, '\t') gff = bed.split('.')[0] + '.gff' print(gff) gff_path = '%s%s' % (temp_dir, gff) utils.bedToGFF(tmp_path, gff_path) genome_directory = genome_dir_dict[genome] print('gffToFasta Tool running on ' + gff_path + ' for ' + genome) fasta = utils.gffToFasta(genome, genome_directory, gff_path, UCSC=True, useID=False) print('Creating density table') table = [] header = ['DENSITY', 'POSITIONS', 'POS_COUNT', 'SUBPEAK_LENGTH'] table.append(header)
def main():
    '''
    main run call for the ROSE2 pipeline

    Parses the command line flags, converts (.bed) or copies (.gff) the
    input regions into the gff folder, stitches the regions together, maps
    every bam to both the stitched and the original regions through
    bamliquidator_batch.py, hands the mapped data to mapCollection, and
    finally shells out to ROSE2_callSuper.R and ROSE2_geneMapper.py.

    Exits through sys.exit() when a required flag (-i, -r, -o, -g) is
    missing (after printing the help text), when the mask file is neither
    a .bed nor a .gff, or when bamliquidator produces no output for a bam.
    Raises ValueError when bamliquidator_batch.py cannot be found.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)

    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    # MM10 is listed because genomeDict below has an annotation entry for it
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM8,MM9,MM10,HG18,HG19)")

    # optional flags
    parser.add_option("-b", "--bams", dest="bams", nargs=1, default=None,
                      help="Enter a comma separated list of additional bam files to map to")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="bamfile to use as the control for the rankby bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")
    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        # sys.exit() rather than the site-provided exit(), matching the
        # other exit points of this function
        sys.exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE
    inputExtension = options.input.split('.')[-1]
    if inputExtension == 'bed':
        # CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]  # strips '.bed'
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        utils.bedToGFF(options.input, inputGFFFile)
    else:
        if inputExtension != 'gff':
            print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
        # COPY THE INPUT GFF TO THE GFF FOLDER
        # the original path (not the copy) is what gets used downstream
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    # GETTING THE LIST OF BAMFILES TO PROCESS
    bamFileList = [options.rankby]
    if options.control:
        bamFileList.append(options.control)

    if options.bams:
        bamFileList += options.bams.split(',')
        bamFileList = utils.uniquify(bamFileList)

    # optional args

    # Stitch parameter
    # '' is passed through to regionStitching, which returns the window
    # that is actually used
    stitchWindow = '' if options.stitch == '' else int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    # annotation files are looked up relative to the current working directory
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
    }

    annotFile = genomeDict[genome.upper()]

    # MAKING THE START DICT
    print('MAKING START DICT')
    # NOTE(review): startDict is not read again in this function; the call is
    # kept in case makeStartDict is relied on to validate the annotation file
    startDict = utils.makeStartDict(annotFile)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFFFile)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        # if it's a bed file
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci
                        if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        inputGFFFile, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)
    # making sure start/stop ordering are correct
    for line in stitchedGFF:
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitchWindow)
    print(type(stitchWindow))

    # the stitched name, gff and debug file share one stem
    stitchLabel = str(stitchWindow / 1000)
    if not removeTSS:
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchLabel)
    else:
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, stitchLabel)
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1))

    # Try to use the bamliquidator_batch.py script on cluster, otherwise
    # fail over to a copy in the current directory, otherwise fail.
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    # each bam is mapped to the stitched gff first, then to the original gff
    mappingTargets = [
        (stitchedGFFName, stitchedGFFFile),
        (inputName, inputGFFFile),
    ]

    for bamFile in bamFileList:

        bamFileName = bamFile.split('/')[-1]

        for gffName, gffFile in mappingTargets:
            mappedOutFolder = '%s%s_%s_MAPPED' % (mappedFolder, gffName, bamFileName)
            mappedOutFile = '%s/matrix.gff' % (mappedOutFolder)
            if utils.checkOutput(mappedOutFile, 0.2, 0.2):
                # report the gff this mapping actually belongs to
                print("FOUND %s MAPPING DATA FOR BAM: %s" % (gffFile, mappedOutFile))
                continue

            cmd = "python %s --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                bamliquidator_path, gffFile, mappedOutFolder, bamFile)
            print(cmd)

            mapProcess = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
            mapOutput = mapProcess.communicate()
            # any stdout from bamliquidator is taken as a successful mapping
            if len(mapOutput[0]) > 0:
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (gffFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (gffFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList,
                  mappedFolder, outputFile1, refName=stitchedGFFName)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    if options.control:
        controlName = options.control.split('/')[-1]
    else:
        controlName = 'NONE'
    cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (outFolder, outputFile1, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    # NOTE(review): os.system above already blocks until R returns; confirm
    # whether this pause is still needed
    time.sleep(20)

    # the control flag is only passed on when a control bam was given
    controlFlag = '-c %s ' % (options.control) if options.control else ''
    for tableSuffix in ('SuperEnhancers', 'StretchEnhancers', 'SuperStretchEnhancers'):
        tableFile = "%s_%s.table.txt" % (inputName, tableSuffix)
        cmd = "python ROSE2_geneMapper.py -g %s -r %s %s-i %s%s" % (
            genome, options.rankby, controlFlag, outFolder, tableFile)
        os.system(cmd)
def main():
    '''
    Driver for the MYCN project mouse analysis.

    Changes into the project folder, runs pipeline_dfci.summary on the
    mouse data table, writes a bed version of the HG19 figure gene gff,
    builds the MM9 figure gff from the lifted-over bed plus a set of
    manually added regions, and plots the mouse regions.
    '''

    def announce(title_line):
        # section banner: blank lines, rule, title, rule, blank lines
        rule = '#======================================================================'
        print('\n\n')
        print(rule)
        print(title_line)
        print(rule)
        print('\n\n')

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    announce('#======================I, LOADING DATA ANNOTATION======================')

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible

    #for ChIP-Seq
    pipeline_dfci.summary(mouse_dataFile)

    announce('#============II. MAKING A BED OUT OF HG19 FIGURE REGIONS===============')

    # load the HG19 figure gff, echo it, and write it back out as a bed
    hg19_gff_path = '%sgff/HG19_NB_FIGURE_GENES.gff' % (hg19_projectFolder)
    hg19_bed_path = '%sbeds/HG19_NB_FIGURE_GENES.bed' % (hg19_projectFolder)

    hg19_gff = utils.parseTable(hg19_gff_path, '\t')
    print(hg19_gff)

    hg19_bed = utils.gffToBed(hg19_gff)
    print(hg19_bed)

    utils.unParseTable(hg19_bed, hg19_bed_path, '\t')

    #need to manually lift this over to mm9
    #https://genome.ucsc.edu/cgi-bin/hgLiftOver
    mm9_bed_path = '%sMM9_NB_FIGURE_GENES_LIFTOVER.bed' % (bedFolder)
    mm9_gff_path = '%sMM9_NB_FIGURE_GENES_LIFTOVER.gff' % (gffFolder)

    mm9_gff = utils.bedToGFF(mm9_bed_path)

    #now add some additional manual regions
    # (chrom, name, start, stop, strand); the name fills the three id columns
    manual_loci = [
        ('chr12', 'TWIST1_ENHANCER', 34639818, 34656263, '-'),
        ('chr11', 'NPM1_PROMOTER_2', 33049820, 33065883, '+'),
        ('chr6', 'GATA2_ENHANCER', 88135802, 88159867, '+'),
        ('chr7', 'PHOX2A', 108964211, 108974610, '+'),
        ('chr15', 'LET7B', 85497440, 85538754, '+'),
        ('chr10', 'LIN28B', 45161233, 45217227, '-'),
    ]
    added_gff_regions = [
        [chrom, name, name, start, stop, '', strand, '', name]
        for chrom, name, start, stop, strand in manual_loci
    ]

    utils.unParseTable(mm9_gff + added_gff_regions, mm9_gff_path, '\t')

    announce('#=======================III. PLOTTING DATA IN MOUSE====================')

    #plot mouse regions
    plot_mouse_genes(mouse_dataFile, mm9_gff_path)