def main():
    '''
    main run function

    Parses the command line and runs the dynamic enhancer analysis for a pair
    of ROSE output folders:
      part 1 - build the paths of the ROSE enhancer tables and the merged gff
      part 2 - run the delta R script, then map merged enhancers to genes
      part 3 - rank the differential enhancers, plot the ranks and write the
               final output through finishRankOutput

    -g, -d, -r and -o are required; when one is missing the help text is
    printed and the program exits.  -r (and -n when given) must hold exactly
    two comma separated values, otherwise a usage error is reported.
    When an intermediate output file is not detected by utils.checkOutput an
    error is printed and the program exits with status 1.
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -d [DATAFILE] -r [ROSE_FOLDERS] -o [OUTPUT_FOLDER]"
    parser = OptionParser(usage = usage)

    #required flags
    parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None,
                      help = "Enter the genome build (HG18,HG19,MM9) for the project")
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None,
                      help = "Enter the data file for the project")
    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None,
                      help = "Enter a comma separated list of rose folder")
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the output folder for the project")

    #additional options
    parser.add_option("-n","--names", dest="names",nargs = 1, default=None,
                      help = "Enter a comma separated list of names to go with the datasets")
    parser.add_option("-p","--plot", dest="plot",action = 'store_true', default=False,
                      help = "If flagged, will plot differential regions")
    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False,
                      help = "If flagged, will run analysis for all enhancers and not just supers.")

    (options,args) = parser.parse_args()

    print(options)
    print(args)

    if not (options.genome and options.data and options.rose and options.output):
        parser.print_help()
        sys.exit()

    genome = options.genome.upper()
    dataFile = options.data

    #exactly two ROSE folders are compared; report anything else as a usage
    #error instead of failing on the unpacking
    roseFolderList = options.rose.split(',')
    if len(roseFolderList) != 2:
        parser.error('-r/--rose needs exactly two comma separated ROSE folders')
    [roseFolder1,roseFolder2] = roseFolderList

    parentFolder = utils.formatFolder(options.output,True)

    if options.names:
        nameList = options.names.split(',')
        if len(nameList) != 2:
            parser.error('-n/--names needs exactly two comma separated names')
        [name1,name2] = nameList
    else:
        #derive the names from the last path component of each folder.
        #trailing slashes are dropped first, otherwise the last component
        #(and therefore the name) would be an empty string
        name1 = roseFolder1.rstrip('/').split('/')[-1].replace('_ROSE','')
        name2 = roseFolder2.rstrip('/').split('/')[-1].replace('_ROSE','')

    mergeName = "%s_%s_merged" % (name1,name2)

    plotBam = options.plot
    superOnly = not options.all

    if superOnly and plotBam:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and plotting output to %s" % (name1,name2,parentFolder))
    elif superOnly:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and writing output to %s" % (name1,name2,parentFolder))
    elif plotBam:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and plotting output to %s. WARNING: Plotting all differential enhancers could take a while" % (name1,name2,parentFolder))
    else:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and writing output to %s." % (name1,name2,parentFolder))

    #part 1
    print("PART1: analyzing ROSE output from %s and %s" % (name1,name2))

    #start with the all enhancer tables from the initial rose calls
    roseFolder1 = pipeline_dfci.formatFolder(roseFolder1,False)
    roseFolder2 = pipeline_dfci.formatFolder(roseFolder2,False)

    superFile1 = '%s%s_peaks_SuperEnhancers.table.txt' % (roseFolder1,name1)
    superFile2 = '%s%s_peaks_SuperEnhancers.table.txt' % (roseFolder2,name2)

    #NOTE(review): these paths carry an extra '/' compared with the super
    #enhancer paths above - harmless if formatFolder returns a trailing
    #slash, confirm against pipeline_dfci
    allFile1 = '%s/%s_peaks_AllEnhancers.table.txt' % (roseFolder1,name1)
    allFile2 = '%s/%s_peaks_AllEnhancers.table.txt' % (roseFolder2,name2)

    print('\tMERGING ENHANCERS AND CALLING ROSE')
    #genome is already upper case at this point
    if superOnly:
        gffName = '%s_%s_MERGED_SUPERS_-0_+0' % (genome,mergeName)
        #callMergeSupers(dataFile,superFile1,superFile2,name1,name2,mergedGFFFile,parentFolder)
    else:
        gffName = '%s_%s_MERGED_ENHANCERS_-0_+0' % (genome,mergeName)
        #callMergeSupers(dataFile,allFile1,allFile2,name1,name2,mergedGFFFile,parentFolder)
    #NOTE(review): the callMergeSupers calls above are disabled, so the
    #merged gff and the ROSE output checked below are not produced here
    mergedGFFFile = '%s%s.gff' % (parentFolder,gffName)
    superOutput = "%s%s_ROSE/%s_SuperEnhancers_ENHANCER_TO_GENE.txt" % (parentFolder,name1,gffName)

    print('\tCALCULATING ENHANCER DELTA AND MAKING PLOTS')
    if not utils.checkOutput(superOutput):
        print('ERROR: ROSE CALL FAILED')
        sys.exit(1)

    #part2 is the R script
    rcmd = callDeltaRScript(mergedGFFFile,parentFolder,name1,name2)
    print(rcmd)
    os.system(rcmd)

    #fixed 30 second pause before the gene mapper is called
    time.sleep(30)
    callRoseGeneMapper(mergedGFFFile,genome,parentFolder,name1)

    #rank the genes

    #part 3
    #rank the delta
    print("PART 3: assinging ranks to differential enhancers")
    print('\tASSIGNING SUPER RANK TO MERGED ENHANCERS')

    enhancerToGeneFile = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB.txt" % (parentFolder,name1,gffName)
    if not utils.checkOutput(enhancerToGeneFile):
        print('ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN')
        sys.exit(1)

    rankOutput = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB_RANK.txt" % (parentFolder,name1,gffName)
    assignEnhancerRank(enhancerToGeneFile,allFile1,allFile2,name1,name2,rankOutput)

    #make the rank plot
    print('MAKING RANK PLOTS')
    if not utils.checkOutput(rankOutput):
        print('ERROR: RANK PLOT SCRIPT FAILED TO RUN')
        sys.exit(1)

    rcmd = callRankRScript(rankOutput,name1,name2,superFile1,superFile2)
    print(rcmd)
    os.system(rcmd)

    time.sleep(30)

    print('FINISHING OUTPUT')
    finishRankOutput(dataFile,rankOutput,genome,parentFolder,mergeName,name1,name2,1,100000,superOnly,plotBam)
# name2 = 'MAC_H4K12AC_1H_MINUS'
# main(dataFile,genome,mergeFolder,roseFolder1,roseFolder2,name1,name2,mergeName,True,True)

#=====================================================
#===================MYC NB LEVELS=====================
#=====================================================

#MYC vs MYCN in BE(2)C
#settings for one comparison: genome build, data table and merge name
mergeName = 'BE2C_MYC_SUPERS'
genome = 'hg18'
dataFile = '/ark/home/cl512/projects/neuroblastoma/NEURO_TABLE.txt'

#output folder is named after the merge; it is passed through formatFolder
#with True as the second argument
mergeFolder = pipeline_dfci.formatFolder(
    '/ark/home/cl512/projects/neuroblastoma/MYC_analysis/dynamicEnhancer/%s/' % mergeName,
    True)

#first dataset: its name and ROSE folder
name1 = 'BE2C_MYCN'
roseFolder1 = '/ark/home/cl512/projects/neuroblastoma/MYC_analysis/be2c_mycn_rose/'

#second dataset: its name and ROSE folder
name2 = 'BE2C_MYC'
roseFolder2 = '/ark/home/cl512/projects/neuroblastoma/MYC_analysis/be2c_myc_rose/'

#the run itself is currently disabled
#main(dataFile,genome,mergeFolder,roseFolder1,roseFolder2,name1,name2,mergeName,True,True)

# #=====================================================
# #=======================786-O_SUPERS==================
# #=====================================================
def finishRankOutput(dataFile,rankOutput,genome,mergeFolder,mergeName,name1,name2,cutOff=1.5,window = 100000,superOnly=True,plotBam=True):
    '''
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond
    a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained loss

    dataFile     data table, loaded with pipeline_dfci.loadDataTable; the
                 'bam' entries of name1 and name2 are used for plotting
    rankOutput   tab delimited rank table; column 6 is compared to cutOff,
                 columns 9-11 hold comma separated gene lists
    genome       genome build, upper cased for all output names
    mergeFolder  parent folder; output goes to mergeFolder + 'output/'
    mergeName    name used in every output filename
    name1,name2  dataset names; rows above +cutOff are labelled name2
                 (gained), rows below -cutOff are labelled name1 (lost),
                 everything else is CONSERVED
    cutOff       threshold applied to column 6 (coerced to float)
    window       flank in bp added to both sides for the windowed gffs
                 (coerced to int)
    superOnly    True labels output as SUPERS, False as ENHANCERS
    plotBam      when True, runs bamPlot_turbo.py on the gained/lost gffs

    returns None; all results are written to disk
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    #floor division keeps the KB label an integer (100KB, never 100.0KB)
    windowKB = window // 1000
    genome = genome.upper()

    #make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder+'output/',True)

    #bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput,'\t')

    #make a new formatted table
    header = rankEnhancerTable[0]
    header[-4] = 'DELTA RANK'
    header[-3] = 'IS_SUPER'
    formattedRankTable = [header]

    #the gffs
    gainedGFF = []
    lostGFF = []
    gainedWindowGFF = []
    lostWindowGFF = []

    #labels that differ between the super-only and the all-enhancer run
    if superOnly:
        enhancerType = 'SUPERS'
        shortLabel = 'SEs'
        longLabel = 'super enhancers'
    else:
        enhancerType = 'ENHANCERS'
        shortLabel = 'enhancers'
        longLabel = 'enhancers'

    #the beds, each starting with its track header line
    gainedTrackHeader = 'track name="%s %s only %s" description="%s %s that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome,name2,shortLabel,genome,longLabel,name2,name1)
    gainedBed = [[gainedTrackHeader]]
    conservedTrackHeader = 'track name="%s %s and %s %s" description="%s %s that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome,name1,name2,shortLabel,genome,longLabel,name1,name2)
    conservedBed = [[conservedTrackHeader]]
    lostTrackHeader = 'track name="%s %s only %s" description="%s %s that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome,name1,shortLabel,genome,longLabel,name1,name2)
    lostBed = [[lostTrackHeader]]

    #the genes
    geneTable = [['GENE','ENHANCER_ID','ENHANCER_CHROM','ENHANCER_START','ENHANCER_STOP',header[6],header[7],header[8],'STATUS']]

    for line in rankEnhancerTable[1:]:
        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankTable.append(line)

        #getting the genes from the three gene columns, dropping empty
        #entries and duplicates
        geneList = []
        for geneField in (line[9],line[10],line[11]):
            geneList += geneField.split(',')
        geneList = utils.uniquify([x for x in geneList if len(x) > 0])
        geneString = ','.join(geneList)

        bedLine = [line[1],line[2],line[3],line[0],line[-4]]

        delta = float(line[6])
        if delta > cutOff:
            #for gained
            geneStatus = name2
            targets = (gainedGFF,gainedWindowGFF,gainedBed)
        elif delta < -cutOff:
            #for lost
            geneStatus = name1
            targets = (lostGFF,lostWindowGFF,lostBed)
        else:
            #for conserved
            geneStatus = 'CONSERVED'
            targets = None

        if targets is None:
            conservedBed.append(bedLine)
        else:
            (regionGFF,regionWindowGFF,regionBed) = targets
            #the windowed start is clamped at 1 so that regions close to a
            #chromosome start never get a negative coordinate
            windowStart = max(1,int(line[2])-window)
            windowStop = int(line[3])+window
            regionGFF.append([line[1],line[0],'',line[2],line[3],'','.','',geneString])
            regionWindowGFF.append([line[1],line[0],'',windowStart,windowStop,'','.','',geneString])
            regionBed.append(bedLine)

        #now fill in the gene Table
        for gene in geneList:
            geneTable.append([gene,line[0],line[1],line[2],line[3],line[6],line[7],line[8],geneStatus])

    #concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    #start writing the output
    #there's the two gffs, the bed,the formatted table, the gene table

    #formatted table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankTable,formattedFilename,'\t')

    #gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder+'gff/',True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,name2.upper(),enhancerType)
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,name2.upper(),enhancerType,windowKB,windowKB)

    gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,name1.upper(),enhancerType)
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,name1.upper(),enhancerType,windowKB,windowKB)

    utils.unParseTable(gainedGFF,gffFilename_gained,'\t')
    utils.unParseTable(gainedWindowGFF,gffFilenameWindow_gained,'\t')

    utils.unParseTable(lostGFF,gffFilename_lost,'\t')
    utils.unParseTable(lostWindowGFF,gffFilenameWindow_lost,'\t')

    #bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(fullBed,bedFilename,'\t')

    #geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(geneTable,geneFilename,'\t')

    #finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_%s_DELTA.pdf" % (mergeFolder,name1,outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_%s_RANK_PLOT.png" % (mergeFolder,name1,outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    #now execute the bamPlot_turbo.py commands
    if plotBam:
        bam1 = dataDict[name1]['bam']
        bam2 = dataDict[name2]['bam']
        bamString = "%s,%s" % (bam1,bam2)
        nameString = "%s,%s" % (name1,name2)
        colorString = "0,0,0:100,100,100"

        #change dir; bamPlot_turbo.py is called by relative name below
        os.chdir(pipelineDir)

        #(gff, title) pairs in run order: gained, gained window, lost, lost window
        #NOTE(review): the titles say SE even when superOnly is False
        plotJobs = []
        if len(gainedGFF) > 0:
            plotJobs.append((gffFilename_gained,"%s_ONLY_SE" % (name2)))
            plotJobs.append((gffFilenameWindow_gained,"%s_ONLY_SE_%sKB_WINDOW" % (name2,windowKB)))
        if len(lostGFF) > 0:
            plotJobs.append((gffFilename_lost,"%s_ONLY_SE" % (name1)))
            plotJobs.append((gffFilenameWindow_lost,"%s_ONLY_SE_%sKB_WINDOW" % (name1,windowKB)))

        for (plotGFF,plotTitle) in plotJobs:
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome,bamString,plotGFF,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

    return
def main():
    """
    main run function

    Plots read density from one or more bam files over a set of regions.

    usage: %prog [options] -g [GENOME] -b [SORTED BAMFILE(S)] -i [INPUTFILE] -o [OUTPUTFOLDER]

    The input (-i) is either a readable .bed file, any other readable file
    (parsed as a tab delimited .gff), or a coordinate line such as
    chr1:+:1-1000.  The regions are turned into summary tables by
    makeBamPlotTables, an R command built by callRPlot is written to a bash
    file and run, and the temp folder is removed afterwards unless
    --save-temp is given.

    Invalid input (unreadable bam, bad coordinate line or strand, unsupported
    genome, missing output folder, wrong number of names, unknown plot style)
    prints an error and exits with status 1.
    """
    parser = argparse.ArgumentParser(usage='%(prog)s [options]')

    def _fail(message, showHelp=False):
        # Report a fatal input problem and stop with a non-zero status.
        print(message)
        if showHelp:
            parser.print_help()
        sys.exit(1)

    # required flags
    parser.add_argument(
        "-b", "--bam", dest="bam", nargs='*',
        help="Enter a comma separated list of .bam files to be processed.",
        required=True)
    parser.add_argument(
        "-i", "--input", dest="input", type=str,
        help="Enter .gff or genomic region e.g. chr1:+:1-1000.",
        required=True)
    parser.add_argument(
        "-g", "--genome", dest="genome", type=str,
        help="specify a genome, HG18,HG19,MM8,MM9,MM10 are currently supported",
        required=True)

    # output flag
    parser.add_argument(
        "-o", "--output", dest="output", type=str,
        help="Enter the output folder.",
        required=True)

    # additional options
    parser.add_argument(
        "--stretch-input", dest="stretch_input", default=None, type=int,
        help="Stretch the input regions to a minimum length in bp, e.g. 10000 (for 10kb)")
    parser.add_argument(
        "-c", "--color", dest="color", default=None,
        help="Enter a colon separated list of colors e.g. 255,0,0:255,125,0, default samples the rainbow")
    parser.add_argument(
        "-s", "--sense", dest="sense", default='both',
        help="Map to '+','-' or 'both' strands. Default maps to both.")
    parser.add_argument(
        "-e", "--extension", dest="extension", default=200,
        help="Extends reads by n bp. Default value is 200bp")
    parser.add_argument(
        "-r", "--rpm", dest="rpm", action='store_true', default=False,
        help="Normalizes density to reads per million (rpm) Default is False")
    parser.add_argument(
        "-y", "--yScale", dest="yScale", default="relative",
        help="Choose either relative or uniform y axis scaling. options = 'relative,uniform' Default is relative scaling")
    parser.add_argument(
        "-n", "--names", dest="names", default=None,
        help="Enter a comma separated list of names for your bams")
    parser.add_argument(
        "-p", "--plot", dest="plot", default="MULTIPLE",
        help="Choose either all lines on a single plot or multiple plots. options = 'SINGLE,MULTIPLE,MERGE'")
    parser.add_argument(
        "-t", "--title", dest="title", default='',
        help="Specify a title for the output plot(s), default will be the coordinate region")
    parser.add_argument(
        "--scale", dest="scale", default='',
        help="Enter a comma separated list of scaling factors for your bams. Default is none")

    # DEBUG OPTION TO SAVE TEMP FILES
    parser.add_argument(
        "--save-temp", dest="save", action='store_true', default=False,
        help="If flagged will save temporary files made by bamPlot")
    # --bed takes a single value that is split on ',' below
    parser.add_argument(
        "--bed", dest="bed",
        help="Add a comma-separated list of bed files to plot")
    parser.add_argument(
        "--multi-page", dest="multi", action='store_true', default=False,
        help="If flagged will create a new pdf for each region")

    args = parser.parse_args()

    print(args)

    if not (args.bam and args.input and args.genome and args.output):
        parser.print_help()
        sys.exit()

    # Support a legacy mode where a ',' delimited multiple files
    bamFileList = args.bam
    if len(args.bam) == 1:
        bamFileList = args.bam[0].split(',')

    # Make sure these are actually files & readable (!)
    # An explicit check is used because asserts vanish under python -O.
    for filename in bamFileList:
        if not os.access(filename, os.R_OK):
            _fail('ERROR: UNABLE TO READ BAM FILE %s' % (filename))

    # bringing in any beds
    if args.bed:
        bedFileList = args.bed
        if isinstance(bedFileList, str):
            bedFileList = args.bed.split(',')
        print(bedFileList)
        bedCollection = makeBedCollection(bedFileList)
    else:
        bedCollection = utils.LocusCollection([], 50)

    # Load the input for graphing. One of:
    # - A .gff
    # - A .bed
    # - a specific input region (e.g. chr10:.:93150000-93180000)
    valid_sense_options = {'+', '-', '.'}
    if os.access(args.input, os.R_OK):
        if args.input.endswith('.bed'):
            # Uniquely graph every input of this bed
            parsed_input_bed = utils.parseTable(args.input, '\t')
            gffName = os.path.basename(args.input)  # Graph title
            gff = None
            try:
                if parsed_input_bed[0][5] in valid_sense_options:
                    # This .bed might have a sense parameter
                    gff = [[e[0], '', args.input, e[1], e[2], '', e[5], '', '']
                           for e in parsed_input_bed]
            except IndexError:
                # Too few columns for a strand; fall through to the default.
                pass

            if gff is None:
                print("Your bed doesn't have a valid senese parameter. Defaulting to both strands, '.'")
                # We only take chr/start/stop and ignore everything else.
                gff = [[e[0], '', args.input, e[1], e[2], '', '.', '', '']
                       for e in parsed_input_bed]
        else:
            # Default to .gff, since that's the original behavior
            gff = utils.parseTable(args.input, '\t')
            gffName = args.input.split('/')[-1].split('.')[0]
    else:
        # means a coordinate line has been given e.g. chr1:+:1-100
        chromLine = args.input.split(':')
        try:
            chrom = chromLine[0]
            sense = chromLine[1]
            # A missing or malformed start-end part is reported the same
            # way as a missing strand.
            [start, end] = chromLine[2].split('-')
        except (IndexError, ValueError):
            _fail('Invalid input line or inaccessible file. Try: chr1:.:1-5000')
        if sense not in valid_sense_options:
            _fail("ERROR: STRAND MUST BE ONE OF '+', '-' OR '.'")
        if chrom[0:3] != 'chr':
            _fail('ERROR: UNRECOGNIZED GFF OR CHROMOSOME LINE INPUT')
        gffLine = [chrom, '', args.input, start, end, '', sense, '', '']
        gffName = "%s_%s_%s_%s" % (chrom, sense, start, end)
        gff = [gffLine]

    # Consider stretching the regions to a fixed minimum size
    if args.stretch_input:
        print('Stretching inputs to a minimum of: %d bp' % (args.stretch_input))
        minLength = args.stretch_input
        stretchGff = []
        for e in gff:
            difference = int(e[4]) - int(e[3])
            if difference < minLength:
                # Pad both sides by half of the missing length.
                pad = (minLength - difference) // 2
                stretchGff.append([
                    e[0], e[1], e[2],
                    int(e[3]) - pad,
                    int(e[4]) + pad,
                    e[5], e[6], e[7], e[8]
                ])
            else:
                stretchGff.append(e)
        gff = stretchGff

    # Sanity test the gff object: all strands are sane
    # NOTE: start < stop ordering is not checked here.
    if not all(e[6] in valid_sense_options for e in gff):
        _fail("ERROR: EVERY INPUT REGION MUST HAVE A STRAND OF '+', '-' OR '.'")

    # bring in the genome
    genome = args.genome.upper()
    if genome not in ('HG18', 'HG19', 'HG19_RIBO', 'MM9', 'MM10', 'RN4'):
        _fail('ERROR: UNSUPPORTED GENOME TYPE %s. USE HG19,HG18, RN4, MM9, or MM10' % (genome),
              showHelp=True)

    # bring in the rest of the options

    # output
    rootFolder = args.output
    if rootFolder[-1] != '/':
        rootFolder += '/'
    try:
        os.listdir(rootFolder)
    except OSError:
        _fail('ERROR: UNABLE TO FIND OUTPUT DIRECTORY %s' % (rootFolder))

    # Get analysis title
    if len(args.title) == 0:
        title = gffName
    else:
        title = args.title

    # make a temp folder
    tempFolder = rootFolder + title + '/'
    print("CREATING TEMP FOLDER %s" % (tempFolder))
    pipeline_dfci.formatFolder(tempFolder, create=True)

    # colors
    if args.color:
        colorList = [x.split(',') for x in args.color.split(':')]
        if len(colorList) < len(bamFileList):
            print('WARNING: FEWER COLORS THAN BAMS SPECIFIED. COLORS WILL BE RECYCLED')
            # recycling the color list; floor division keeps the repeat
            # count an integer
            colorList += colorList * (len(bamFileList) // len(colorList))
            colorList = colorList[0:len(bamFileList)]
    else:
        # cycles through the colors of the rainbow
        colorList = tasteTheRainbow(len(bamFileList))

    # sense
    sense = args.sense

    extension = int(args.extension)

    rpm = args.rpm

    scale = args.scale

    yScale = args.yScale.upper()

    # names
    if args.names:
        names = args.names.split(',')
        if len(names) != len(bamFileList):
            _fail('ERROR: NUMBER OF NAMES AND NUMBER OF BAMS DO NOT CORRESPOND',
                  showHelp=True)
    else:
        names = [x.split('/')[-1] for x in bamFileList]

    # plot style
    plotStyle = args.plot.upper()
    if plotStyle not in ('SINGLE', 'MULTIPLE', 'MERGE'):
        _fail('ERROR: PLOT STYLE %s NOT AN OPTION' % (plotStyle), showHelp=True)

    # now run!
    summaryTableFileName = makeBamPlotTables(gff, genome, bamFileList, colorList, nBins,
                                             sense, extension, rpm, tempFolder, names,
                                             title, bedCollection, scale)
    print("%s is the summary table" % (summaryTableFileName))

    # running the R command to plot
    multi = args.multi
    outFile = "%s%s_plots.pdf" % (rootFolder, title)
    rCmd = callRPlot(summaryTableFileName, outFile, yScale, plotStyle, multi)

    # write the R command to a bash file and run it
    bashFileName = "%s%s_Rcmd.sh" % (tempFolder, title)
    with open(bashFileName, 'w') as bashFile:
        bashFile.write('#!/usr/bin/bash\n')
        bashFile.write(rCmd)
    print("Wrote R command to %s" % (bashFileName))
    os.system("bash %s" % (bashFileName))

    # delete temp files
    if not args.save:
        if utils.checkOutput(outFile, 1, 10):
            # This is super dangerous (!). The guards are explicit checks
            # (not asserts, which vanish under -O) and compare by value.
            if " " in tempFolder or tempFolder == "/":
                _fail('ERROR: REFUSING TO REMOVE TEMP FOLDER %s' % (tempFolder))
            removeCommand = "rm -rf %s" % (tempFolder)
            print(removeCommand)
            os.system(removeCommand)
        else:
            print("ERROR: NO OUTPUT FILE %s DETECTED" % (outFile))
#mask file: the hg19 ENCODE blacklist bed kept under the project folder
maskFile = '%smasks/hg19_encode_blacklist.bed' % projectFolder

#directory holding the hg19 chromosome sequence files
genomeDirectory = '/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/'

#every standard project folder is passed through formatFolder with True
folderList = [
    gffFolder,
    macsFolder,
    macsEnrichedFolder,
    mappedEnrichedFolder,
    mappedFolder,
    wiggleFolder,
    metaFolder,
    metaRoseFolder,
    fastaFolder,
    figureCodeFolder,
    figuresFolder,
    geneListFolder,
    bedFolder,
    signalFolder,
    tableFolder,
]

for folder in folderList:
    pipeline_dfci.formatFolder(folder, True)

#==========================================================================
#============================LIST OF DATAFILES=============================
#==========================================================================

#this project will utilize multiple datatables
#data tables are organized largely by type/system
#some data tables overlap for ease of analysis

#ATAC-Seq data table
atac_dataFile = '%sdata_tables/ATAC_TABLE.txt' % projectFolder

#ChIP-Seq data tables
be2c_dataFile = '%sdata_tables/BE2C_TABLE.txt' % projectFolder
mm1s_dataFile = '%sdata_tables/MM1S_TABLE.txt' % projectFolder
#PATH TO YOUR PROJECT FOLDER
projectFolder = '/grail/projects/%s/' % projectName

#standard folder names, all directly under the project folder
gffFolder = projectFolder + 'gff/'
macsFolder = projectFolder + 'macsFolder/'
macsEnrichedFolder = projectFolder + 'macsEnriched/'
mappedEnrichedFolder = projectFolder + 'mappedEnriched/'
mappedFolder = projectFolder + 'mappedFolder/'
wiggleFolder = projectFolder + 'wiggles/'
metaFolder = projectFolder + 'meta/'

#each standard folder is passed through formatFolder with True
folderList = [
    gffFolder,
    macsFolder,
    macsEnrichedFolder,
    mappedEnrichedFolder,
    mappedFolder,
    wiggleFolder,
    metaFolder,
]

for folder in folderList:
    pipeline_dfci.formatFolder(folder,True)

#==========================================================================
#========================FORMATTING SAMPLE TABLE===========================
#==========================================================================

##THIS SECTION CREATES A DATA TABLE FROM A WHITEHEAD ANNOTATION SPREADSHEET

##give full path
##sampleTableFile = 'YOUR_WIGTC_ANNOTATION.xls' #<- the .xls file in the seq data folder provided by WI

#dirpath = '' <- provide full path of folder containing raw seq files
##e.g. /ark/home/jr246/raw/130925_..../QualityScore/
def finishRankOutput(
    dataFile,
    rankOutput,
    genome,
    mergeFolder,
    mergeName,
    name1,
    name2,
    cutOff=1.5,
    window=100000,
    pipelineDir="/ark/home/cl512/pipeline/",
):
    """
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond
    a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained loss

    dataFile     data table, loaded with pipeline_dfci.loadDataTable; the
                 "bam" entries of name1 and name2 are used for plotting
    rankOutput   tab delimited rank table; column 6 is compared to cutOff,
                 columns 9-11 hold comma separated gene lists
    genome       genome build, upper cased for all output names
    mergeFolder  parent folder; output goes to mergeFolder + "output/"
    mergeName    name used in every output filename
    name1,name2  dataset names; rows above +cutOff are labelled name2
                 (gained), rows below -cutOff are labelled name1 (lost),
                 everything else is CONSERVED
    cutOff       threshold applied to column 6 (coerced to float)
    window       flank in bp added to both sides for the windowed gffs
                 (coerced to int)
    pipelineDir  directory changed into before bamPlot_turbo.py is called
                 by relative name; defaults to the previously hard-coded path

    returns None; all results are written to disk
    """
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    # making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    # floor division keeps the KB label an integer (100KB, never 100.0KB)
    windowKB = window // 1000
    genome = genome.upper()

    # make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder + "output/", True)

    # bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput, "\t")

    # make a new formatted table
    header = rankEnhancerTable[0]
    header[-4] = "DELTA RANK"
    header[-3] = "IS_SUPER"
    formattedRankTable = [header]

    # the gffs
    gainedGFF = []
    lostGFF = []
    gainedWindowGFF = []
    lostWindowGFF = []

    # the beds, each starting with its track header line
    gainedTrackHeader = (
        'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0'
        % (genome, name2, genome, name2, name1)
    )
    gainedBed = [[gainedTrackHeader]]
    conservedTrackHeader = (
        'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0'
        % (genome, name1, name2, genome, name1, name2)
    )
    conservedBed = [[conservedTrackHeader]]
    lostTrackHeader = (
        'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0'
        % (genome, name1, genome, name1, name2)
    )
    lostBed = [[lostTrackHeader]]

    # the genes
    geneTable = [
        [
            "GENE",
            "ENHANCER_ID",
            "ENHANCER_CHROM",
            "ENHANCER_START",
            "ENHANCER_STOP",
            header[6],
            header[7],
            header[8],
            "STATUS",
        ]
    ]

    for line in rankEnhancerTable[1:]:
        # fixing the enhancer ID
        line[0] = line[0].replace("_lociStitched", "")
        formattedRankTable.append(line)

        # getting the genes from the three gene columns, dropping empty
        # entries and duplicates
        geneList = []
        for geneField in (line[9], line[10], line[11]):
            geneList += geneField.split(",")
        geneList = utils.uniquify([x for x in geneList if len(x) > 0])
        geneString = ",".join(geneList)

        bedLine = [line[1], line[2], line[3], line[0], line[-4]]

        delta = float(line[6])
        if delta > cutOff:
            # for gained
            geneStatus = name2
            targets = (gainedGFF, gainedWindowGFF, gainedBed)
        elif delta < -cutOff:
            # for lost
            geneStatus = name1
            targets = (lostGFF, lostWindowGFF, lostBed)
        else:
            # for conserved
            geneStatus = "CONSERVED"
            targets = None

        if targets is None:
            conservedBed.append(bedLine)
        else:
            regionGFF, regionWindowGFF, regionBed = targets
            # the windowed start is clamped at 1 so that regions close to a
            # chromosome start never get a negative coordinate
            windowStart = max(1, int(line[2]) - window)
            windowStop = int(line[3]) + window
            regionGFF.append([line[1], line[0], "", line[2], line[3], "", ".", "", geneString])
            regionWindowGFF.append(
                [line[1], line[0], "", windowStart, windowStop, "", ".", "", geneString]
            )
            regionBed.append(bedLine)

        # now fill in the gene Table
        for gene in geneList:
            geneTable.append(
                [gene, line[0], line[1], line[2], line[3], line[6], line[7], line[8], geneStatus]
            )

    # concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    # start writing the output
    # there's the two gffs, the bed,the formatted table, the gene table

    # formatted table
    formattedFilename = "%s%s_%s_MERGED_SUPERS_RANK_TABLE.txt" % (outputFolder, genome, mergeName)
    utils.unParseTable(formattedRankTable, formattedFilename, "\t")

    # gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder + "gff/", True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff" % (gffFolder, genome, mergeName, name2.upper())
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff" % (
        gffFolder,
        genome,
        mergeName,
        name2.upper(),
        windowKB,
        windowKB,
    )

    gffFilename_lost = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff" % (gffFolder, genome, mergeName, name1.upper())
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff" % (
        gffFolder,
        genome,
        mergeName,
        name1.upper(),
        windowKB,
        windowKB,
    )

    utils.unParseTable(gainedGFF, gffFilename_gained, "\t")
    utils.unParseTable(gainedWindowGFF, gffFilenameWindow_gained, "\t")

    utils.unParseTable(lostGFF, gffFilename_lost, "\t")
    utils.unParseTable(lostWindowGFF, gffFilenameWindow_lost, "\t")

    # bed
    bedFilename = "%s%s_%s_MERGED_SUPERS.bed" % (outputFolder, genome, mergeName)
    utils.unParseTable(fullBed, bedFilename, "\t")

    # geneTable
    geneFilename = "%s%s_%s_MERGED_SUPERS_GENE_TABLE.txt" % (outputFolder, genome, mergeName)
    utils.unParseTable(geneTable, geneFilename, "\t")

    # finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_SUPERS_DELTA.pdf" % (mergeFolder, name1, outputFolder, genome, mergeName)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_SUPERS_RANK_PLOT.png" % (
        mergeFolder,
        name1,
        outputFolder,
        genome,
        mergeName,
    )
    os.system(cmd)

    # now execute the bamPlot_turbo.py commands
    bam1 = dataDict[name1]["bam"]
    bam2 = dataDict[name2]["bam"]
    bamString = "%s,%s" % (bam1, bam2)
    nameString = "%s,%s" % (name1, name2)
    colorString = "0,0,0:100,100,100"

    # change dir; bamPlot_turbo.py is called by relative name below
    os.chdir(pipelineDir)

    # (gff, title) pairs in run order: gained, gained window, lost, lost window
    plotJobs = []
    if len(gainedGFF) > 0:
        plotJobs.append((gffFilename_gained, "%s_ONLY_SE" % (name2)))
        plotJobs.append((gffFilenameWindow_gained, "%s_ONLY_SE_%sKB_WINDOW" % (name2, windowKB)))
    if len(lostGFF) > 0:
        plotJobs.append((gffFilename_lost, "%s_ONLY_SE" % (name1)))
        plotJobs.append((gffFilenameWindow_lost, "%s_ONLY_SE_%sKB_WINDOW" % (name1, windowKB)))

    for plotGFF, plotTitle in plotJobs:
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome,
            bamString,
            plotGFF,
            outputFolder,
            nameString,
            colorString,
            plotTitle,
        )
        os.system(cmd)

    return
def finishRankOutput(dataFile, rankOutput, genome, mergeFolder, mergeName, name1, name2, cutOff=1.5, window=100000, superOnly=True, plotBam=True):
    '''
    Turn the ranked enhancer table into the final set of output files.

    Everything is written under <mergeFolder>output/:
      - the rank table with a re-labelled header and cleaned enhancer ids
      - gffs of the gained (name2 only) and lost (name1 only) regions, both
        as-is and flanked by +/- window bp
      - one bed holding the gained, conserved and lost regions
      - a gene table with one row per (gene, enhancer) pair and its status
      - copies of the ROSE pdf and rank plot png
    A region is gained when its column 6 value is above cutOff, lost when it
    is below -cutOff, and conserved otherwise.

    When plotBam is set, bamPlot_turbo.py is run from pipelineDir on every
    non-empty gained/lost gff.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    # making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    genome = genome.upper()
    windowKb = window / 1000

    # make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder + 'output/', True)

    # bring in the old rank table and relabel its header in place
    rankEnhancerTable = utils.parseTable(rankOutput, '\t')
    header = rankEnhancerTable[0]
    header[-4] = 'DELTA RANK'
    header[-3] = 'IS_SUPER'
    formattedRankTable = [header]

    # the gffs
    gainedGFF, lostGFF = [], []
    gainedWindowGFF, lostWindowGFF = [], []

    # label used in file names plus the bed track headers
    if superOnly:
        enhancerType = 'SUPERS'
        gainedTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (
            genome, name2, genome, name2, name1)
        conservedTrackHeader = 'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (
            genome, name1, name2, genome, name1, name2)
        lostTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (
            genome, name1, genome, name1, name2)
    else:
        enhancerType = 'ENHANCERS'
        gainedTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (
            genome, name2, genome, name2, name1)
        conservedTrackHeader = 'track name="%s %s and %s enhancers" description="%s enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (
            genome, name1, name2, genome, name1, name2)
        lostTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (
            genome, name1, genome, name1, name2)

    gainedBed = [[gainedTrackHeader]]
    conservedBed = [[conservedTrackHeader]]
    lostBed = [[lostTrackHeader]]

    # the genes
    geneTable = [[
        'GENE', 'ENHANCER_ID', 'ENHANCER_CHROM', 'ENHANCER_START',
        'ENHANCER_STOP', header[6], header[7], header[8], 'STATUS'
    ]]

    for row in rankEnhancerTable[1:]:
        # fixing the enhancer ID (the row object is shared with the formatted table)
        row[0] = row[0].replace('_lociStitched', '')
        formattedRankTable.append(row)

        # genes come from three comma separated columns; drop blanks and duplicates
        candidates = row[9].split(',') + row[10].split(',') + row[11].split(',')
        geneList = utils.uniquify([gene for gene in candidates if gene])
        geneString = ','.join(geneList)

        bedLine = [row[1], row[2], row[3], row[0], row[-4]]

        # decide which bucket the region belongs to
        delta = float(row[6])
        if delta > cutOff:
            geneStatus = name2
            gffRows, windowRows, bedRows = gainedGFF, gainedWindowGFF, gainedBed
        elif delta < (-1 * cutOff):
            geneStatus = name1
            gffRows, windowRows, bedRows = lostGFF, lostWindowGFF, lostBed
        else:
            geneStatus = 'CONSERVED'
            gffRows = windowRows = None
            bedRows = conservedBed

        # only gained/lost regions get gff entries
        if gffRows is not None:
            gffLine = [row[1], row[0], '', row[2], row[3], '', '.', '', geneString]
            gffWindowLine = [
                row[1], row[0], '', int(row[2]) - window,
                int(row[3]) + window, '', '.', '', geneString
            ]
            gffRows.append(gffLine)
            windowRows.append(gffWindowLine)
        bedRows.append(bedLine)

        # now fill in the gene table
        for gene in geneList:
            geneTable.append([
                gene, row[0], row[1], row[2], row[3], row[6], row[7], row[8],
                geneStatus
            ])

    # concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    # start writing the output: formatted table, gffs, bed, gene table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (
        outputFolder, genome, mergeName, enhancerType)
    utils.unParseTable(formattedRankTable, formattedFilename, '\t')

    # gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder + 'gff/', True)
    upperName1 = name1.upper()
    upperName2 = name2.upper()
    gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (
        gffFolder, genome, mergeName, upperName2, enhancerType)
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (
        gffFolder, genome, mergeName, upperName2, enhancerType, windowKb, windowKb)
    gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (
        gffFolder, genome, mergeName, upperName1, enhancerType)
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (
        gffFolder, genome, mergeName, upperName1, enhancerType, windowKb, windowKb)

    gffOutputs = [
        (gainedGFF, gffFilename_gained),
        (gainedWindowGFF, gffFilenameWindow_gained),
        (lostGFF, gffFilename_lost),
        (lostWindowGFF, gffFilenameWindow_lost),
    ]
    for gffTable, gffPath in gffOutputs:
        utils.unParseTable(gffTable, gffPath, '\t')

    # bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder, genome, mergeName, enhancerType)
    utils.unParseTable(fullBed, bedFilename, '\t')

    # geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (
        outputFolder, genome, mergeName, enhancerType)
    utils.unParseTable(geneTable, geneFilename, '\t')

    # finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_%s_DELTA.pdf" % (
        mergeFolder, name1, outputFolder, genome, mergeName, enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_%s_RANK_PLOT.png" % (
        mergeFolder, name1, outputFolder, genome, mergeName, enhancerType)
    os.system(cmd)

    if not plotBam:
        return

    # now execute the bamPlot_turbo.py commands
    bam1 = dataDict[name1]['bam']
    bam2 = dataDict[name2]['bam']
    bamString = "%s,%s" % (bam1, bam2)
    nameString = "%s,%s" % (name1, name2)
    colorString = "0,0,0:100,100,100"

    # change dir so that bamPlot_turbo.py resolves relative to the pipeline
    os.chdir(pipelineDir)

    # gained regions are plotted first, then lost; each as-is and with the window
    plotJobs = [
        (gainedGFF, name2, gffFilename_gained, gffFilenameWindow_gained),
        (lostGFF, name1, gffFilename_lost, gffFilenameWindow_lost),
    ]
    for regions, sampleName, gffPath, gffWindowPath in plotJobs:
        if len(regions) == 0:
            continue
        titledInputs = [
            ("%s_ONLY_SE" % (sampleName), gffPath),
            ("%s_ONLY_SE_%sKB_WINDOW" % (sampleName, window / 1000), gffWindowPath),
        ]
        for plotTitle, inputGFF in titledInputs:
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (
                genome, bamString, inputGFF, outputFolder, nameString,
                colorString, plotTitle)
            os.system(cmd)

    return
def main():
    '''
    main run function

    Command line driver for the dynamic enhancer analysis of two datasets.
    It merges the ROSE enhancer calls of both datasets (callMergeSupers),
    runs the delta R script and the ROSE gene mapper on the merged regions,
    ranks the merged enhancers, draws the rank plots and finally writes the
    formatted output through finishRankOutput.

    Exits with status 1 when required options are missing or malformed, or
    when an expected intermediate file was not produced.
    '''
    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -d [DATAFILE] -n [DATA_NAMES] -r [ROSE_FOLDERS] -o [OUTPUT_FOLDER]"
    parser = OptionParser(usage=usage)

    # required flags
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (HG18,HG19,MM9,RN4,RN6) for the project")
    parser.add_option("-d", "--data", dest="data", nargs=1, default=None,
                      help="Enter the data file for the project")
    parser.add_option("-r", "--rose", dest="rose", nargs=1, default=None,
                      help="Enter a comma separated list of rose folder")
    parser.add_option("-o", "--output", dest="output", nargs=1, default=None,
                      help="Enter the output folder for the project")
    parser.add_option("-n", "--names", dest="names", nargs=1, default=None,
                      help="Enter a comma separated list of names to go with the datasets")

    # additional options
    parser.add_option("-p", "--plot", dest="plot", action='store_true', default=False,
                      help="If flagged, will plot differential regions")
    parser.add_option("-a", "--all", dest="all", action='store_true', default=False,
                      help="If flagged, will run analysis for all enhancers and not just supers.")
    parser.add_option("-m", "--median", dest="median", action='store_true', default=False,
                      help="If flagged, will use median enhancer scaling")
    parser.add_option("-e", "--enhancer-type", dest="enhancer_type", nargs=1, default='super',
                      help="specify type of enhancer to analyze: super, stretch, superStretch")

    (options, args) = parser.parse_args()

    print(options)
    print(args)

    required = (options.genome, options.data, options.rose, options.output, options.names)
    if not all(required):
        parser.print_help()
        # a missing required flag is a usage error, so do not report success
        sys.exit(1)

    genome = options.genome.upper()
    dataFile = options.data

    # both lists must describe exactly two datasets; check before unpacking
    # so a bad value gives a usage message instead of a traceback
    roseFolders = options.rose.split(',')
    names = options.names.split(',')
    if len(roseFolders) != 2 or len(names) != 2:
        print('ERROR: -r AND -n EACH REQUIRE EXACTLY TWO COMMA SEPARATED ENTRIES')
        parser.print_help()
        sys.exit(1)
    roseFolder1, roseFolder2 = roseFolders
    name1, name2 = names

    parentFolder = utils.formatFolder(options.output, True)
    mergeName = "%s_%s_merged" % (name1, name2)

    # option for median scaling
    medianScale = options.median

    plotBam = options.plot
    superOnly = not options.all

    if superOnly:
        if plotBam:
            print("Running dynamic enhancer analysis on all super enhancers in %s and %s and plotting output to %s" % (
                name1, name2, parentFolder))
        else:
            print("Running dynamic enhancer analysis on all super enhancers in %s and %s and writing output to %s" % (
                name1, name2, parentFolder))
    else:
        if plotBam:
            print("Running dynamic enhancer analysis on all enhancers in %s and %s and plotting output to %s. WARNING: Plotting all differential enhancers could take a while" % (
                name1, name2, parentFolder))
        else:
            print("Running dynamic enhancer analysis on all enhancers in %s and %s and writing output to %s." % (
                name1, name2, parentFolder))

    # part 1
    print("PART1: analyzing ROSE output from %s and %s" % (name1, name2))

    # start with the all enhancer tables from the initial rose calls
    roseFolder1 = pipeline_dfci.formatFolder(roseFolder1, False)
    roseFolder2 = pipeline_dfci.formatFolder(roseFolder2, False)

    roseDict1 = makeRoseDict(roseFolder1)
    roseDict2 = makeRoseDict(roseFolder2)

    # choosing the type of enhancer to analyze
    enhancerCallType = options.enhancer_type.lower()
    if superOnly:
        print("ANALYZING ENHANCER TYPE: %s" % (enhancerCallType.upper()))

    # looked up in both modes: the rank plot step below always needs them
    superFile1 = roseDict1[enhancerCallType]
    superFile2 = roseDict2[enhancerCallType]

    allFile1 = roseDict1['AllEnhancer']
    allFile2 = roseDict2['AllEnhancer']

    print('\tMERGING ENHANCERS AND CALLING ROSE')
    if superOnly:
        if len(superFile1) == 0:
            print("ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType, roseFolder1))
            sys.exit(1)
        if len(superFile2) == 0:
            print("ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType, roseFolder2))
            sys.exit(1)
        callMergeSupers(dataFile, superFile1, superFile2, name1, name2,
                        mergeName, genome, parentFolder)
    else:
        callMergeSupers(dataFile, allFile1, allFile2, name1, name2,
                        mergeName, genome, parentFolder)

    print('\tCALCULATING ENHANCER DELTA AND MAKING PLOTS')

    # part2 is the R script
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (
        parentFolder, genome.upper(), mergeName)
    rcmd = callDeltaRScript(mergedGFFFile, parentFolder, dataFile, name1,
                            name2, allFile1, allFile2, medianScale)
    print(rcmd)
    os.system(rcmd)

    # NOTE(review): presumably gives the R output time to land on disk - confirm
    time.sleep(30)
    callRoseGeneMapper(mergedGFFFile, genome, parentFolder, name1)

    # part 3: rank the delta
    print("PART 3: assinging ranks to differential enhancers")
    print('\tASSIGNING SUPER RANK TO MERGED ENHANCERS')

    gffName = '%s_%s_MERGED_REGIONS_-0_+0' % (genome.upper(), mergeName)
    enhancerToGeneFile = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB.txt" % (
        parentFolder, name1, gffName)
    if not utils.checkOutput(enhancerToGeneFile):
        print('ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN')
        sys.exit(1)

    rankOutput = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB_RANK.txt" % (
        parentFolder, name1, gffName)
    assignEnhancerRank(enhancerToGeneFile, allFile1, allFile2, name1, name2,
                       rankOutput)

    # make the rank plot
    print('MAKING RANK PLOTS')
    if not utils.checkOutput(rankOutput):
        print('ERROR: RANK PLOT SCRIPT FAILED TO RUN')
        sys.exit(1)

    rcmd = callRankRScript(rankOutput, name1, name2, superFile1, superFile2)
    print(rcmd)
    os.system(rcmd)

    time.sleep(30)

    print('FINISHING OUTPUT')
    finishRankOutput(dataFile, rankOutput, genome, parentFolder, mergeName,
                     name1, name2, 1, 100000, superOnly, plotBam)
def main():
    '''
    main run function

    Command line driver for bamPlot: reads a gff file (or a single
    chrom:sense:start-end coordinate line), builds the per-bam density
    summary table with makeBamPlotTables and runs the R plotting command
    returned by callRPlot through a small bash script.

    Exits with status 1 on missing options or invalid input.
    '''
    import sys
    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -b [SORTED BAMFILE(S)] -i [INPUTFILE] -o [OUTPUTFOLDER]"
    parser = OptionParser(usage=usage)

    # required flags
    parser.add_option("-b", "--bam", dest="bam", nargs=1, default=None,
                      help="Enter a comma separated list of .bam files to be processed.")
    parser.add_option("-i", "--input", dest="input", nargs=1, default=None,
                      help="Enter .gff or genomic region e.g. chr1:+:1-1000.")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="specify a genome, options are hg18 or mm9 right now")

    # output flag
    parser.add_option("-o", "--output", dest="output", nargs=1, default=None,
                      help="Enter the output folder.")

    # additional options
    parser.add_option("-c", "--color", dest="color", nargs=1, default=None,
                      help="Enter a colon separated list of colors e.g. 255,0,0:255,125,0, default samples the rainbow")
    parser.add_option("-s", "--sense", dest="sense", nargs=1, default='both',
                      help="Map to '+','-' or 'both' strands. Default maps to both.")
    parser.add_option("-e", "--extension", dest="extension", nargs=1, default=200,
                      help="Extends reads by n bp. Default value is 200bp")
    # the help text used to claim "Default is True" although the flag defaults to False
    parser.add_option("-r", "--rpm", dest="rpm", action='store_true', default=False,
                      help="Normalizes density to reads per million (rpm) Default is False")
    parser.add_option("-y", "--yScale", dest="yScale", nargs=1, default="relative",
                      help="Choose either relative or uniform y axis scaling. options = 'relative,uniform' Default is relative scaling")
    parser.add_option("-n", "--names", dest="names", nargs=1, default=None,
                      help="Enter a comma separated list of names for your bams")
    parser.add_option("-p", "--plot", dest="plot", nargs=1, default="multiple",
                      help="Choose either all lines on a single plot or multiple plots. options = 'single,multiple'")
    parser.add_option("-t", "--title", dest="title", nargs=1, default='',
                      help="Specify a title for the output plot(s), default will be the coordinate region")

    (options, args) = parser.parse_args()

    print(options)
    print(args)

    if not (options.bam and options.input and options.genome and options.output):
        parser.print_help()
        sys.exit(1)

    # bring in the bams
    bamFileList = options.bam.split(',')

    # bring in the gff
    try:
        gff = parseTable(options.input, '\t')
        gffName = options.input.split('/')[-1].split('.')[0]
    except IOError:
        # means a coordinate line has been given e.g. chr1:+:1-100
        chromLine = options.input.split(':')

        chrom = chromLine[0]
        sense = chromLine[1]
        [start, end] = chromLine[2].split('-')
        if chrom[0:3] != 'chr':
            print('ERROR: UNRECOGNIZED GFF OR CHROMOSOME LINE INPUT')
            sys.exit(1)
        gffLine = [chrom, '', options.input, start, end, '', sense, '', '']
        gffName = "%s_%s_%s_%s" % (chrom, sense, start, end)
        gff = [gffLine]

    # bring in the genome
    genome = options.genome.upper()
    if genome not in ('HG18', 'HG19', 'MM9', 'RN5'):
        # message now lists every build accepted above
        print('ERROR: UNSUPPORTED GENOME TYPE %s. USE HG18, HG19, MM9, OR RN5' % (genome))
        parser.print_help()
        sys.exit(1)

    # bring in the rest of the options

    # output
    rootFolder = options.output
    if rootFolder[-1] != '/':
        rootFolder += '/'
    try:
        os.listdir(rootFolder)
    except OSError:
        # '%S' is not a valid conversion and raised ValueError here
        print('ERROR: UNABLE TO FIND OUTPUT DIRECTORY %s' % (rootFolder))
        sys.exit(1)

    # Get analysis title
    if len(options.title) == 0:
        title = gffName
    else:
        title = options.title

    # make a temp folder
    tempFolder = rootFolder + title + '/'
    print("CREATING TEMP FOLDER %s" % (tempFolder))
    pipeline_dfci.formatFolder(tempFolder, create=True)

    # colors
    if options.color:
        colorList = options.color.split(':')
        colorList = [x.split(',') for x in colorList]
        if len(colorList) < len(bamFileList):
            print('WARNING: FEWER COLORS THAN BAMS SPECIFIED. COLORS WILL BE RECYCLED')
            # recycling the color list; floor division keeps the repeat count an int
            colorList += colorList * (len(bamFileList) // len(colorList))
            colorList = colorList[0:len(bamFileList)]
    else:
        # cycles through the colors of the rainbow
        colorList = tasteTheRainbow(len(bamFileList))

    # sense
    sense = options.sense

    extension = int(options.extension)

    rpm = options.rpm

    yScale = options.yScale.upper()

    # names
    if options.names:
        names = options.names.split(',')
        if len(names) != len(bamFileList):
            print('ERROR: NUMBER OF NAMES AND NUMBER OF BAMS DO NOT CORRESPOND')
            parser.print_help()
            sys.exit(1)
    else:
        names = [x.split('/')[-1] for x in bamFileList]

    # plot style
    plotStyle = options.plot.upper()
    if plotStyle not in ('SINGLE', 'MULTIPLE'):
        print('ERROR: PLOT STYLE %s NOT AN OPTION' % (plotStyle))
        parser.print_help()
        sys.exit(1)

    # now run! (nBins is not set in this function; it must come from module scope)
    summaryTableFileName = makeBamPlotTables(gff, genome, bamFileList, colorList, nBins,
                                             sense, extension, rpm, tempFolder, names, title)
    print("%s is the summary table" % (summaryTableFileName))

    outFile = "%s%s_plots.pdf" % (rootFolder, title)
    rCmd = callRPlot(summaryTableFileName, outFile, yScale, plotStyle)

    # write the R command to a bash file and run it
    bashFileName = "%s%s_Rcmd.sh" % (tempFolder, title)
    with open(bashFileName, 'w') as bashFile:
        bashFile.write(rCmd)
    print("Wrote R command to %s" % (bashFileName))
    os.system("bash %s" % (bashFileName))
def main():
    '''
    main run function

    Command line driver for the dynamic enhancer analysis of two datasets.
    It merges the ROSE enhancer calls of both datasets (callMergeSupers),
    runs the delta R script and the ROSE gene mapper on the merged regions,
    ranks the merged enhancers, draws the rank plots and finally writes the
    formatted output through finishRankOutput.

    Exits with status 1 when required options are missing or malformed, or
    when an expected intermediate file was not produced.
    '''
    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -d [DATAFILE] -n [DATA_NAMES] -r [ROSE_FOLDERS] -o [OUTPUT_FOLDER]"
    parser = OptionParser(usage=usage)

    # required flags
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (HG18,HG19,MM9,RN4,RN6) for the project")
    parser.add_option("-d", "--data", dest="data", nargs=1, default=None,
                      help="Enter the data file for the project")
    parser.add_option("-r", "--rose", dest="rose", nargs=1, default=None,
                      help="Enter a comma separated list of rose folder")
    parser.add_option("-o", "--output", dest="output", nargs=1, default=None,
                      help="Enter the output folder for the project")
    parser.add_option("-n", "--names", dest="names", nargs=1, default=None,
                      help="Enter a comma separated list of names to go with the datasets")

    # additional options
    parser.add_option("-p", "--plot", dest="plot", action='store_true', default=False,
                      help="If flagged, will plot differential regions")
    parser.add_option("-a", "--all", dest="all", action='store_true', default=False,
                      help="If flagged, will run analysis for all enhancers and not just supers.")
    parser.add_option("-m", "--median", dest="median", action='store_true', default=False,
                      help="If flagged, will use median enhancer scaling")
    parser.add_option("-e", "--enhancer-type", dest="enhancer_type", nargs=1, default='super',
                      help="specify type of enhancer to analyze: super, stretch, superStretch")

    (options, args) = parser.parse_args()

    print(options)
    print(args)

    required = (options.genome, options.data, options.rose, options.output, options.names)
    if not all(required):
        parser.print_help()
        # a missing required flag is a usage error, so do not report success
        sys.exit(1)

    genome = options.genome.upper()
    dataFile = options.data

    # both lists must describe exactly two datasets; check before unpacking
    # so a bad value gives a usage message instead of a traceback
    roseFolders = options.rose.split(',')
    names = options.names.split(',')
    if len(roseFolders) != 2 or len(names) != 2:
        print('ERROR: -r AND -n EACH REQUIRE EXACTLY TWO COMMA SEPARATED ENTRIES')
        parser.print_help()
        sys.exit(1)
    roseFolder1, roseFolder2 = roseFolders
    name1, name2 = names

    parentFolder = utils.formatFolder(options.output, True)
    mergeName = "%s_%s_merged" % (name1, name2)

    # option for median scaling
    medianScale = options.median

    plotBam = options.plot
    superOnly = not options.all

    if superOnly:
        if plotBam:
            print("Running dynamic enhancer analysis on all super enhancers in %s and %s and plotting output to %s" % (
                name1, name2, parentFolder))
        else:
            print("Running dynamic enhancer analysis on all super enhancers in %s and %s and writing output to %s" % (
                name1, name2, parentFolder))
    else:
        if plotBam:
            print("Running dynamic enhancer analysis on all enhancers in %s and %s and plotting output to %s. WARNING: Plotting all differential enhancers could take a while" % (
                name1, name2, parentFolder))
        else:
            print("Running dynamic enhancer analysis on all enhancers in %s and %s and writing output to %s." % (
                name1, name2, parentFolder))

    # part 1
    print("PART1: analyzing ROSE output from %s and %s" % (name1, name2))

    # start with the all enhancer tables from the initial rose calls
    roseFolder1 = pipeline_dfci.formatFolder(roseFolder1, False)
    roseFolder2 = pipeline_dfci.formatFolder(roseFolder2, False)

    roseDict1 = makeRoseDict(roseFolder1)
    roseDict2 = makeRoseDict(roseFolder2)

    # choosing the type of enhancer to analyze
    enhancerCallType = options.enhancer_type.lower()
    if superOnly:
        print("ANALYZING ENHANCER TYPE: %s" % (enhancerCallType.upper()))

    # looked up in both modes: the rank plot step below always needs them
    superFile1 = roseDict1[enhancerCallType]
    superFile2 = roseDict2[enhancerCallType]

    allFile1 = roseDict1['AllEnhancer']
    allFile2 = roseDict2['AllEnhancer']

    print('\tMERGING ENHANCERS AND CALLING ROSE')
    if superOnly:
        if len(superFile1) == 0:
            print("ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType, roseFolder1))
            sys.exit(1)
        if len(superFile2) == 0:
            print("ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType, roseFolder2))
            sys.exit(1)
        callMergeSupers(dataFile, superFile1, superFile2, name1, name2,
                        mergeName, genome, parentFolder)
    else:
        callMergeSupers(dataFile, allFile1, allFile2, name1, name2,
                        mergeName, genome, parentFolder)

    print('\tCALCULATING ENHANCER DELTA AND MAKING PLOTS')

    # part2 is the R script
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (
        parentFolder, genome.upper(), mergeName)
    rcmd = callDeltaRScript(mergedGFFFile, parentFolder, dataFile, name1,
                            name2, allFile1, allFile2, medianScale)
    print(rcmd)
    os.system(rcmd)

    # NOTE(review): presumably gives the R output time to land on disk - confirm
    time.sleep(30)
    callRoseGeneMapper(mergedGFFFile, genome, parentFolder, name1)

    # part 3: rank the delta
    print("PART 3: assinging ranks to differential enhancers")
    print('\tASSIGNING SUPER RANK TO MERGED ENHANCERS')

    gffName = '%s_%s_MERGED_REGIONS_-0_+0' % (genome.upper(), mergeName)
    enhancerToGeneFile = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB.txt" % (
        parentFolder, name1, gffName)
    if not utils.checkOutput(enhancerToGeneFile):
        print('ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN')
        sys.exit(1)

    rankOutput = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB_RANK.txt" % (
        parentFolder, name1, gffName)
    assignEnhancerRank(enhancerToGeneFile, allFile1, allFile2, name1, name2,
                       rankOutput)

    # make the rank plot
    print('MAKING RANK PLOTS')
    if not utils.checkOutput(rankOutput):
        print('ERROR: RANK PLOT SCRIPT FAILED TO RUN')
        sys.exit(1)

    rcmd = callRankRScript(rankOutput, name1, name2, superFile1, superFile2)
    print(rcmd)
    os.system(rcmd)

    time.sleep(30)

    print('FINISHING OUTPUT')
    finishRankOutput(dataFile, rankOutput, genome, parentFolder, mergeName,
                     name1, name2, 1, 100000, superOnly, plotBam)
def makeGEORNATable(dataFile, namesList, geoName, outputFolder=''):
    '''
    makes a geo table and a bash script to format everything

    Writes <outputFolder><geoName>_meta.xls, a tab separated table with one
    row per sample, and <outputFolder><geoName>_bash.sh, a script that
    copies (or untars) each sample's fastq into <outputFolder><geoName>/ as
    <name>.fastq.gz and appends its md5sum to md5sum.txt.

    dataFile: data table path handed to pipeline_dfci.loadDataTable
    namesList: sample names to include; an empty list selects every sample
        in the data table. The caller's list is not modified.
    geoName: prefix of the generated files and name of the staging folder
    outputFolder: folder receiving the table and the script
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    # work on a sorted copy so the caller's list is never reordered
    if len(namesList) == 0:
        sampleNames = sorted(dataDict.keys())
    else:
        sampleNames = sorted(namesList)

    # set up bash script and output folder
    outputFolder = pipeline_dfci.formatFolder(outputFolder, True)
    bashFileName = '%s%s_bash.sh' % (outputFolder, geoName)

    geoTable = [['SAMPLE_NAME', 'TITLE', 'CELL_TYPE', 'RAW_FILE', 'BARCODE']]

    for name in sampleNames:
        uniqueID = dataDict[name]['uniqueID']
        cell_type = name.split('_')[0]
        raw_file = "%s.fastq.gz" % (name)

        try:
            barcode = pipeline_dfci.getTONYInfo(uniqueID, 38)
        except IndexError:
            # no barcode could be looked up for this sample; leave it blank
            barcode = ''

        # the unique id doubles as the sample name, the dataset name as the title
        geoTable.append([uniqueID, name, cell_type, raw_file, barcode])

    utils.unParseTable(geoTable, "%s%s_meta.xls" % (outputFolder, geoName), '\t')

    # now make the folder to hold everything and the relevant bash script
    if len(outputFolder) == 0:
        outputFolder = './%s/' % (geoName)
    else:
        outputFolder = outputFolder + geoName + '/'

    pipeline_dfci.formatFolder(outputFolder, True)

    # now make the bash file
    with open(bashFileName, 'w') as bashFile:
        bashFile.write('#!/usr/bin/bash\n')
        bashFile.write('cd %s\n' % (outputFolder))
        bashFile.write('\n\n')

        # write the untar/copy commands, remembering which samples were staged
        stagedNames = []
        for name in sampleNames:
            fastqFile = dataDict[name]['fastq']
            if len(fastqFile) == 0:
                print("NO FASTQ FILE FOR %s" % (name))
                continue
            if fastqFile.count('tar.gz') > 0:
                # for files generated by whitehead that have tar header
                tarCmd = 'tar --strip-components 5 --to-stdout -xzvf %s | gzip > %s.fastq.gz\n' % (
                    fastqFile, name)
            else:
                tarCmd = 'cp %s %s.fastq.gz\n' % (fastqFile, name)
            bashFile.write(tarCmd)
            stagedNames.append(name)

        bashFile.write('\n\n\n')
        bashFile.write('\n\n\n')
        bashFile.write("echo '' > md5sum.txt\n")

        # write md5sums only for fastqs the script actually creates; samples
        # skipped above have no <name>.fastq.gz for md5sum to read
        for name in stagedNames:
            bashFile.write('md5sum %s.fastq.gz >> md5sum.txt\n' % (name))

        # the big tar command (left commented out in the generated script)
        bashFile.write('#tar -cvzf %s.tar.gz %s\n' % (geoName, outputFolder))
def main():
    """
    main run function

    Command line driver for bamPlot. The input may be a .gff file, a .bed
    file or a single chrom:sense:start-end coordinate line. Builds the
    per-bam density summary table with makeBamPlotTables, runs the R
    plotting command returned by callRPlot through a small bash script and,
    unless --save-temp is given, removes the temporary folder once the
    output pdf is detected.

    Exits with status 1 on missing options or invalid input.
    """
    parser = argparse.ArgumentParser(usage='%(prog)s [options]')

    # required flags
    parser.add_argument("-b", "--bam", dest="bam", nargs='*',
                        help="Enter a comma separated list of .bam files to be processed.", required=True)
    parser.add_argument("-i", "--input", dest="input", type=str,
                        help="Enter .gff or genomic region e.g. chr1:+:1-1000.", required=True)
    parser.add_argument("-g", "--genome", dest="genome", type=str,
                        help="specify a genome, HG18,HG19,MM8,MM9,MM10 are currently supported", required=True)

    # output flag
    parser.add_argument("-o", "--output", dest="output", type=str,
                        help="Enter the output folder.", required=True)

    # additional options
    parser.add_argument("--stretch-input", dest="stretch_input", default=None, type=int,
                        help="Stretch the input regions to a minimum length in bp, e.g. 10000 (for 10kb)")
    parser.add_argument("-c", "--color", dest="color", default=None,
                        help="Enter a colon separated list of colors e.g. 255,0,0:255,125,0, default samples the rainbow")
    parser.add_argument("-s", "--sense", dest="sense", default='both',
                        help="Map to '+','-' or 'both' strands. Default maps to both.")
    parser.add_argument("-e", "--extension", dest="extension", default=200,
                        help="Extends reads by n bp. Default value is 200bp")
    parser.add_argument("-r", "--rpm", dest="rpm", action='store_true', default=False,
                        help="Normalizes density to reads per million (rpm) Default is False")
    parser.add_argument("-y", "--yScale", dest="yScale", default="relative",
                        help="Choose either relative or uniform y axis scaling. options = 'relative,uniform' Default is relative scaling")
    parser.add_argument("-n", "--names", dest="names", default=None,
                        help="Enter a comma separated list of names for your bams")
    parser.add_argument("-p", "--plot", dest="plot", default="MULTIPLE",
                        help="Choose either all lines on a single plot or multiple plots. options = 'SINGLE,MULTIPLE,MERGE'")
    parser.add_argument("-t", "--title", dest="title", default='',
                        help="Specify a title for the output plot(s), default will be the coordinate region")

    # DEBUG OPTION TO SAVE TEMP FILES
    parser.add_argument("--scale", dest="scale", default='',
                        help="Enter a comma separated list of scaling factors for your bams. Default is none")
    parser.add_argument("--save-temp", dest="save", action='store_true', default=False,
                        help="If flagged will save temporary files made by bamPlot")
    # the value is a single string that is split on ',' below
    parser.add_argument("--bed", dest="bed",
                        help="Add a comma separated list of bed files to plot")
    parser.add_argument("--multi-page", dest="multi", action='store_true', default=False,
                        help="If flagged will create a new pdf for each region")

    args = parser.parse_args()

    print(args)

    if not (args.bam and args.input and args.genome and args.output):
        parser.print_help()
        sys.exit(1)

    # Support a legacy mode where a ',' delimited multiple files
    bamFileList = args.bam
    if len(args.bam) == 1:
        bamFileList = args.bam[0].split(',')

    # Make sure these are actually files & readable (!)
    # explicit check rather than assert, which is skipped under python -O
    for filename in bamFileList:
        if not os.access(filename, os.R_OK):
            print('ERROR: UNABLE TO READ BAM FILE %s' % (filename))
            sys.exit(1)

    # bringing in any beds
    if args.bed:
        bedFileList = args.bed
        if isinstance(bedFileList, str):
            bedFileList = args.bed.split(',')
        print(bedFileList)
        bedCollection = makeBedCollection(bedFileList)
    else:
        bedCollection = utils.LocusCollection([], 50)

    # Load the input for graphing. One of:
    # - A .gff
    # - A .bed
    # - a specific input region (e.g. chr10:.:93150000-93180000)

    valid_sense_options = {'+', '-', '.'}
    if os.access(args.input, os.R_OK):
        if args.input.endswith('.bed'):
            # Uniquely graph every input of this bed
            parsed_input_bed = utils.parseTable(args.input, '\t')
            gffName = os.path.basename(args.input)  # Graph title
            gff = None
            try:
                if parsed_input_bed[0][5] in valid_sense_options:
                    # This .bed might have a sense parameter
                    gff = [[e[0], '', args.input, e[1], e[2], '', e[5], '', ''] for e in parsed_input_bed]
            except IndexError:
                # fewer than six columns: handled by the default below
                pass

            if gff is None:
                print("Your bed doesn't have a valid sense parameter. Defaulting to both strands, '.'")
                # We only take chr/start/stop and ignore everything else.
                gff = [[e[0], '', args.input, e[1], e[2], '', '.', '', ''] for e in parsed_input_bed]
        else:
            # Default to .gff, since that's the original behavior
            gff = utils.parseTable(args.input, '\t')
            gffName = args.input.split('/')[-1].split('.')[0]
    else:
        # means a coordinate line has been given e.g. chr1:+:1-100
        chromLine = args.input.split(':')
        try:
            chrom = chromLine[0]
            sense = chromLine[1]
            # a missing or malformed start-end field is reported the same way
            [start, end] = chromLine[2].split('-')
        except (IndexError, ValueError):
            print('Invalid input line or inaccessible file. Try: chr1:.:1-5000')
            sys.exit(1)
        if sense not in valid_sense_options:
            print("ERROR: UNRECOGNIZED STRAND '%s'. USE '+', '-' OR '.'" % (sense))
            sys.exit(1)
        if chrom[0:3] != 'chr':
            print('ERROR: UNRECOGNIZED GFF OR CHROMOSOME LINE INPUT')
            sys.exit(1)
        gffLine = [chrom, '', args.input, start, end, '', sense, '', '']
        gffName = "%s_%s_%s_%s" % (chrom, sense, start, end)
        gff = [gffLine]

    # Consider stretching the regions to a fixed minimum size
    if args.stretch_input:
        print('Stretching inputs to a minimum of: %d bp' % (args.stretch_input))
        minLength = args.stretch_input
        stretchGff = []
        for e in gff:
            difference = int(e[4]) - int(e[3])
            if difference < minLength:
                # pad both sides equally to reach the minimum length
                pad = int((minLength - difference) / 2)
                stretchGff.append([e[0], e[1], e[2], int(e[3]) - pad, int(e[4]) + pad, e[5], e[6], e[7], e[8]])
            else:
                stretchGff.append(e)

        gff = stretchGff

    # Sanity test the gff object: all strands are sane
    if not all(e[6] in valid_sense_options for e in gff):
        print('ERROR: INPUT CONTAINS A REGION WITH AN INVALID STRAND')
        sys.exit(1)

    # bring in the genome
    genome = args.genome.upper()
    if genome not in ('HG18', 'HG19', 'HG19_RIBO', 'HG38', 'MM9', 'MM10', 'RN4', 'RN6'):
        print('ERROR: UNSUPPORTED GENOME TYPE %s. USE HG19,HG18, RN4, MM9, or MM10' % (genome))
        parser.print_help()
        sys.exit(1)

    # bring in the rest of the options

    # output
    rootFolder = args.output
    if rootFolder[-1] != '/':
        rootFolder += '/'
    try:
        os.listdir(rootFolder)
    except OSError:
        print('ERROR: UNABLE TO FIND OUTPUT DIRECTORY %s' % (rootFolder))
        sys.exit(1)

    # Get analysis title
    if len(args.title) == 0:
        title = gffName
    else:
        title = args.title

    # make a temp folder
    tempFolder = rootFolder + title + '/'
    print("CREATING TEMP FOLDER %s" % (tempFolder))
    pipeline_dfci.formatFolder(tempFolder, create=True)

    # colors
    if args.color:
        colorList = args.color.split(':')
        colorList = [x.split(',') for x in colorList]
        if len(colorList) < len(bamFileList):
            print('WARNING: FEWER COLORS THAN BAMS SPECIFIED. COLORS WILL BE RECYCLED')
            # recycling the color list; floor division keeps the repeat count an int
            colorList += colorList * (len(bamFileList) // len(colorList))
            colorList = colorList[0:len(bamFileList)]
    else:
        # cycles through the colors of the rainbow
        colorList = tasteTheRainbow(len(bamFileList))

    # sense
    sense = args.sense

    extension = int(args.extension)

    rpm = args.rpm

    scale = args.scale

    yScale = args.yScale.upper()

    # names
    if args.names:
        names = args.names.split(',')
        if len(names) != len(bamFileList):
            print('ERROR: NUMBER OF NAMES AND NUMBER OF BAMS DO NOT CORRESPOND')
            parser.print_help()
            sys.exit(1)
    else:
        names = [x.split('/')[-1] for x in bamFileList]

    # plot style
    plotStyle = args.plot.upper()
    if plotStyle not in ('SINGLE', 'MULTIPLE', 'MERGE'):
        print('ERROR: PLOT STYLE %s NOT AN OPTION' % (plotStyle))
        parser.print_help()
        sys.exit(1)

    # now run! (nBins is not set in this function; it must come from module scope)
    summaryTableFileName = makeBamPlotTables(gff, genome, bamFileList, colorList, nBins, sense,
                                             extension, rpm, tempFolder, names, title,
                                             bedCollection, scale)
    print("%s is the summary table" % (summaryTableFileName))

    # running the R command to plot
    multi = args.multi
    outFile = "%s%s_plots.pdf" % (rootFolder, title)
    rCmd = callRPlot(summaryTableFileName, outFile, yScale, plotStyle, multi)

    # open a bash file
    bashFileName = "%s%s_Rcmd.sh" % (tempFolder, title)
    with open(bashFileName, 'w') as bashFile:
        bashFile.write('#!/usr/bin/bash\n')
        bashFile.write(rCmd)
    print("Wrote R command to %s" % (bashFileName))
    os.system("bash %s" % (bashFileName))

    # delete temp files
    if not args.save:
        if utils.checkOutput(outFile, 1, 10):
            # This is super dangerous (!), so refuse anything suspicious.
            # '!=' compares the text; the former 'is not "/"' compared identity.
            if " " in tempFolder or tempFolder == "/":
                print("ERROR: REFUSING TO DELETE SUSPICIOUS TEMP FOLDER %s" % (tempFolder))
                sys.exit(1)
            removeCommand = "rm -rf %s" % (tempFolder)
            print(removeCommand)
            os.system(removeCommand)
        else:
            print("ERROR: NO OUTPUT FILE %s DETECTED" % (outFile))
def makeGEOTable(dataFile, wiggleFolder, macsFolder, namesList, geoName, outputFolder=''):
    '''
    makes a geo table and a bash script to format everything

    dataFile: data table, loaded with pipeline_dfci.loadDataTable
    wiggleFolder: folder holding the <name>_treat_afterfiting_all.wig.gz files
    macsFolder: folder holding the per sample MACS output (used for the
        control wiggles of WCE/INPUT datasets)
    namesList: dataset names to include; an empty list means every dataset
        in the data table. The list passed in is NOT modified
    geoName: prefix for the metadata table, the bash script and the sub folder
    outputFolder: folder in which <geoName>_meta.xls and <geoName>_bash.sh
        are written

    Returns None. Writes <geoName>_meta.xls and <geoName>_bash.sh.
    Raises KeyError if a name is missing from the data table, or if a
    WCE/INPUT dataset is not the background of any dataset in namesList.
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #work on a private copy so that the caller's list is never re-ordered
    #(list() also keeps this working when keys() is a view and not a list)
    if len(namesList) == 0:
        namesList = list(dataDict.keys())
    else:
        namesList = list(namesList)

    #first make a reverse wce dict (background name -> a dataset that uses it)
    #built before sorting so that the last dataset in the caller's order wins
    backgroundDict = {}
    for name in namesList:
        background = dataDict[name]['background']
        backgroundDict[background] = name

    namesList.sort()

    outputFolder = pipeline_dfci.formatFolder(outputFolder, True)
    bashFileName = '%s%s_bash.sh' % (outputFolder, geoName)

    geoTable = [[
        'SAMPLE_NAME',
        'TITLE',
        'CELL_TYPE',
        'PROCESSED_FILE',
        'RAW_FILE',
        'BARCODE',
    ]]

    for name in namesList:
        print(name)

        sampleName = dataDict[name]['uniqueID']
        cell_type = name.split('_')[0]
        processed_file = "%s.wig.gz" % (name)
        raw_file = "%s.fastq.gz" % (name)

        #a failed barcode lookup just leaves the column empty
        try:
            barcode = pipeline_dfci.getTONYInfo(sampleName, 38)
        except IndexError:
            barcode = ''

        newLine = [
            sampleName,
            name,
            cell_type,
            processed_file,
            raw_file,
            barcode,
        ]
        print(newLine)
        geoTable.append(newLine)

    utils.unParseTable(geoTable, "%s%s_meta.xls" % (outputFolder, geoName), '\t')

    #now make the folder to hold everything and the relevant bash script
    if len(outputFolder) == 0:
        outputFolder = './%s/' % (geoName)
    else:
        outputFolder = outputFolder + geoName + '/'

    pipeline_dfci.formatFolder(outputFolder, True)

    wiggleFolder = pipeline_dfci.formatFolder(wiggleFolder, False)
    macsFolder = pipeline_dfci.formatFolder(macsFolder, False)

    #now make the bash file
    #the with block guarantees the script is flushed and closed even if a
    #lookup below raises
    with open(bashFileName, 'w') as bashFile:

        bashFile.write('#!/usr/bin/bash\n')
        bashFile.write('cd %s\n' % (outputFolder))
        bashFile.write('\n')

        #write the fastq cp commands
        #remember which datasets have a fastq so md5sums are only requested
        #for files that the script actually copies
        namesWithFastq = []
        for name in namesList:
            fastqFile = dataDict[name]['fastq']
            if len(fastqFile) == 0:
                print("NO FASTQ FILE FOR %s" % (name))
                continue
            namesWithFastq.append(name)
            bashFile.write('cp %s %s.fastq.gz\n' % (fastqFile, name))

        bashFile.write('\n\n\n')

        #write the wiggle cp command
        for name in namesList:
            if name.count('WCE') == 1 or name.count('INPUT') == 1:
                #control wiggles live in the MACS folder of a dataset that
                #used this background
                refName = backgroundDict[name]
                controlWiggleFile = '%s%s/%s_MACS_wiggle/control/%s_control_afterfiting_all.wig.gz' % (
                    macsFolder, refName, refName, refName)
                wigCmd = "cp '%s' %s.wig.gz\n" % (controlWiggleFile, name)
            else:
                wigCmd = "cp '%s%s_treat_afterfiting_all.wig.gz' %s.wig.gz\n" % (
                    wiggleFolder, name, name)
            bashFile.write(wigCmd)

        #write the md5sums for the wiggles
        bashFile.write('\n\n\n')
        bashFile.write("echo '' > md5sum.txt\n")
        for name in namesList:
            bashFile.write('md5sum %s.wig.gz >> md5sum.txt\n' % (name))

        #write md5sums for the fastqs
        for name in namesWithFastq:
            bashFile.write('md5sum %s.fastq.gz >> md5sum.txt\n' % (name))

        #the big tar command (written commented out, to be run by hand)
        bashFile.write('#tar -cvzf %s.tar.gz %s\n' % (geoName, outputFolder))
def main():
    '''
    main run function

    parses the command line, merges the enhancer calls of the two meta rose
    folders, runs the delta/rank/region R scripts and the gene mappers and
    finally hands everything to finishRankOutput.

    exits with status 1 if a required argument is missing or if any step
    fails to produce its expected output file.
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -d [DATAFILE] {-r [ROSE_FOLDERS] | -i [INPUT_GFF]} -o [OUTPUT_FOLDER] --group1 [GROUP1_NAMES] --group2 [GROUP2_NAMES] --name1 [GROUP1_NAME] --name2 [GROUP2_NAME]"
    parser = OptionParser(usage = usage)

    #required flags
    parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None,
                      help = "Enter the genome build (HG18,HG19,MM9,RN4) for the project")
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None,
                      help = "Enter the data file for the project")
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the output folder for the project")
    parser.add_option("--group1", dest="group1",nargs = 1, default=None,
                      help = "Enter a comma separated list of dataset names associated with the first group")
    parser.add_option("--group2", dest="group2",nargs = 1, default=None,
                      help = "Enter a comma separated list of dataset names associated with the second group")
    parser.add_option("--name1", dest="name1",nargs = 1, default=None,
                      help = "Enter a name for the first group of datasets")
    parser.add_option("--name2", dest="name2",nargs = 1, default=None,
                      help = "Enter a name for the second group of datasets")

    #the input options
    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None,
                      help = "Enter a comma separated list of meta rose folders")

    #optional input to supercede the meta rose (this is kinda sad but will fix later)
    #should have had this code run clustering from the get go
    parser.add_option("-i","--input", dest="input",nargs = 1, default=None,
                      help = "enter a gff, bed or table of regions to perform dyanmic analysis on")

    #additional options
    parser.add_option("-p","--plot", dest="plot",action = 'store_true', default=False,
                      help = "If flagged, will plot differential regions")
    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False,
                      help = "If flagged, will run analysis for all enhancers and not just supers.")
    parser.add_option("-m","--median", dest="median",action = 'store_true', default=False,
                      help = "If flagged, will use median enhancer scaling")
    parser.add_option("-e","--enhancer-type", dest="enhancer_type",nargs = 1,default='super',
                      help = "specify type of enhancer to analyze: super, stretch, superStretch")
    parser.add_option("--use-background", dest="background",action = 'store_true',default=False,
                      help = "If flagged will use background datasets as in data table")

    (options,args) = parser.parse_args()

    print(options)
    print(args)

    #explicit check instead of an assert: asserts are stripped by python -O
    #NOTE: -r is still required even when -i is given, the rose folders are
    #used below regardless of the input gff
    requiredArgs = [options.genome,options.data,options.rose,options.output,options.group1,options.group2,options.name1,options.name2]
    if not all(requiredArgs):
        parser.print_help()
        sys.exit(1)

    #now the main run of the function

    #getting the genome and data file
    genome = options.genome.upper()
    dataFile = options.data

    #getting the rose folders
    roseFolderString = options.rose
    [roseFolder1,roseFolder2] = roseFolderString.split(',')
    parentFolder = utils.formatFolder(options.output,True)

    #getting the analysis names
    name1 = options.name1
    name2 = options.name2
    mergeName = "%s_%s_merged" % (name1,name2)

    #getting the datasets names associated with each group
    namesList1 = options.group1.split(',')
    namesList2 = options.group2.split(',')

    #options for background correction
    useBackground = options.background

    #option for median scaling
    medianScale = options.median

    #option for an overriding set of input regions
    if options.input is not None:
        #for now only works w/ gffs
        print('Using %s as a set of predifined input regions' % (options.input))
        inputGFF = options.input
    else:
        inputGFF = ''

    plotBam = options.plot
    superOnly = not options.all

    if superOnly and plotBam:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and plotting output to %s" % (name1,name2,parentFolder))
    if superOnly and not plotBam:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and writing output to %s" % (name1,name2,parentFolder))
    if not superOnly and plotBam:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and plotting output to %s. WARNING: Plotting all differential enhancers could take a while" % (name1,name2,parentFolder))
    if not superOnly and not plotBam:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and writing output to %s." % (name1,name2,parentFolder))

    #part 1
    print("PART1: analyzing ROSE output from %s and %s" % (name1,name2))

    #start with the all enhancer tables from the initial rose calls
    roseFolder1 = pipeline_dfci.formatFolder(roseFolder1,False)
    roseFolder2 = pipeline_dfci.formatFolder(roseFolder2,False)

    roseDict1 = makeRoseDict(roseFolder1)
    roseDict2 = makeRoseDict(roseFolder2)

    #choosing the type of enhancer to analyze
    enhancerCallType = options.enhancer_type.lower()
    if superOnly:
        print("ANALYZING ENHANCER TYPE: %s" % (enhancerCallType.upper()))

    #the super files are needed for the rank plot even when running on all
    #enhancers, so they are always looked up
    superFile1 = roseDict1[enhancerCallType]
    superFile2 = roseDict2[enhancerCallType]

    allFile1 = roseDict1['AllEnhancer']
    allFile2 = roseDict2['AllEnhancer']

    #this is where we can toggle either using meta rose or clustering
    print('\tMERGING ENHANCERS AND CALLING ROSE')
    if superOnly:
        if len(superFile1) == 0:
            print("ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType,roseFolder1))
            sys.exit(1)
        if len(superFile2) == 0:
            print("ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType,roseFolder2))
            sys.exit(1)
        roseOutput = callMergeSupers(dataFile,superFile1,superFile2,name1,name2,mergeName,genome,parentFolder,namesList1,namesList2,useBackground,inputGFF)
    else:
        print('doing it right')
        print(allFile1)
        print(allFile2)
        roseOutput = callMergeSupers(dataFile,allFile1,allFile2,name1,name2,mergeName,genome,parentFolder,namesList1,namesList2,useBackground,inputGFF)

    print('this is rose output')
    print(roseOutput)

    print('\tMERGING ROSE OUTPUT')
    mergedRoseOutput,normRoseOutput = mergeRoseSignal(dataFile,roseOutput,roseDict1,roseDict2,name1,name2,namesList1,namesList2,useBackground,medianScale)

    print('\tCALCULATING ENHANCER DELTA AND MAKING PLOTS')

    #part2 is the R script
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (parentFolder,genome.upper(),mergeName)
    rcmd = callDeltaRScript(mergedGFFFile,parentFolder,dataFile,name1,name2,allFile1,allFile2,medianScale,namesList1)
    print(rcmd)
    os.system(rcmd)

    time.sleep(5)
    callRoseGeneMapper(mergedGFFFile,genome,parentFolder,namesList1)

    #rank the genes

    #part 3
    #rank the delta
    print("PART 3: assinging ranks to differential enhancers")
    print('\tASSIGNING SUPER RANK TO MERGED ENHANCERS')

    gffName = '%s_%s_MERGED_REGIONS_-0_+0' % (genome.upper(),mergeName)
    enhancerToGeneFile = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_MERGED_ENHANCER_TO_GENE_100KB.txt" % (parentFolder,namesList1[0],gffName)
    if utils.checkOutput(enhancerToGeneFile):
        rankOutput = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_MERGED_ENHANCER_TO_GENE_100KB_RANK.txt" % (parentFolder,namesList1[0],gffName)
        assignEnhancerRank(enhancerToGeneFile,allFile1,allFile2,name1,name2,rankOutput)
    else:
        print('ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN')
        sys.exit(1)

    #make the rank plot
    print('MAKING RANK PLOTS')
    if utils.checkOutput(rankOutput):
        print('checking for rank output %s' % (rankOutput))
        rcmd = callRankRScript(rankOutput,name1,name2,superFile1,superFile2)
        print(rcmd)
        os.system(rcmd)
    else:
        print('ERROR: RANK PLOT SCRIPT FAILED TO RUN')
        sys.exit(1)

    print('MAKING REGION SIGNAL PLOTS AND FINDING DIFFERENTIAL REGIONS')
    if utils.checkOutput(normRoseOutput):
        print('checking for %s' % (normRoseOutput))
        rcmd = callRegionPlotRScript(normRoseOutput,name1,name2,namesList1,namesList2)
        print(rcmd)
        os.system(rcmd)
    else:
        print('ERROR: REGION PLOT SCRIPT FAILED TO RUN')
        sys.exit(1)

    #NOW MAP GENES
    print('mapping genes to differential enhancers')
    statOutput,diffOutput = callRoseGeneMapper_stats(mergedGFFFile,genome,parentFolder,namesList1)

    if utils.checkOutput(statOutput):
        print('checking for gene mapping output %s' % (statOutput))
        print('FINISHED WITH GENE MAPPING')
    else:
        print('GENE MAPPING FAILED')
        sys.exit(1)

    print('FINISHING OUTPUT')
    finishRankOutput(dataFile,statOutput,diffOutput,genome,parentFolder,mergeName,name1,name2,namesList1,namesList2,1.0,100000,superOnly,plotBam)
def finishRankOutput(dataFile,statOutput,diffOutput,genome,mergeFolder,mergeName,name1,name2,namesList1,namesList2,cutOff=1.0,window = 100000,superOnly=True,plotBam=True):
    '''
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond
    a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained loss

    dataFile: data table, loaded with pipeline_dfci.loadDataTable
    statOutput: tab delimited rank table with stats for every region
    diffOutput: tab delimited rank table of the differential regions
    genome: genome build name, upper cased for all file names
    mergeFolder: analysis folder; results go to <mergeFolder>output/
    mergeName: analysis name used in every output file name
    name1,name2: names of the two groups (regions called lost are labelled
        name1, regions called gained are labelled name2)
    namesList1,namesList2: dataset names of each group
    cutOff: threshold applied to column -8 of the stat table
    window: flank in bp added on each side for the windowed gffs
    superOnly: only changes labels (SUPERS vs ENHANCERS) in names and tracks
    plotBam: if True, runs bamPlot_turbo.py on the gained/lost gffs

    Returns None. All results are written to disk and the copy/plot steps
    are run through os.system (their exit status is not checked).
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    #floor division so that file names read 100KB and never 100.0KB
    windowKB = window // 1000
    genome = genome.upper()

    #make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder+'output/',True)

    #bring in the old rank table
    rankEnhancerTable = utils.parseTable(statOutput,'\t')

    #make a new formatted table
    rankHeader = rankEnhancerTable[0]
    headerLength = len(rankHeader)
    formattedRankTable = [rankHeader]

    #the gffs
    gainedGFF = []
    lostGFF = []
    gainedWindowGFF = []
    lostWindowGFF = []

    #labels that differ between a supers only and an all enhancers run
    if superOnly:
        enhancerType = 'SUPERS'
        trackLabel = 'SEs'
        trackDescription = 'super enhancers'
    else:
        enhancerType = 'ENHANCERS'
        trackLabel = 'enhancers'
        trackDescription = 'enhancers'

    #the beds
    gainedTrackHeader = 'track name="%s %s only %s" description="%s %s that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome,name2,trackLabel,genome,trackDescription,name2,name1)
    gainedBed = [[gainedTrackHeader]]
    conservedTrackHeader = 'track name="%s %s and %s %s" description="%s %s that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome,name1,name2,trackLabel,genome,trackDescription,name1,name2)
    conservedBed = [[conservedTrackHeader]]
    lostTrackHeader = 'track name="%s %s only %s" description="%s %s that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome,name1,trackLabel,genome,trackDescription,name1,name2)
    lostBed = [[lostTrackHeader]]

    #the genes
    geneTable = [['GENE','ENHANCER_ID','ENHANCER_CHROM','ENHANCER_START','ENHANCER_STOP',rankHeader[6],rankHeader[7],rankHeader[8],'STATUS']]

    for line in rankEnhancerTable[1:]:
        #fix line lengths
        if len(line) != headerLength:
            line += ['']*(headerLength-len(line))

        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankTable.append(line)

        #getting the genes from the last three columns
        geneList = []
        geneList += line[-1].split(',')
        geneList += line[-2].split(',')
        geneList += line[-3].split(',')
        geneList = [x for x in geneList if len(x) > 0]
        geneList = utils.uniquify(geneList)
        geneString = ','.join(geneList)

        bedLine = [line[1],line[2],line[3],line[0],line[-4]]
        gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]

        #this applies both the statistical test chosen (default fdr <= 0.05) and the cutoff
        #the cutoff is hard wired, but we can add an option to change the test
        #stats are done in the R script. FDR norm can kinda suck if no genes are considered diff
        #column -4 is only parsed once the cutoff test has passed
        delta = float(line[-8])
        if delta > cutOff and int(line[-4]) == 1:
            #for gained
            gainedGFF.append(gffLine)
            gainedWindowGFF.append([line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString])
            geneStatus = name2
            gainedBed.append(bedLine)
        elif delta < (-1 * cutOff) and int(line[-4]) == 1:
            #for lost
            lostGFF.append(gffLine)
            lostWindowGFF.append([line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString])
            geneStatus = name1
            lostBed.append(bedLine)
        else:
            #for conserved
            geneStatus = 'UNCHANGED'
            conservedBed.append(bedLine)

        #now fill in the gene Table
        for gene in geneList:
            geneTable.append([gene,line[0],line[1],line[2],line[3],line[6],line[7],line[8],geneStatus])

    #concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    #start writing the output
    #there's the two gffs, the bed,the formatted table, the gene table

    #formatted table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankTable,formattedFilename,'\t')

    #formatted diff table
    #possible that no genes are differential
    rankEnhancerDiffTable = utils.parseTable(diffOutput,'\t')

    #make a new formatted table
    formattedRankDiffTable = [rankEnhancerDiffTable[0]]
    for line in rankEnhancerDiffTable[1:]:
        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankDiffTable.append(line)

    formattedDiffFilename = "%s%s_%s_MERGED_%s_RANK_DIFF_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankDiffTable,formattedDiffFilename,'\t')

    #gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder+'gff/',True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,name2.upper(),enhancerType)
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,name2.upper(),enhancerType,windowKB,windowKB)

    gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,name1.upper(),enhancerType)
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,name1.upper(),enhancerType,windowKB,windowKB)

    utils.unParseTable(gainedGFF,gffFilename_gained,'\t')
    utils.unParseTable(gainedWindowGFF,gffFilenameWindow_gained,'\t')

    utils.unParseTable(lostGFF,gffFilename_lost,'\t')
    utils.unParseTable(lostWindowGFF,gffFilenameWindow_lost,'\t')

    #bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(fullBed,bedFilename,'\t')

    #geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(geneTable,geneFilename,'\t')

    #finally, move all of the plots to the output folder
    #(source glob in the ROSE folder, suffix of the copy in the output folder)
    #NOTE(review): the unchanged plot used to be copied from the
    #*REGION_LOST* glob; confirm the region plot R script names its
    #unchanged pdf with REGION_UNCHANGED
    plotCopies = [
        ('*DELTA*.pdf','DELTA.pdf'),
        ('*REGION_GAINED*.pdf','REGION_GAINED.pdf'),
        ('*REGION_LOST*.pdf','REGION_LOST.pdf'),
        ('*REGION_UNCHANGED*.pdf','REGION_UNCHANGED.pdf'),
        ('*RANK_PLOT.png','RANK_PLOT.png'),
    ]
    for sourceGlob,suffix in plotCopies:
        cmd = "cp %s%s_ROSE/%s %s%s_%s_MERGED_%s_%s" % (mergeFolder,namesList1[0],sourceGlob,outputFolder,genome,mergeName,enhancerType,suffix)
        os.system(cmd)

    #now execute the bamPlot_turbo.py commands
    if plotBam:
        bamList1 = [dataDict[name]['bam'] for name in namesList1]
        bamList2 = [dataDict[name]['bam'] for name in namesList2]
        bamString = ','.join(bamList1 + bamList2)

        #every dataset is labelled with the name of its group
        nameList = [name1]*len(namesList1) + [name2]*len(namesList2)
        nameString = ','.join(nameList)

        print(namesList1[0])
        print(namesList2[0])

        print(namesList1)
        print(namesList2)
        print(dataDict[namesList1[0]]['color'])

        #one color per group, taken from the first dataset of each group
        #falls back to black/grey if both groups share the same color
        color1 = dataDict[namesList1[0]]['color']
        color2 = dataDict[namesList2[0]]['color']
        if color1 != color2:
            colorList = [color1]*len(namesList1) + [color2]*len(namesList2)
        else:
            colorList = ['0,0,0']*len(namesList1) + ['100,100,100']*len(namesList2)
        colorString = ':'.join(colorList)

        #collect (gff, plot title) pairs, only for non empty gffs
        plotJobs = []
        if len(gainedGFF) > 0:
            #gained command and gained window command
            plotJobs.append((gffFilename_gained,"%s_ONLY_SE" % (name2)))
            plotJobs.append((gffFilenameWindow_gained,"%s_ONLY_SE_%sKB_WINDOW" % (name2,windowKB)))
        if len(lostGFF) > 0:
            #lost command and lost window command
            plotJobs.append((gffFilename_lost,"%s_ONLY_SE" % (name1)))
            plotJobs.append((gffFilenameWindow_lost,"%s_ONLY_SE_%sKB_WINDOW" % (name1,windowKB)))

        for gffFilename,plotTitle in plotJobs:
            cmd = 'python %sbamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MERGE' % (pipelineDir,genome,bamString,gffFilename,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)