def map_regions(dataFile, gffList, names_list=None):
    '''
    making a normalized binding signal table at all regions

    Every selected dataset is mapped to all gffs with a read extension of
    200 minus the first read length reported for its bam, since each bam
    can have a different read length. Afterwards one signal table per gff
    is requested at <signalFolder><gffName>_<dataFileName>_SIGNAL.txt.

    dataFile: path of the data table read by pipeline_dfci.loadDataTable
    gffList: list of gff paths
    names_list: dataset names to process; empty or None means every
        dataset in the data table. Names are processed in sorted order.

    Uses the module-level mappedFolder and signalFolder settings.
    Returns None.
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)
    dataFile_name = dataFile.split('/')[-1].split('.')[0]

    if not names_list:
        names_list = dataDict.keys()
    # sorted() returns a new list: the caller's list is not reordered in
    # place and dict key views are accepted as well
    names_list = sorted(names_list)

    #since each bam has different read lengths, important to carefully normalize quantification
    for name in names_list:
        bam = utils.Bam(dataDict[name]['bam'])
        read_length = bam.getReadLengths()[0]
        bam_extension = 200 - read_length
        print('For dataset %s using an extension of %s' % (name, bam_extension))
        pipeline_dfci.mapBamsBatch(dataFile, gffList, mappedFolder,
                                   overWrite=False, namesList=[name],
                                   extension=bam_extension, rpm=True)

    #want a signal table of all datasets to each gff
    print('Writing signal tables for each gff:')
    for gffFile in gffList:
        gffName = gffFile.split('/')[-1].split('.')[0]
        signal_table_path = '%s%s_%s_SIGNAL.txt' % (signalFolder, gffName, dataFile_name)
        print(signal_table_path)
        pipeline_dfci.makeSignalTable(dataFile, gffFile, mappedFolder,
                                      namesList=names_list, medianNorm=False,
                                      output=signal_table_path)
def calculatePromoterActivity(annotationFile, bamFile, projectName, projectFolder, refseqToNameDict, background=False):
    '''
    calculates the level of acetylation at each TF promoter

    A TSS locus is made for every gene in the annotation with
    utils.makeTSSLocus(gene, startDict, 2500, 2500) (presumably 2.5 kb
    either side of the TSS - confirm against utils), the loci are written
    as <projectFolder><projectName>_TSS.gff and bamliquidator_batch is run
    on that gff with bamFile, writing to <projectFolder>bamliquidator.

    annotationFile: annotation table read by utils.makeStartDict
    bamFile: path of the bam to quantify
    projectName, projectFolder: concatenated directly into output paths,
        so projectFolder needs its trailing separator
    refseqToNameDict: kept for interface compatibility; not used here
    background: kept for interface compatibility; it does not take part
        in the quantification

    Returns None.
    '''

    print('GENERATING AN ACTIVITY TABLE USING CHIP DATA')

    startDict = utils.makeStartDict(annotationFile)
    tssLoci = [utils.makeTSSLocus(gene, startDict, 2500, 2500) for gene in startDict]
    tssCollection = utils.LocusCollection(tssLoci, 50)

    gff = utils.locusCollectionToGFF(tssCollection)
    outputname = projectFolder + projectName + '_TSS.gff'
    utils.unParseTable(gff, outputname, '\t')

    # argument list instead of a shell string: paths containing spaces or
    # shell metacharacters reach bamliquidator_batch unchanged
    mappingCmd = [
        'bamliquidator_batch',
        '-r', outputname,
        '-o', projectFolder + 'bamliquidator',
        '-m',
        '-e', '200',
        bamFile,
    ]
    subprocess.call(mappingCmd)
    print(' '.join(mappingCmd))
def findValleys(gene_to_enhancer_dict, bamFileList, projectName, projectFolder, cutoff=0.2):
    '''
    scores every region associated with each gene and writes all valley
    loci to a single bed file

    gene_to_enhancer_dict: gene -> iterable of region loci
    bamFileList: list of bam paths (a single path string is also accepted)
    projectName, projectFolder: used for the output path and passed on to
        scoreValley
    cutoff: a 10 bp bin is called a valley when its score is above this

    Genes are processed in sorted order. Valley bins of a gene are merged
    with stitchValleys before they are added to the bed table.

    Returns the path of the bed file,
    <projectFolder><projectName>_all_valleys.bed
    '''

    # a bare path string would otherwise be iterated character by character
    if isinstance(bamFileList, str):
        bamFileList = [bamFileList]

    #start w/ a bamFileList and make a list of bam type objects
    bam_list = [utils.Bam(bam_path) for bam_path in bamFileList]
    max_read_length = max([bam.getReadLengths()[0] for bam in bam_list])

    all_valley_bed = []
    ticker = 0
    print('number of regions processed:')
    for gene in sorted(gene_to_enhancer_dict.keys()):

        gene_valleys = []
        for region in gene_to_enhancer_dict[gene]:
            if ticker % 100 == 0:
                print(ticker)
            ticker += 1

            scoreArray = scoreValley(region, bam_list, max_read_length, projectName, projectFolder)
            for index, score in enumerate(scoreArray):
                if score > cutoff:
                    # score i covers the i-th 10 bp bin from the region start
                    bin_start = region.start() + index * 10
                    gene_valleys.append(utils.Locus(region.chr(), bin_start, bin_start + 10, '.'))

        for valley in stitchValleys(gene_valleys):
            all_valley_bed.append([valley.chr(), valley.start(), valley.end()])

    all_bed_path = projectFolder + projectName + '_all_valleys.bed'
    utils.unParseTable(all_valley_bed, all_bed_path, '\t')

    return all_bed_path
def plot_mm_genes(mm1s_dataFile, nb_figure_gff_path, bed_string):
    '''
    plots the MYC and H3K27AC tracks of the MM1S data table over the
    regions in nb_figure_gff_path

    mm1s_dataFile: data table path read by pipeline_dfci.loadDataTable
    nb_figure_gff_path: gff of regions to plot
    bed_string: passed through as the bed argument of callBatchPlot

    Plots go to <genePlotFolder>MM1S/ (genePlotFolder is a module-level
    setting). Returns None.
    '''

    #first establish the plot folder
    plotFolder = utils.formatFolder('%sMM1S/' % (genePlotFolder), True)
    plot_prefix = 'HG19_NB_FIGURE_GENES'

    #we also have to set the extension properly between datasets
    #go by data file
    dataDict = pipeline_dfci.loadDataTable(mm1s_dataFile)
    # materialise the keys so they can be indexed on Python 3 as well
    names_list = list(dataDict.keys())

    # the extension for every plot comes from the first dataset's bam only
    bam = utils.Bam(dataDict[names_list[0]]['bam'])
    read_length = bam.getReadLengths()[0]
    bam_extension = 200 - read_length
    print('For datasets in %s using an extension of %s' % (mm1s_dataFile, bam_extension))

    #first do individuals
    for plot_group in ['MYC', 'H3K27AC']:
        plotList = [name for name in names_list if plot_group in name]
        plotName = '%s_MM1S_%s' % (plot_prefix, plot_group)
        print(plotName)
        pipeline_dfci.callBatchPlot(mm1s_dataFile, nb_figure_gff_path, plotName,
                                    plotFolder, plotList, uniform=True,
                                    bed=bed_string, plotType='MULTIPLE',
                                    extension=bam_extension, multiPage=False,
                                    debug=False, nameString='', rpm=True,
                                    rxGenome='')
def make_summary_table(data_file_list, output, bed_path=''):
    '''
    exports a table w/ name, read length, million mapped reads and number
    of peaks for every dataset in every data file

    data_file_list: list of data table paths
    output: path the tab-delimited summary is written to
    bed_path: only announced on stdout when non-empty; this function does
        not copy any bed files itself

    The peak count is 'NA' when the enriched MACS file under the
    module-level macsEnrichedFolder cannot be read (IOError).
    Returns None.
    '''

    print('WRITING SUMMARY OUTPUT TO %s' % (output))

    if bed_path != '':
        print('COPYING BEDS TO %s' % (bed_path))

    summary_table = [['NAME', 'READ_LENGTH', 'MAPPED_READS', 'PEAKS']]

    for data_file in data_file_list:
        print('GETTING DATA SUMMARY FOR %s' % (data_file))
        dataDict = pipeline_dfci.loadDataTable(data_file)

        # sorted() works on both lists and dict key views
        for name in sorted(dataDict.keys()):
            print(name)
            entry = dataDict[name]

            bam = utils.Bam(entry['bam'])
            read_length = bam.getReadLengths()[0]
            mmr = round(float(bam.getTotalReads()) / 1000000, 2)

            #get the peak count
            try:
                peak_path = '%s%s' % (macsEnrichedFolder, entry['enrichedMacs'])
                peakCount = len(utils.importBoundRegion(peak_path, name))
            except IOError:
                peakCount = 'NA'

            summary_table.append([name, read_length, mmr, peakCount])

    utils.unParseTable(summary_table, output, '\t')
def calculatePromoterActivity(annotationFile, bamFile, projectName, projectFolder, refseqToNameDict):
    '''
    calculates the level of H3K27ac at each promoter from a H3K27ac bam file

    A TSS +/- 1kb locus is made for every gene in the annotation, the loci
    are written as <projectFolder><projectName>_TSS.gff and ./bamToGFF.py
    is run on that gff with bamFile, writing <projectFolder>matrix.gff.

    annotationFile: annotation table read by utils.makeStartDict
    bamFile: path of the H3K27ac bam
    projectName, projectFolder: concatenated directly into output paths,
        so projectFolder needs its trailing separator
    refseqToNameDict: kept for interface compatibility; not used here

    Returns None.
    '''

    print('IDENTIFY EXPRESSED GENES')

    startDict = utils.makeStartDict(annotationFile)
    tssLoci = [utils.makeTSSLocus(gene, startDict, 1000, 1000) for gene in startDict]
    tssCollection = utils.LocusCollection(tssLoci, 50)

    gff = utils.locusCollectionToGFF(tssCollection)
    outputname = projectFolder + projectName + '_TSS.gff'
    utils.unParseTable(gff, outputname, '\t')

    # run bamToGFF.py to quantify signal at each TSS +/- 1kb
    # argument list instead of a shell string: paths containing spaces or
    # shell metacharacters reach the script unchanged
    mappingCmd = [
        'python', './bamToGFF.py',
        '-r',
        '-d',
        '-o', projectFolder + 'matrix.gff',
        '-m', '1',
        '-f', '0',
        '-e', '200',
        '-i', outputname,
        '-b', bamFile,
    ]
    call(mappingCmd)
    print(' '.join(mappingCmd))
def main():
    '''
    command line entry point

    Parses the options, resolves the annotation and TF list for the chosen
    genome build, obtains promoter activity (from the -a table, or by
    running calculatePromoterActivity and reading <output>matrix.gff) and
    then runs: createSuperLoci, createExpressionDict, findCanidateTFs,
    formatOutput, generateSubpeakFASTA, findMotifs, buildNetwork and
    formatNetworkOutput.

    Prints the help text and exits when a required option is missing or
    the genome build is not supported.
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -e [ENHANCER_FILE] -b [BAM_FILE] -g [GENOME] -o [OUTPUTFOLDER] -n [NAME] -s [SUBPEAKS] -x [EXP_CUTOFF] -l [EXTENSION_LENGTH]"
    parser = OptionParser(usage=usage)

    # Required flags
    parser.add_option("-e", "--enhancer_file", dest="enhancers", nargs=1, default=None,
                      help="Provide a ROSE generated enhancer table (_AllEnhancers.table.txt)")
    parser.add_option("-b", "--bam_file", dest="bam", nargs=1, default=None,
                      help="Provide a sorted indexed bam file for H3K27ac sequencing reads")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Provide the build of the genome to be used for the analysis. Currently supports HG19, HG18 and MM9")
    parser.add_option("-f", "--fasta", dest="fasta", nargs=1, default=None,
                      help="Enter location of the fasta files for the genome version used")
    parser.add_option("-s", "--subpeaks", dest="subpeaks", nargs=1, default=None,
                      help="Enter a bedfile of peaks output from MACS used to identify SE constituents")
    parser.add_option("-x", "--exp_Cutoff", dest="expCutoff", nargs=1, default=33,
                      help="Enter the percentage of transcripts that are not considered expressed, default=33")
    parser.add_option("-l", "--extension_length", dest="extension", nargs=1, default=500,
                      help="Enter the length (in bp) to extend constituents for motif search, default=500")
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Enter the sample name")
    parser.add_option("-o", "--output", dest="output", nargs=1, default=None,
                      help="Enter directory to be used for storing output")

    # Options
    parser.add_option("-a", "--activity", dest="activity", nargs=1, default=None,
                      help="Enter a two column table with refseq in the first column and the associated activity (expression or promoter acetylation level) in the second column")
    parser.add_option("-E", "--enhancer_number", dest="Enumber", nargs=1, default='supers',
                      help="Enter the number of top ranked enhancers to include in the anlaysis, default = supers")

    (options, args) = parser.parse_args()

    print(options)

    required = [options.enhancers, options.bam, options.genome, options.fasta,
                options.subpeaks, options.expCutoff, options.extension,
                options.name, options.output]
    if not all(required):
        # Return help
        parser.print_help()
        sys.exit()

    # Set parameters
    genomeDirectory = options.fasta
    genome = options.genome.upper()

    # genome build -> (annotation file, TF list)
    genomeFiles = {
        'HG19': ('./annotation/hg19_refseq.ucsc', './TFlist_NMid_hg.txt'),
        'HG18': ('./annotation/hg18_refseq.ucsc', './TFlist_NMid_hg.txt'),
        'MM9': ('./annotation/mm9_refseq.ucsc', './TFlist_NMid_ms.txt'),
    }
    if genome not in genomeFiles:
        # fail with a clear message rather than a NameError further down
        print('ERROR: unsupported genome build %s' % (genome))
        parser.print_help()
        sys.exit(1)
    annotationFile, TFfile = genomeFiles[genome]

    motifConvertFile = './MotifDictionary.txt'
    motifDatabaseFile = './VertebratePWMs.txt'

    TFtable = utils.parseTable(TFfile, '\t')
    TFlist = [line[0] for line in TFtable]

    superFile = options.enhancers
    superTable = utils.parseTable(superFile, '\t')

    bamFile = options.bam
    subpeaks = options.subpeaks

    # NOTE(review): -x (expCutoff) and -E (Enumber) are parsed but none of
    # the calls below receive them - confirm whether that is intended
    motifExtension = int(options.extension)

    projectName = options.name
    projectFolder = options.output

    refseqToNameDict = {}
    expressionFile = options.activity
    if expressionFile:
        expressionTable = utils.parseTable(expressionFile, '\t')
    else:
        calculatePromoterActivity(annotationFile, bamFile, projectName, projectFolder, refseqToNameDict)
        expresionFilename = projectFolder + 'matrix.gff'
        expressionTable = utils.parseTable(expresionFilename, '\t')

    # Run the program
    superLoci = createSuperLoci(superTable)

    expressedNM = createExpressionDict(annotationFile, projectFolder, projectName, refseqToNameDict, expressionTable)

    TFandSuperDict = findCanidateTFs(annotationFile, superLoci, expressedNM, TFlist, refseqToNameDict, projectFolder, projectName)

    formatOutput(TFandSuperDict, refseqToNameDict, projectName, projectFolder)

    candidateGenes = [refseqToNameDict[x].upper() for x in TFandSuperDict.keys()]
    candidateGenes = utils.uniquify(candidateGenes)

    generateSubpeakFASTA(TFandSuperDict, subpeaks, genomeDirectory, projectName, projectFolder, motifExtension)

    findMotifs(candidateGenes, projectFolder, projectName, motifConvertFile, motifDatabaseFile)

    graph = buildNetwork(projectFolder, projectName, candidateGenes, refseqToNameDict, motifConvertFile)

    formatNetworkOutput(graph, projectFolder, projectName, candidateGenes)
def plot_nb_atac_genes(atac_dataFile, nb_figure_gff_path, bed_string):
    '''
    plots the ATAC tracks of the data table over the regions in
    nb_figure_gff_path: individual tracks first, then merged meta plots
    with relative and with uniform scaling

    atac_dataFile: data table path read by pipeline_dfci.loadDataTable
    nb_figure_gff_path: gff of regions to plot
    bed_string: passed through as the bed argument of callBatchPlot

    Plots go to <genePlotFolder>NB_ATAC/ (genePlotFolder is a module-level
    setting). Returns None.
    '''

    #first establish the plot folder
    plotFolder = utils.formatFolder('%sNB_ATAC/' % (genePlotFolder), True)
    plot_prefix = 'HG19_NB_FIGURE_GENES'

    #we also have to set the extension properly between datasets
    #go by data file
    dataDict = pipeline_dfci.loadDataTable(atac_dataFile)
    # materialise the keys so they can be indexed on Python 3 as well
    names_list = list(dataDict.keys())
    print(names_list)

    # the extension for every plot comes from the first dataset's bam only;
    # read lengths are not checked for consistency across datasets here
    first_bam_path = dataDict[names_list[0]]['bam']
    # report the bam that is actually used for the read length
    print(first_bam_path)
    bam = utils.Bam(first_bam_path)
    read_length = bam.getReadLengths()[0]
    bam_extension = 200 - read_length
    print('For datasets in %s using an extension of %s' % (atac_dataFile, bam_extension))

    #first do individuals
    for plot_group in ['ATAC']:
        plotList = [
            name for name in names_list
            if plot_group in name and 'MM1S' not in name
        ]
        plotName = '%s_NB_%s' % (plot_prefix, plot_group)
        print(plotName)
        pipeline_dfci.callBatchPlot(atac_dataFile, nb_figure_gff_path, plotName,
                                    plotFolder, plotList, uniform=True,
                                    bed=bed_string, plotType='MULTIPLE',
                                    extension=bam_extension, multiPage=False,
                                    debug=False, nameString='', rpm=True,
                                    rxGenome='')

    #now as metas
    plotList = [
        'BE2C_ATAC_rep1',
        'KELLY_ATAC',
        'NGP_ATAC',
        'SHEP21_ATAC',
    ]
    groupString = 'ATAC,ATAC,ATAC,ATAC'

    plotName = '%s_NB_ATAC_META_RELATIVE' % (plot_prefix)
    pipeline_dfci.callBatchPlot(atac_dataFile, nb_figure_gff_path, plotName,
                                plotFolder, plotList, uniform=False,
                                bed=bed_string, plotType='MERGE',
                                extension=bam_extension, multiPage=False,
                                debug=False, nameString=groupString, rpm=True,
                                rxGenome='')

    plotName = '%s_NB_ATAC_META_UNIFORM' % (plot_prefix)
    pipeline_dfci.callBatchPlot(atac_dataFile, nb_figure_gff_path, plotName,
                                plotFolder, plotList, uniform=True,
                                bed=bed_string, plotType='MERGE',
                                extension=bam_extension, multiPage=False,
                                debug=False, nameString=groupString, rpm=True,
                                rxGenome='')
def plot_be2c_genes(be2c_dataFile, nb_figure_gff_path, bed_string):
    '''
    plots the BE2C tracks of the data table over the regions in
    nb_figure_gff_path: all non-TWIST, non-INPUT datasets with relative
    and with uniform scaling, then BE2C_TWIST on its own

    be2c_dataFile: data table path read by pipeline_dfci.loadDataTable
    nb_figure_gff_path: gff of regions to plot
    bed_string: passed through as the bed argument of callBatchPlot

    Plots go to <genePlotFolder>BE2C/ (genePlotFolder is a module-level
    setting). Returns None.
    '''

    #first establish the plot folder
    plotFolder = utils.formatFolder('%sBE2C/' % (genePlotFolder), True)
    plot_prefix = 'HG19_NB_FIGURE_GENES'

    #we also have to set the extension properly between datasets
    #go by data file
    dataDict = pipeline_dfci.loadDataTable(be2c_dataFile)
    # materialise the keys so they can be indexed on Python 3 as well
    names_list = list(dataDict.keys())
    print(names_list)

    # the extension for the non-TWIST plots comes from the first dataset's
    # bam only; read lengths are not checked for consistency here
    first_bam_path = dataDict[names_list[0]]['bam']
    print(first_bam_path)
    bam = utils.Bam(first_bam_path)
    read_length = bam.getReadLengths()[0]
    bam_extension = 200 - read_length
    print('For datasets in %s using an extension of %s' % (be2c_dataFile, bam_extension))

    #first do individuals except for twist using relative scaling
    plotList = [
        name for name in names_list
        if 'TWIST' not in name and 'INPUT' not in name
    ]

    plotName = '%s_BE2C_RELATIVE' % (plot_prefix)
    print(plotName)
    pipeline_dfci.callBatchPlot(be2c_dataFile, nb_figure_gff_path, plotName,
                                plotFolder, plotList, uniform=False,
                                bed=bed_string, plotType='MULTIPLE',
                                extension=bam_extension, multiPage=False,
                                debug=False, nameString='', rpm=True,
                                rxGenome='')

    plotName = '%s_BE2C_UNIFORM' % (plot_prefix)
    print(plotName)
    pipeline_dfci.callBatchPlot(be2c_dataFile, nb_figure_gff_path, plotName,
                                plotFolder, plotList, uniform=True,
                                bed=bed_string, plotType='MULTIPLE',
                                extension=bam_extension, multiPage=False,
                                debug=False, nameString='', rpm=True,
                                rxGenome='')

    #now for twist
    # twist gets a fixed extension instead of the one derived above
    plotList = ['BE2C_TWIST']
    twist_extension = 125
    plotName = '%s_BE2C_TWIST' % (plot_prefix)
    print(plotName)
    pipeline_dfci.callBatchPlot(be2c_dataFile, nb_figure_gff_path, plotName,
                                plotFolder, plotList, uniform=False,
                                bed=bed_string, plotType='MULTIPLE',
                                extension=twist_extension, multiPage=False,
                                debug=False, nameString='', rpm=True,
                                rxGenome='')
def plot_shep21_chiprx_genes(shep21_chiprx_dataFile, scale_path, nb_figure_gff_path, bed_string):
    '''
    plots all varieties and iterations of tracks for shep21 chiprx data
    with and without spike-in scaling

    For each of MYCN, H3K4ME3, H3K27AC, POL2 and CTCF three plots are
    made: raw (rpm off, no scale factors), rpm (no scale factors) and
    scaled (rpm off, multiplicative scale factors). Two NOSPIKE meta plots
    follow at the end.

    shep21_chiprx_dataFile: data table path read by
        pipeline_dfci.loadDataTable
    scale_path: tab-delimited table with a header row; column 1 is the
        dataset name and column 3 the value whose reciprocal (rounded to
        4 places) is used as the scale factor
    nb_figure_gff_path: gff of regions to plot
    bed_string: passed through as the bed argument of callBatchPlot

    Uses the module-level genePlotFolder and shep21_dataFile settings.
    Returns None.
    '''

    #we want a multiplicative scale factor for the data and to not have rpm on
    scale_table = utils.parseTable(scale_path, '\t')
    scale_dict = {}
    for line in scale_table[1:]:
        scale_dict[line[0]] = line[2]

    #first establish the plot folder
    plotFolder_scaled = utils.formatFolder('%sSHEP21_CHIPRX_SCALED/' % (genePlotFolder), True)
    plotFolder_rpm = utils.formatFolder('%sSHEP21_CHIPRX_RPM_NOSCALE/' % (genePlotFolder), True)
    plotFolder_raw = utils.formatFolder('%sSHEP21_CHIPRX_RAW_NOSCALE/' % (genePlotFolder), True)
    plot_prefix = 'HG19_NB_FIGURE_GENES'

    #we also have to set the extension properly between datasets
    #go by data file
    dataDict = pipeline_dfci.loadDataTable(shep21_chiprx_dataFile)
    # materialise the keys so they can be indexed on Python 3 as well
    names_list = list(dataDict.keys())

    # the extension for every plot comes from the first dataset's bam only;
    # read lengths are not checked for consistency across datasets here
    bam = utils.Bam(dataDict[names_list[0]]['bam'])
    read_length = bam.getReadLengths()[0]
    bam_extension = 200 - read_length
    print('For datasets in %s using an extension of %s' % (shep21_chiprx_dataFile, bam_extension))

    #first do individuals
    for plot_group in ['MYCN', 'H3K4ME3', 'H3K27AC', 'POL2', 'CTCF']:
        plotList = [name for name in names_list if plot_group in name]
        scaleList = [str(round(1 / float(scale_dict[name]), 4)) for name in plotList]
        plot_scale_string = ','.join(scaleList)

        #first raw no scaling
        plotName = '%s_SHEP21_%s_RX_RAW_NOSCALE' % (plot_prefix, plot_group)
        print(plotName)
        pipeline_dfci.callBatchPlot(shep21_chiprx_dataFile, nb_figure_gff_path,
                                    plotName, plotFolder_raw, plotList,
                                    uniform=True, bed=bed_string,
                                    plotType='MULTIPLE', extension=bam_extension,
                                    multiPage=False, debug=False, nameString='',
                                    rpm=False, rxGenome='')

        #then rpm no scaling
        plotName = '%s_SHEP21_%s_RX_RPM_NOSCALE' % (plot_prefix, plot_group)
        print(plotName)
        pipeline_dfci.callBatchPlot(shep21_chiprx_dataFile, nb_figure_gff_path,
                                    plotName, plotFolder_rpm, plotList,
                                    uniform=True, bed=bed_string,
                                    plotType='MULTIPLE', extension=bam_extension,
                                    multiPage=False, debug=False, nameString='',
                                    rpm=True, rxGenome='')

        #next w/ scaling
        plotName = '%s_SHEP21_%s_RX_SCALED' % (plot_prefix, plot_group)
        print(plotName)
        pipeline_dfci.callBatchPlot(shep21_chiprx_dataFile, nb_figure_gff_path,
                                    plotName, plotFolder_scaled, plotList,
                                    uniform=True, bed=bed_string,
                                    plotType='MULTIPLE', extension=bam_extension,
                                    multiPage=False, debug=False, nameString='',
                                    rpm=False, rxGenome='',
                                    scaleFactorString=plot_scale_string)

    #now as metas
    # NOTE(review): the meta plots below read the module-level
    # shep21_dataFile (not shep21_chiprx_dataFile) while reusing the
    # extension derived from the chiprx table - confirm this is intended
    plotList = [
        'SHEP21_0HR_MYCN_NOSPIKE',
        'SHEP21_2HR_MYCN_NOSPIKE',
        'SHEP21_24HR_MYCN_NOSPIKE',
        'SHEP21_0HR_H3K27AC_NOSPIKE',
        'SHEP21_2HR_H3K27AC_NOSPIKE',
        'SHEP21_24HR_H3K27AC_NOSPIKE',
        'SHEP21_0HR_TWIST',
        'SHEP21_2HR_TWIST',
        'SHEP21_24HR_B_TWIST',
        'SHEP21_0HR_POL2_NOSPIKE_R2',
        'SHEP21_2HR_POL2_NOSPIKE',
        'SHEP21_24HR_POL2_NOSPIKE',
    ]
    groupString = 'MYCN,MYCN,MYCN,H3K27AC,H3K27AC,H3K27AC,TWIST,TWIST,TWIST,POL2,POL2,POL2'

    plotName = '%s_SHEP21_NOSPIKE_META_RELATIVE' % (plot_prefix)
    pipeline_dfci.callBatchPlot(shep21_dataFile, nb_figure_gff_path, plotName,
                                plotFolder_rpm, plotList, uniform=False,
                                bed=bed_string, plotType='MERGE',
                                extension=bam_extension, multiPage=False,
                                debug=False, nameString=groupString, rpm=True,
                                rxGenome='')

    plotName = '%s_SHEP21_NOSPIKE_META_UNIFORM' % (plot_prefix)
    pipeline_dfci.callBatchPlot(shep21_dataFile, nb_figure_gff_path, plotName,
                                plotFolder_rpm, plotList, uniform=True,
                                bed=bed_string, plotType='MERGE',
                                extension=bam_extension, multiPage=False,
                                debug=False, nameString=groupString, rpm=True,
                                rxGenome='')
def plot_shep21_genes(nb_figure_gff_path, bed_string):
    '''
    plots all varieties and iterations of tracks for shep21 data:
    individual MYCN, TWIST, H3K27AC and POL2 plots, then merged meta plots
    with relative and with uniform scaling

    nb_figure_gff_path: gff of regions to plot
    bed_string: passed through as the bed argument of callBatchPlot

    Reads the module-level shep21_dataFile and writes plots to
    <genePlotFolder>SHEP21_NOSPIKE/. Returns None.
    '''

    #first establish the plot folder
    plotFolder = utils.formatFolder('%sSHEP21_NOSPIKE/' % (genePlotFolder), True)
    plot_prefix = 'HG19_NB_FIGURE_GENES'

    #we also have to set the extension properly between datasets
    #go by data file
    #for shep21_dataFile
    dataDict = pipeline_dfci.loadDataTable(shep21_dataFile)
    # materialise the keys so they can be indexed on Python 3 as well
    names_list = list(dataDict.keys())

    # the extension for every plot comes from the first dataset's bam only
    bam = utils.Bam(dataDict[names_list[0]]['bam'])
    read_length = bam.getReadLengths()[0]
    bam_extension = 200 - read_length
    print('For datasets in %s using an extension of %s' % (shep21_dataFile, bam_extension))

    #for shep21 we want meta of k27ac, pol2, mycn, and twist
    #individual of k27ac, pol2, mycn, and twist

    #first do individuals
    for plot_group in ['MYCN', 'TWIST', 'H3K27AC', 'POL2']:
        plotList = [name for name in names_list if plot_group in name]
        plotName = '%s_SHEP21_%s_NOSPIKE' % (plot_prefix, plot_group)
        print(plotName)
        pipeline_dfci.callBatchPlot(shep21_dataFile, nb_figure_gff_path, plotName,
                                    plotFolder, plotList, uniform=True,
                                    bed=bed_string, plotType='MULTIPLE',
                                    extension=bam_extension, multiPage=False,
                                    debug=False, nameString='', rpm=True,
                                    rxGenome='')

    #now as metas
    plotList = [
        'SHEP21_0HR_MYCN_NOSPIKE',
        'SHEP21_2HR_MYCN_NOSPIKE',
        'SHEP21_24HR_MYCN_NOSPIKE',
        'SHEP21_0HR_H3K27AC_NOSPIKE',
        'SHEP21_2HR_H3K27AC_NOSPIKE',
        'SHEP21_24HR_H3K27AC_NOSPIKE',
        'SHEP21_0HR_TWIST',
        'SHEP21_2HR_TWIST',
        'SHEP21_24HR_B_TWIST',
        'SHEP21_0HR_POL2_NOSPIKE_R2',
        'SHEP21_2HR_POL2_NOSPIKE',
        'SHEP21_24HR_POL2_NOSPIKE',
    ]
    groupString = 'MYCN,MYCN,MYCN,H3K27AC,H3K27AC,H3K27AC,TWIST,TWIST,TWIST,POL2,POL2,POL2'

    plotName = '%s_SHEP21_NOSPIKE_META_RELATIVE' % (plot_prefix)
    pipeline_dfci.callBatchPlot(shep21_dataFile, nb_figure_gff_path, plotName,
                                plotFolder, plotList, uniform=False,
                                bed=bed_string, plotType='MERGE',
                                extension=bam_extension, multiPage=False,
                                debug=False, nameString=groupString, rpm=True,
                                rxGenome='')

    plotName = '%s_SHEP21_NOSPIKE_META_UNIFORM' % (plot_prefix)
    pipeline_dfci.callBatchPlot(shep21_dataFile, nb_figure_gff_path, plotName,
                                plotFolder, plotList, uniform=True,
                                bed=bed_string, plotType='MERGE',
                                extension=bam_extension, multiPage=False,
                                debug=False, nameString=groupString, rpm=True,
                                rxGenome='')
def main():
    '''
    command line entry point

    Parses the options, resolves the genome-specific fasta directory,
    annotation and TF list, obtains promoter activity (from the -a table
    or by running calculatePromoterActivity), fills refseqToNameDict from
    the annotation table (column 2 -> upper-cased column 13) and then
    runs: createEnhancerLoci, createExpressionDict, findCanidateTFs,
    formatOutput, findValleys (only when no subpeak file was given),
    generateSubpeakFASTA, findMotifs, buildGraph and formatNetworkOutput.

    Prints the help text and exits when a required option is missing or
    the genome build is not supported.
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -e [ENHANCER_FILE] -b [BAM_FILE] -g [GENOME] -o [OUTPUTFOLDER] -n [NAME]"
    parser = OptionParser(usage=usage)

    #required flags
    parser.add_option("-e", "--enhancer_file", dest="enhancers", nargs=1, default=None,
                      help="Provide a ROSE generated enhancer table (_AllEnhancers.table.txt)")
    parser.add_option("-b", "--bam", dest="bam", nargs=1, default=None,
                      help="Provide a bam that corresponds to the super enhancer table")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Provide the build of the genome to be used for the analysis. Currently supports HG19, HG18 and MM9")
    parser.add_option("-o", "--output", dest="output", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Provide a name for the job")

    #additional options
    parser.add_option("-s", "--subpeaks", dest="subpeaks", nargs=1, default=None,
                      help="Enter a BED file of regions to search for motifs")
    parser.add_option("-x", "--expCutoff", dest="expCutoff", nargs=1, default=33,
                      help="Enter the expression cutoff to be used to define canidate TFs")
    parser.add_option("-l", "--extension-length", dest="extension", nargs=1, default=100,
                      help="Enter the length to extend subpeak regions for motif finding")
    parser.add_option("-B", "--background", dest="background", nargs=1, default=None,
                      help="Provide a background BAM file")
    parser.add_option("-a", "--activity", dest="activity", nargs=1, default=None,
                      help="A table with refseq in the first column and activity (expression or promoter acetylation) in second")
    parser.add_option("-E", "--enhancer_number", dest="Enumber", nargs=1, default='super',
                      help="Enter the number of top ranked enhancers to include in the anlaysis. Default is all super-enhancers")
    #the destination of the -N option is different from the destination of the -E option
    parser.add_option("-N", "--number", dest="number", nargs=1, default=2,
                      help="Enter the number of motifs required to assign a binding event")
    parser.add_option("--promoter", dest="promoter", nargs=1, default=False,
                      help="Enter True if the promoters should be included in the analysis")
    parser.add_option("--motifs", dest="motifs", nargs=1, default=False,
                      help="Enter an alternative PWM file for the analysis")
    parser.add_option("-t", "--tfs", dest="tfs", nargs=1, default=None,
                      help="Enter additional TFs (comma separated) to be used in the bindinf analysis")
    parser.add_option("-u", "--ucsc", dest="is_ucsc", action='store_true', default=False,
                      help="If set, use the ucsc folders or files with chromosome names as chr1, chr2, etc.")

    (options, args) = parser.parse_args()

    print(options)

    # the bam is needed by findCanidateTFs on every run, so it is checked
    # together with the other required flags
    required = [options.enhancers, options.bam, options.genome, options.output, options.name]
    if not all(required):
        parser.print_help()
        sys.exit()

    ###
    # Define all global file names
    ###

    annotationsFolder = '/home/rad/users/gaurav/projects/ctrc/scripts/CLL_TFnetworks_2018/annotations/'
    pipelineAnnotationFolder = '/home/rad/users/gaurav/projects/ctrc/scripts/pipeline/annotation/'

    # the motif dictionary is needed whether or not --motifs replaces the
    # PWM database
    motifConvertFile = annotationsFolder + 'MotifDictionary.txt'
    if options.motifs:
        motifDatabaseFile = options.motifs
    else:
        motifDatabaseFile = annotationsFolder + 'VertebratePWMs.txt'

    # User input files
    enhancerFile = options.enhancers
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    bamFile = options.bam
    background = options.background if options.background else None

    genome = options.genome.upper()
    if genome == 'HG19':
        genomeDirectory = '/home/rad/packages/data/fasta/human/hg19/chromosomes/'
        annotationFile = pipelineAnnotationFolder + 'hg19_refseq.ucsc'
        TFfile = annotationsFolder + 'TFlist_NMid_hg19.txt'
    elif genome == 'HG18':
        genomeDirectory = '/grail/genomes/Homo_sapiens/human_gp_mar_06_no_random/fasta/'
        annotationFile = '/ark/home/cl512/src/pipeline/annotation/hg18_refseq.ucsc'
        TFfile = annotationsFolder + 'TFlist_NMid_hg19.txt'
    elif genome == 'MM9':
        genomeDirectory = '/grail/genomes/Mus_musculus/UCSC/mm9/Sequence/Chromosomes/'
        annotationFile = pipelineAnnotationFolder + 'mm9_refseq.ucsc'
        TFfile = annotationsFolder + 'TFlist_NMid_mm9.txt'
    elif genome == 'MM10':
        TFfile = annotationsFolder + 'TFlist_NMid_mm10.txt'
        if options.is_ucsc:
            genomeDirectory = '/home/rad/packages/data/fasta/mouse/mm10/ucsc_chromosomes/'
            annotationFile = pipelineAnnotationFolder + 'ucsc/mm10_refseq.ucsc'
        else:
            genomeDirectory = '/home/rad/packages/data/fasta/mouse/mm10/chromosomes/'
            annotationFile = pipelineAnnotationFolder + 'mm10_refseq.ucsc'
    else:
        # fail with a clear message rather than a NameError further down
        print('ERROR: unsupported genome build %s' % (genome))
        parser.print_help()
        sys.exit(1)

    TFtable = utils.parseTable(TFfile, '\t')
    TFlist = [line[0] for line in TFtable]

    projectFolder = options.output
    projectName = options.name

    subpeakFile = options.subpeaks if options.subpeaks else None

    refseqToNameDict = {}
    expressionFile = options.activity
    if expressionFile:
        expressionTable = utils.parseTable(expressionFile, '\t')
    else:
        expressionTable = calculatePromoterActivity(annotationFile, bamFile, projectName, projectFolder, refseqToNameDict, background)

    expCutoff = int(options.expCutoff)
    constExtension = int(options.extension)

    enhancerNumber = options.Enumber

    # NOTE(review): --promoter arrives as a string, so any non-empty value
    # (including "False") is truthy - confirm how findCanidateTFs reads it
    promoter = options.promoter
    additionalTFs = options.tfs
    # NOTE(review): -N (number) is parsed but none of the calls below
    # receive it - confirm whether that is intended

    annotTable = utils.parseTable(annotationFile, '\t')
    for line in annotTable:
        refseqToNameDict[line[1]] = line[12].upper()

    ###
    # Now run all the functions
    ###

    enhancerLoci = createEnhancerLoci(enhancerTable, enhancerNumber)

    expressedNM, expressionDictNM = createExpressionDict(annotationFile, projectFolder, projectName, refseqToNameDict, expCutoff, expressionFile)

    TFtoEnhancerDict = findCanidateTFs(annotationFile, enhancerLoci, expressedNM, expressionDictNM, bamFile, TFlist, refseqToNameDict, projectFolder, projectName, promoter)

    formatOutput(TFtoEnhancerDict, refseqToNameDict, projectName, projectFolder)

    canidateGenes = [refseqToNameDict[x].upper() for x in TFtoEnhancerDict.keys()]
    if additionalTFs:
        canidateGenes.extend(additionalTFs.split(','))
    canidateGenes = utils.uniquify(canidateGenes)

    print(canidateGenes)

    if subpeakFile is None:
        subpeakFile = findValleys(TFtoEnhancerDict, bamFile, projectName, projectFolder, cutoff=0.2)

    generateSubpeakFASTA(TFtoEnhancerDict, subpeakFile, genomeDirectory, projectName, projectFolder, constExtension)

    findMotifs(canidateGenes, projectFolder, projectName, motifConvertFile, motifDatabaseFile)

    graph = buildGraph(projectFolder, projectName, motifConvertFile, refseqToNameDict, canidateGenes)

    formatNetworkOutput(graph, projectFolder, projectName, canidateGenes)
def mapBamToGFF(bamFile, gff, sense='.', extension=200, rpm=False, clusterGram=None, matrix=None):
    '''
    maps reads from a bam to a gff

    bamFile: path of the bam, passed to the bamliquidator executable
    gff: gff path (parsed with utils.parseTable) or an already parsed
        table; columns 1, 4, 5, 7 and 2 give chrom, start, end, sense, ID
    sense: '+' keeps the region's strand, '-' uses the opposite strand,
        anything else requests both ('.')
    extension: passed to bamliquidator as the last argument
    rpm: when True densities are divided by millions of mapped reads
    clusterGram: bin size in bp; the number of bins follows from the
        region length
    matrix: number of bins per region; the bin size follows from the
        region length. Takes precedence when clusterGram is also given.

    Returns a table (list of rows): a header row per requested mode, then
    one row per gff line of [ID, locus string, density per bin...].
    Regions shorter than the number of bins get 'NA' in every bin.

    Raises ValueError when bamliquidator is found neither at
    /usr/bin/bamliquidator nor at ./bamliquidator, or when a region has to
    be binned and neither clusterGram nor matrix was given.
    '''

    #creating a new gff to output
    newGFF = []

    #reading in the bam
    bam = utils.Bam(bamFile)

    #getting RPM normalization
    if rpm:
        MMR = round(float(bam.getTotalReads('mapped')) / 1000000, 4)
    else:
        MMR = 1

    print('using a MMR value of %s' % (MMR))

    #strand flip for antisense mapping; '.' is treated as '+'
    senseFlip = {'-': '+', '+': '-', '.': '+'}

    #reading in the gff
    if isinstance(gff, str):
        gff = utils.parseTable(gff, '\t')

    bamName = bamFile.split('/')[-1]

    #setting up a clustergram table
    if clusterGram:
        binSize = int(clusterGram)
        binSizeList = []
        #now go through each line of the gff and make sure they're all the same length
        for line in gff:
            gffLocus = utils.Locus(line[0], int(line[3]), int(line[4]), line[6], line[1])
            # floor division keeps the bin count an integer
            binSizeList.append(gffLocus.len() // binSize)
        binSizeList = utils.uniquify(binSizeList)
        if len(binSizeList) > 1:
            print('WARNING: lines in gff are of different length. Output clustergram will have variable row length')
        newGFF.append(['GENE_ID', 'locusLine'] + [
            str(x * binSize) + '_' + bamName
            for x in range(1, max(binSizeList) + 1, 1)
        ])

    #setting up a maxtrix table
    if matrix:
        newGFF.append(['GENE_ID', 'locusLine'] + [
            'bin_' + str(n) + '_' + bamName
            for n in range(1, int(matrix) + 1, 1)
        ])

    # Try to use the bamliquidator executable on the cluster, otherwise
    # fall back to one in the working directory, otherwise fail.
    bamliquidatorString = '/usr/bin/bamliquidator'
    if not os.path.isfile(bamliquidatorString):
        bamliquidatorString = './bamliquidator'
        if not os.path.isfile(bamliquidatorString):
            raise ValueError('bamliquidator not found in path')

    #getting and processing reads for gff lines
    ticker = 0
    print('Number lines processed')
    for line in gff:
        line = line[0:9]
        if ticker % 100 == 0:
            print(ticker)
        ticker += 1
        gffLocus = utils.Locus(line[0], int(line[3]), int(line[4]), line[6], line[1])

        #get the nBin and binSize (matrix wins when both modes are set)
        if matrix:
            nBin = int(matrix)
            binSize = gffLocus.len() // nBin
        elif clusterGram:
            binSize = int(clusterGram)
            nBin = gffLocus.len() // binSize
        else:
            raise ValueError('either clusterGram or matrix must be specified')

        #some regions will be too short to get info on
        if binSize == 0:
            newGFF.append([gffLocus.ID(), str(gffLocus)] + ['NA'] * nBin)
            continue

        #flippy flip if sense is negative
        if sense == '-':
            bamSense = ''.join(senseFlip.get(char, char) for char in gffLocus.sense())
        elif sense == '+':
            bamSense = gffLocus.sense()
        else:
            bamSense = '.'

        #using the bamLiquidator to get the readstring
        # argument list (no shell): paths with spaces or metacharacters are
        # passed through unchanged
        bamCommand = [
            bamliquidatorString, bamFile, str(line[0]),
            str(gffLocus.start()), str(gffLocus.end()),
            bamSense, str(nBin), str(extension),
        ]
        # universal_newlines gives text output on Python 2 and 3 alike
        getReads = subprocess.Popen(bamCommand,
                                    stdin=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    universal_newlines=True)
        readString, stderr = getReads.communicate()
        if stderr:
            print("STDERR out: %s" % (stderr))

        # one density per output line; the piece after the last newline is dropped
        denList = readString.split('\n')[:-1]

        #flip the denList if the actual gff region is -
        if gffLocus.sense() == '-':
            denList = denList[::-1]

        #converting from units of total bp of read sequence per bin to rpm/bp
        denList = [round(float(x) / binSize / MMR, 4) for x in denList]

        newGFF.append([gffLocus.ID(), str(gffLocus)] + denList)

    return newGFF
def main():
    '''
    main run function

    Parses the command line, validates it, computes the millions-of-reads
    factor (MMR) for the test and control bams, writes samtools idxstats
    output for each into the output folder, then builds the fold table and
    the RIGER table and calls RIGER.

    Any invalid or missing argument, and any missing idxstats output, prints
    an error and exits with status 1.
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -t [TEST_BAM] -c [CONTROL_BAM] -g [GENOME]"
    parser = OptionParser(usage=usage)

    #required flags
    parser.add_option("-t", "--test", dest="test", nargs=1, default=None,
                      help="Enter the full path of the test bam")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter the full path of the control bam")
    parser.add_option(
        "-g", "--genome", dest="genome", nargs=1, default=None,
        help="Enter the build for the GeCKO library (currently only supports geckov2)")

    #optional arguments
    parser.add_option("-n", "--name", dest="name", nargs=1, default=0,
                      help="Comma separated test,control name")
    parser.add_option(
        "-s", "--scoring", dest="scoring", nargs=1, default='WtSum',
        help="Scoring method (KSbyScore,WtSum,SecondBestRank) defulat: WtSum")
    parser.add_option(
        "-o", "--output", dest="output", nargs=1, default=None,
        help="Enter the full path of the output folder. Default is the current working directory")

    (options, args) = parser.parse_args()

    #three required parameters to get started
    if not (options.test and options.control and options.genome):
        parser.print_help()
        sys.exit(1)

    #get the names of the datasets
    if options.name:
        nameParts = options.name.split(',')
        if len(nameParts) != 2:
            print(
                "ERROR: Must provide a comma separated test,control name if using -n flag"
            )
            parser.print_help()
            sys.exit(1)
        testName, controlName = nameParts
    else:
        #try to extract names from file
        #strip extension from filename
        testName = options.test.split('/')[-1].split('.')[0]
        controlName = options.control.split('/')[-1].split('.')[0]

    #names
    print("using %s as name for test dataset" % (testName))
    print("using %s as name for control dataset" % (controlName))

    #get the analysis name
    analysisName = '%s_%s' % (testName, controlName)
    print("using %s as analysis name" % (analysisName))

    #get the scoring method
    scoringMethod = options.scoring
    if scoringMethod not in ('KSbyScore', 'WtSum', 'SecondBestRank'):
        print(
            "ERROR: please specify one of the following scoring methods:('KSbyScore','WtSum','SecondBestRank') or leave blank (default WtSum)"
        )
        parser.print_help()
        sys.exit(1)

    #get the right annotation
    #validated before any folder is created or bam is opened so that an
    #unsupported build fails cleanly instead of with a KeyError later on
    genomeDict = {
        'geckov2':
        '/grail/genomes/gecko/GeCKOv2/Annotation/Human_GeCKOv2_Library.txt',
    }
    genomeKey = options.genome.lower()
    if genomeKey not in genomeDict:
        print("ERROR: unsupported genome build %s (supported: %s)" %
              (options.genome, ','.join(sorted(genomeDict))))
        parser.print_help()
        sys.exit(1)

    #set up output folder
    if options.output:
        outputFolder = utils.formatFolder(options.output, True)
    else:
        outputFolder = utils.formatFolder('./%s/' % (analysisName), True)

    print("using %s as an output folder" % (outputFolder))

    #load the annotation dictionary
    annotFile = genomeDict[genomeKey]
    print("using %s as the annotation file" % (annotFile))

    #now set up each bam
    testBam = utils.Bam(options.test)
    controlBam = utils.Bam(options.control)

    #get the MMR for each
    testMMR = round(float(testBam.getTotalReads()) / 1000000, 4)
    controlMMR = round(float(controlBam.getTotalReads()) / 1000000, 4)

    print("Test dataset: %s has an MMR of %s" % (testName, testMMR))
    print("Control dataset: %s has an MMR of %s" % (controlName, controlMMR))

    #now get the idxstats output
    testIdxFile = '%s%s_idxstats.txt' % (outputFolder, testName)
    testIdxCmd = '%s idxstats %s > %s' % (samtoolsString, options.test,
                                          testIdxFile)
    print("Test idxstats command:")
    print(testIdxCmd)
    os.system(testIdxCmd)

    controlIdxFile = '%s%s_idxstats.txt' % (outputFolder, controlName)
    controlIdxCmd = '%s idxstats %s > %s' % (samtoolsString, options.control,
                                             controlIdxFile)
    print("Control idxstats command:")
    print(controlIdxCmd)
    os.system(controlIdxCmd)

    #the fold table is built from both idxstats files, so stop here if
    #either one is missing rather than reporting it as found
    print("Checking for output")
    if not utils.checkOutput(testIdxFile, 0.1, 5):
        print("ERROR: UNABLE TO GENERATE IDX OUTPUT FOR %s" % (options.test))
        sys.exit(1)
    print("Found test IdxStats file")

    if not utils.checkOutput(controlIdxFile, 0.1, 5):
        print("ERROR: UNABLE TO GENERATE IDX OUTPUT FOR %s" %
              (options.control))
        sys.exit(1)
    print("Found control IdxStats file")

    #now make the fold table
    foldTableFile = makeFoldTable(annotFile,
                                  analysisName,
                                  testName,
                                  controlName,
                                  testMMR,
                                  controlMMR,
                                  testIdxFile,
                                  controlIdxFile,
                                  outputFolder,
                                  epsilon=1)

    print('writing output to %s' % (foldTableFile))

    print("MAING FRIGER TABLE")
    rigerTableFile = makeRigerTable(foldTableFile, output='')
    print('writing FRIGER table to %s' % (rigerTableFile))

    callRiger(rigerTableFile, scoring=scoringMethod, output='', callRiger=True)
def plot_mouse_genes(mouse_dataFile, mouse_figure_gff_path):
    '''
    plots all varieties and iterations of tracks @ lifted over mouse regions

    mouse_dataFile: data table path, loaded with pipeline_dfci.loadDataTable
    mouse_figure_gff_path: gff of regions handed to pipeline_dfci.callBatchPlot

    Plots are written via callBatchPlot into a THMYCN/ folder under the
    module level genePlotFolder: one MULTIPLE plot per mark (_MYCN, H3K27AC)
    and two MERGE (meta) plots of a fixed dataset list, one with
    uniform=False and one with uniform=True.

    raises ValueError if the data table contains no datasets
    '''

    #first establish the plot folder
    plotFolder = utils.formatFolder('%sTHMYCN/' % (genePlotFolder), True)
    plot_prefix = 'MM9_NB_FIGURE_GENES_LIFTOVER'

    #we also have to set the extension properly between datasets
    #go by data file
    dataDict = pipeline_dfci.loadDataTable(mouse_dataFile)
    #materialize the keys so they can be indexed on python 3 as well
    names_list = list(dataDict.keys())
    if not names_list:
        raise ValueError('no datasets found in %s' % (mouse_dataFile))

    #the extension is derived from the first dataset only and applied to all
    #of them; this assumes every bam in the data file has the same read length
    bam = utils.Bam(dataDict[names_list[0]]['bam'])
    read_length = bam.getReadLengths()[0]
    bam_extension = 200 - read_length
    print('For datasets in %s using an extension of %s' %
          (mouse_dataFile, bam_extension))

    #first do individuals
    for plot_group in ['_MYCN', 'H3K27AC']:
        plotList = [name for name in names_list if plot_group in name.upper()]
        print(plotList)
        #'_MYCN' already carries its own leading underscore
        if plot_group == '_MYCN':
            plotName = '%s_THMYCN%s' % (plot_prefix, plot_group)
        else:
            plotName = '%s_THMYCN_%s' % (plot_prefix, plot_group)

        print(plotName)
        pipeline_dfci.callBatchPlot(mouse_dataFile,
                                    mouse_figure_gff_path,
                                    plotName,
                                    plotFolder,
                                    plotList,
                                    uniform=True,
                                    bed='',
                                    plotType='MULTIPLE',
                                    extension=bam_extension,
                                    multiPage=False,
                                    debug=False,
                                    nameString='',
                                    rpm=True,
                                    rxGenome='')

    #now as metas
    #we only have 3 good k27ac and 3 good mycn datasets
    plotList = [
        'CG_H3K27Ac',
        'SCG_H3K27Ac',
        'THMYCN1_H3K27Ac',
        'THMYCN_139423_H3K27Ac',
        'THMYCN_139076_H3K27Ac',
        'THMYCN2_MYCN',
        'THMYCN_139076_MYCN',
        'THMYCN_139423_MYCN',
    ]
    #one group label per entry of plotList, in the same order
    groupString = 'CG_,SCG,H3K27AC,H3K27AC,H3K27AC,MYCN,MYCN,MYCN'

    #same meta plot twice: relative scaling first, then uniform scaling
    for meta_suffix, uniform_flag in [('RELATIVE', False), ('UNIFORM', True)]:
        plotName = '%s_THMYCN_META_%s' % (plot_prefix, meta_suffix)
        pipeline_dfci.callBatchPlot(mouse_dataFile,
                                    mouse_figure_gff_path,
                                    plotName,
                                    plotFolder,
                                    plotList,
                                    uniform=uniform_flag,
                                    bed='',
                                    plotType='MERGE',
                                    extension=bam_extension,
                                    multiPage=False,
                                    debug=False,
                                    nameString=groupString,
                                    rpm=True,
                                    rxGenome='')