def createExpressionDict(annotationFile, projectFolder, projectName, refseqToNameDict, expCutoff,expressionFile=''): ''' input: an activity table with refseq in first column and expression or promoter acetylation in second column output: a dictionary keyed by refseq that points to activity ''' print 'CREATING EXPRESSION DICTIONARY' if not expressionFile: expressionFilename = projectFolder + 'bamliquidator/matrix.txt' else: expressionFilename = expressionFile expressionTable = utils.parseTable(expressionFilename, '\t') expressionDictNM = {} expressionDictGene = {} for line in expressionTable[1:]: trid = line[0] geneName = refseqToNameDict[trid] try: exp = float(line[2]) except IndexError: exp = float(line[1]) # Save the expression value of each NMid in a dict, keep higher value if multiple if trid in expressionDictNM and exp > expressionDictNM[trid]: expressionDictNM[trid] = exp elif trid not in expressionDictNM: expressionDictNM[trid] = exp # Save the value of the expression if it's the highest for that gene if geneName in expressionDictGene and exp > expressionDictGene[geneName]: expressionDictGene[geneName] = exp elif geneName not in expressionDictGene: expressionDictGene[geneName] = exp cutoff = numpy.percentile(expressionDictGene.values(), expCutoff) print 'Expression cutoff: ' + str(cutoff) expressedGenes = [] expressedNM = [] for nmid in expressionDictNM: if float(expressionDictNM[nmid]) > cutoff: expressedGenes.append(refseqToNameDict[nmid]) expressedNM.append(nmid) expressedGenes = utils.uniquify(expressedGenes) Genefilename = projectFolder + projectName + '_EXPRESSED_GENES.txt' utils.unParseTable(expressedGenes, Genefilename, '') expressedNM = utils.uniquify(expressedNM) NMfilename = projectFolder + projectName + '_EXPRESSED_NM.txt' utils.unParseTable(expressedNM, NMfilename, '') return expressedNM, expressionDictNM
def getBamChromList(bamFileList): ''' gets the consensus list of chromosomes mapped by the bams ''' #start w/ the first bam cmd = '%s idxstats %s' % (samtoolsPath, bamFileList[0]) idxStats = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) idxStats = idxStats.communicate() finalChromList = [ line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2] ] #now go through each additional bam for bamFile in bamFileList: cmd = '%s idxstats %s' % (samtoolsPath, bamFile) idxStats = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) idxStats = idxStats.communicate() chromList = [ line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2] ] finalChromList = [ chrom for chrom in finalChromList if chromList.count(chrom) != 0 ] return utils.uniquify(finalChromList)
def __init__(self, poll_id: str, poll_title: str, options: List[Any], is_immortal=False, updated_since_start=True): if options is None: options = [] self.poll_id = poll_id self.creation_time = time.time() self.last_update = time.time() self.poll_title = poll_title self.options = uniquify(options) self.reaction_to_option = { reaction_emojies[k]: options[k] for k in range(len(options)) } self.option_to_reaction = { options[k]: reaction_emojies[k] for k in range(len(options)) } self.participants = dict() self.option_to_participants = {key: [] for key in options} self.sent_message = None self.received_message = None self.is_immortal = is_immortal self.is_enabled = True self.updated_since_start = updated_since_start
def mapGFFLineToBed(gffLine, outFolder, nBins, bedCollection, header=''): ''' for every line produces a file with all of the rectangles to draw ''' if len(header) == 0: gffString = '%s_%s_%s_%s' % (gffLine[0], gffLine[6], gffLine[3], gffLine[4]) else: gffString = header diagramTable = [[0, 0, 0, 0]] nameTable = [['', 0, 0]] gffLocus = utils.Locus(gffLine[0], int(gffLine[3]), int(gffLine[4]), gffLine[6], gffLine[1]) scaleFactor = float(nBins) / gffLocus.len() # plotting buffer for diagrams # plotBuffer = int(gffLocus.len() / float(nBins) * 20) # UNUSED (?) overlapLoci = bedCollection.getOverlap(gffLocus, sense='both') print("IDENTIFIED %s OVERLAPPING BED LOCI FOR REGION %s" % (len(overlapLoci), gffLine)) # since beds come from multiple sources, we want to figure out how to offset them offsetDict = {} # this will store each ID name bedNamesList = utils.uniquify([locus.ID() for locus in overlapLoci]) bedNamesList.sort() for i in range(len(bedNamesList)): offsetDict[bedNamesList[ i]] = 2 * i # offsets different categories of bed regions if gffLine[6] == '-': refPoint = int(gffLine[4]) else: refPoint = int(gffLine[3]) # fill out the name table for name in bedNamesList: offset = offsetDict[name] nameTable.append([name, 0, 0.0 - offset]) for bedLocus in overlapLoci: offset = offsetDict[bedLocus.ID()] [start, stop] = [abs(x - refPoint) * scaleFactor for x in bedLocus.coords()] diagramTable.append([start, -0.5 - offset, stop, 0.5 - offset]) utils.unParseTable(diagramTable, outFolder + gffString + '_bedDiagramTemp.txt', '\t') utils.unParseTable(nameTable, outFolder + gffString + '_bedNameTemp.txt', '\t')
def splitRegions(inputGFF,tssCollection): #if even a single coordinate is shared with the +/-1kb splitGFF = [] for line in inputGFF: chrom = line[0] regionID = line[1] lineLocus = utils.Locus(line[0],line[3],line[4],'.') overlappingLoci = tssCollection.getOverlap(lineLocus) if len(overlappingLoci) > 0: #case where a tss Overlap #identify the parts of the line locus that are contained localTSSCollection = utils.LocusCollection(overlappingLoci,50) overlappingCoords = lineLocus.coords() for tssLocus in overlappingLoci: overlappingCoords += tssLocus.coords() overlappingCoords = utils.uniquify(overlappingCoords) overlappingCoords.sort() #you need to hack and slash add 1 to the last coordinate of the overlappingCoords overlappingCoords[-1] +=1 i = 0 regionTicker = 1 while i < (len(overlappingCoords)-1): start = int(overlappingCoords[i]) stop = int(overlappingCoords[(i+1)])-1 if (stop - start) < 50: #this eliminates really tiny regions i+=1 continue splitLocus = utils.Locus(chrom,start+1,stop,'.') if lineLocus.overlaps(splitLocus): #has to be a mycn site newID = '%s_%s' % (regionID,regionTicker) tssStatus = 0 if localTSSCollection.getOverlap(splitLocus): tssStatus = 1 splitGFFLine = [chrom,newID,newID,start,stop,'','.',tssStatus,newID] splitGFF.append(splitGFFLine) regionTicker+=1 i+=1 else: line[7] = 0 splitGFF.append(line) return splitGFF
def get_all_os_versions(): """ Collapse the above list of OS versions for usage/display by the CLI/webapp. """ results = [''] for x in VALID_OS_VERSIONS.keys(): for y in VALID_OS_VERSIONS[x]: results.append(y) results = utils.uniquify(results) results.sort() return results
def findMotifs(subpeakFasta, bg_path, candidate_tf_list, projectFolder, analysis_name, motifConvertFile, motifDatabaseFile): ''' takes the refseq to subpeak seq dict returns the networkx object with all connections ''' fimoFolder = utils.formatFolder(projectFolder + 'FIMO/', True) subpeak_name = subpeakFasta.split('/')[-1].split('.')[0] output = '%s%s_fimo.txt' % (fimoFolder, subpeak_name) # Create a dictionary to call motif names keyed on gene names motifDatabase = utils.parseTable(motifConvertFile, '\t') motifDatabaseDict = {} #create a dict keyed by TF with multiple motifs for line in motifDatabase: motifDatabaseDict[line[1]] = [] for line in motifDatabase: motifDatabaseDict[line[1]].append(line[0]) candidate_tf_list.sort() print(candidate_tf_list) #now make a list of all motifs motif_list = [] for tf in candidate_tf_list: motif_list += motifDatabaseDict[tf] motif_list = utils.uniquify(motif_list) fimo_bash_path = '%s%s_fimo.sh' % (fimoFolder, analysis_name) fimo_bash = open(fimo_bash_path, 'w') fimo_bash.write('#!/usr/bin/bash\n\n') fimoCmd = 'fimo' for motif in motif_list: fimoCmd += ' --motif ' + "'%s'" % (str(motif)) #fimoCmd += ' --thresh 1e-5' #if you want to increase stringency fimoCmd += ' -verbosity 1' # thanks for that ;)! fimoCmd += ' -text' fimoCmd += ' -oc ' + projectFolder + 'FIMO' fimoCmd += ' --bgfile %s' % (bg_path) fimoCmd += ' ' + motifDatabaseFile + ' ' fimoCmd += subpeakFasta fimoCmd += ' > ' + output print fimoCmd fimo_bash.write(fimoCmd) fimo_bash.close() fimoOutput = subprocess.call( fimoCmd, shell=True) #will wait that fimo is done to go on return output
def findMotifs(subpeakFasta,bg_path,candidate_tf_list, projectFolder, analysis_name, motifConvertFile, motifDatabaseFile): ''' takes the refseq to subpeak seq dict returns the networkx object with all connections ''' fimoFolder = utils.formatFolder(projectFolder + 'FIMO/', True) subpeak_name = subpeakFasta.split('/')[-1].split('.')[0] output = '%s%s_fimo.txt' % (fimoFolder,subpeak_name) # Create a dictionary to call motif names keyed on gene names motifDatabase = utils.parseTable(motifConvertFile, '\t') motifDatabaseDict = {} #create a dict keyed by TF with multiple motifs for line in motifDatabase: motifDatabaseDict[line[1]] = [] for line in motifDatabase: motifDatabaseDict[line[1]].append(line[0]) candidate_tf_list.sort() print(candidate_tf_list) #now make a list of all motifs motif_list = [] for tf in candidate_tf_list: motif_list += motifDatabaseDict[tf] motif_list = utils.uniquify(motif_list) fimo_bash_path = '%s%s_fimo.sh' % (fimoFolder,analysis_name) fimo_bash = open(fimo_bash_path,'w') fimo_bash.write('#!/usr/bin/bash\n\n') fimoCmd = 'fimo' for motif in motif_list: fimoCmd += ' --motif ' + "'%s'" % (str(motif)) #fimoCmd += ' --thresh 1e-5' #if you want to increase stringency fimoCmd += ' -verbosity 1' # thanks for that ;)! fimoCmd += ' -text' fimoCmd += ' -oc ' + projectFolder + 'FIMO' fimoCmd += ' --bgfile %s' % (bg_path) fimoCmd += ' ' + motifDatabaseFile + ' ' fimoCmd += subpeakFasta fimoCmd += ' > '+ output print fimoCmd fimo_bash.write(fimoCmd) fimo_bash.close() fimoOutput = subprocess.call(fimoCmd, shell=True) #will wait that fimo is done to go on return output
def checkRefCollection(referenceCollection): ''' makes sure the names of all loci in the reference collection are unique ''' namesList = [locus.ID() for locus in referenceCollection.getLoci()] if len(namesList) != len(utils.uniquify(namesList)): print("ERROR: REGIONS HAVE NON-UNIQUE IDENTIFIERS") sys.exit() else: print("REFERENCE COLLECTION PASSES QC") return
def addLengths(gene_table_path, peak_table_path): ''' add tss and distal lengths to a gene table using the peak table ''' output_path = string.replace(gene_table_path, 'GENE_TABLE', 'GENE_TABLE_LENGTH') print(output_path) tss_dict = defaultdict(int) distal_dict = defaultdict(int) peak_table = utils.parseTable(peak_table_path, '\t') for line in peak_table[1:]: #get the genes gene_list = [] if len(line) == 15: gene_list += line[-1].split(',') gene_list += line[-2].split(',') elif len(line) == 14: gene_list += line[-1].split(',') else: continue gene_list = utils.uniquify( [gene for gene in gene_list if len(gene) > 0]) for gene in gene_list: if int(line[5]) == 1: tss_dict[gene] += int(line[4]) else: distal_dict[gene] += int(line[4]) #now fill out the gene table gene_table = utils.parseTable(gene_table_path, '\t') output_table = [gene_table[0] + ['TSS_LENGTH', 'DISTAL_LENGTH']] for line in gene_table[1:]: gene = line[0] new_line = line + [tss_dict[gene], distal_dict[gene]] output_table.append(new_line) utils.unParseTable(output_table, output_path, '\t') return output_path
def mapGFFLineToBed(gffLine, outFolder, nBins, bedCollection, header=''): ''' for every line produces a file with all of the rectangles to draw ''' if len(header) == 0: gffString = '%s_%s_%s_%s' % (gffLine[0], gffLine[6], gffLine[3], gffLine[4]) else: gffString = header diagramTable = [[0, 0, 0, 0]] nameTable = [['', 0, 0]] gffLocus = utils.Locus(gffLine[0], int(gffLine[3]), int(gffLine[4]), gffLine[6], gffLine[1]) scaleFactor = float(nBins) / gffLocus.len() # plotting buffer for diagrams # plotBuffer = int(gffLocus.len() / float(nBins) * 20) # UNUSED (?) overlapLoci = bedCollection.getOverlap(gffLocus, sense='both') print("IDENTIFIED %s OVERLAPPING BED LOCI FOR REGION %s" % (len(overlapLoci),gffLine)) # since beds come from multiple sources, we want to figure out how to offset them offsetDict = {} # this will store each ID name bedNamesList = utils.uniquify([locus.ID() for locus in overlapLoci]) bedNamesList.sort() for i in range(len(bedNamesList)): offsetDict[bedNamesList[i]] = 2 * i # offsets different categories of bed regions if gffLine[6] == '-': refPoint = int(gffLine[4]) else: refPoint = int(gffLine[3]) # fill out the name table for name in bedNamesList: offset = offsetDict[name] nameTable.append([name, 0, 0.0 - offset]) for bedLocus in overlapLoci: offset = offsetDict[bedLocus.ID()] [start, stop] = [abs(x - refPoint) * scaleFactor for x in bedLocus.coords()] diagramTable.append([start, -0.5 - offset, stop, 0.5 - offset]) utils.unParseTable(diagramTable, outFolder + gffString + '_bedDiagramTemp.txt', '\t') utils.unParseTable(nameTable, outFolder + gffString + '_bedNameTemp.txt', '\t')
def filterGFF(gffFile, chromList): ''' takes in a gff and filters out all lines that don't belong to a chrom in the chromList ''' gff = utils.parseTable(gffFile, '\t') filteredGFF = [] excludeList = [] for line in gff: if chromList.count(line[0]) == 1: filteredGFF.append(line) else: excludeList.append(line[0]) excludeList = utils.uniquify(excludeList) if len(excludeList) > 0: print("EXCLUDED GFF REGIONS FROM THE FALLING CHROMS: %s" % (','.join(excludeList))) return filteredGFF
def filterGFF(gffFile,chromList): ''' takes in a gff and filters out all lines that don't belong to a chrom in the chromList ''' gff = utils.parseTable(gffFile,'\t') filteredGFF = [] excludeList=[] for line in gff: if chromList.count(line[0]) ==1: filteredGFF.append(line) else: excludeList.append(line[0]) excludeList = utils.uniquify(excludeList) if len(excludeList) > 0: print("EXCLUDED GFF REGIONS FROM THE FALLING CHROMS: %s" % (','.join(excludeList))) return filteredGFF
def getBamChromList(bamFileList): ''' gets the consensus list of chromosomes mapped by the bams ''' #start w/ the first bam cmd = '%s idxstats %s' % (samtoolsPath,bamFileList[0]) idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True) idxStats= idxStats.communicate() finalChromList = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]] #now go through each additional bam for bamFile in bamFileList: cmd = '%s idxstats %s' % (samtoolsPath,bamFile) idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True) idxStats= idxStats.communicate() chromList = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]] finalChromList = [chrom for chrom in finalChromList if chromList.count(chrom) != 0] return utils.uniquify(finalChromList)
def make_probe_to_gene_dict(annotFile, array_1_path, array_2_path): ''' keyed by probe ID w/ gene as value ''' #see if it already exists pickle_path = '%soberthuer_outcome/probe_dict.pkl' % (projectFolder) if utils.checkOutput(pickle_path, 0, 0): print('loading previously made probe dict at %s' % (pickle_path)) probe_gene_dict = pickle.load(open(pickle_path, "rb")) return probe_gene_dict #we want to intersect refseq common names w/ the array startDict = utils.makeStartDict(annotFile) ref_name_list = utils.uniquify( [startDict[refID]['name'] for refID in startDict.keys()]) probe_gene_dict = {} array_1 = utils.parseTable(array_1_path, '\t') array_2 = utils.parseTable(array_2_path, '\t') ticker = 0 for line in array_1 + array_2: if len(line) < 5: continue ticker += 1 probe_id = line[4] name = line[-1] # print(probe_id) # print(name) # if ticker== 10: # sys.exit() # print(line) if ref_name_list.count(name) > 0: probe_gene_dict[probe_id] = name pickle.dump(probe_gene_dict, open(pickle_path, 'wb')) return probe_gene_dict
def mapEnhancerToGene(annotFile,enhancerFile,transcribedFile='',uniqueGenes=True,searchWindow =50000,noFormatTable = False): ''' maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq ''' startDict = utils.makeStartDict(annotFile) enhancerTable = utils.parseTable(enhancerFile,'\t') #internal parameter for debugging byRefseq = False if len(transcribedFile) > 0: transcribedTable = utils.parseTable(transcribedFile,'\t') transcribedGenes = [line[1] for line in transcribedTable] else: transcribedGenes = startDict.keys() print('MAKING TRANSCRIPT COLLECTION') transcribedCollection = utils.makeTranscriptCollection(annotFile,0,0,500,transcribedGenes) print('MAKING TSS COLLECTION') tssLoci = [] for geneID in transcribedGenes: tssLoci.append(utils.makeTSSLocus(geneID,startDict,0,0)) #this turns the tssLoci list into a LocusCollection #50 is the internal parameter for LocusCollection and doesn't really matter tssCollection = utils.LocusCollection(tssLoci,50) geneDict = {'overlapping':defaultdict(list),'proximal':defaultdict(list)} #dictionaries to hold ranks and superstatus of gene nearby enhancers rankDict = defaultdict(list) superDict= defaultdict(list) #list of all genes that appear in this analysis overallGeneList = [] if noFormatTable: #set up the output tables #first by enhancer enhancerToGeneTable = [enhancerTable[0]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE']] else: #set up the output tables #first by enhancer enhancerToGeneTable = [enhancerTable[0][0:9]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE'] + enhancerTable[5][-2:]] #next by gene geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS']] #next make the gene to enhancer table geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS','ENHANCER_RANKS','IS_SUPER']] for line in enhancerTable: if line[0][0] =='#' or line[0][0] == 'R': continue enhancerString = '%s:%s-%s' % (line[1],line[2],line[3]) enhancerLocus = utils.Locus(line[1],line[2],line[3],'.',line[0]) #overlapping genes are transcribed genes whose transcript is directly in the stitchedLocus overlappingLoci = transcribedCollection.getOverlap(enhancerLocus,'both') overlappingGenes =[] for overlapLocus in overlappingLoci: overlappingGenes.append(overlapLocus.ID()) #proximalGenes are transcribed genes where the tss is within 50kb of the boundary of the stitched loci proximalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus,searchWindow,searchWindow),'both') proximalGenes =[] for proxLocus in proximalLoci: proximalGenes.append(proxLocus.ID()) distalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus,1000000,1000000),'both') distalGenes =[] for proxLocus in distalLoci: distalGenes.append(proxLocus.ID()) overlappingGenes = utils.uniquify(overlappingGenes) proximalGenes = utils.uniquify(proximalGenes) distalGenes = utils.uniquify(distalGenes) allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes #these checks make sure each gene list is unique. 
#technically it is possible for a gene to be overlapping, but not proximal since the #gene could be longer than the 50kb window, but we'll let that slide here for refID in overlappingGenes: if proximalGenes.count(refID) == 1: proximalGenes.remove(refID) for refID in proximalGenes: if distalGenes.count(refID) == 1: distalGenes.remove(refID) #Now find the closest gene if len(allEnhancerGenes) == 0: closestGene = '' else: #get enhancerCenter enhancerCenter = (int(line[2]) + int(line[3]))/2 #get absolute distance to enhancer center distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes] #get the ID and convert to name closestGene = startDict[allEnhancerGenes[distList.index(min(distList))]]['name'] #NOW WRITE THE ROW FOR THE ENHANCER TABLE if noFormatTable: newEnhancerLine = list(line) newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]),',')) newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]),',')) newEnhancerLine.append(closestGene) else: newEnhancerLine = line[0:9] newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]),',')) newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]),',')) newEnhancerLine.append(closestGene) newEnhancerLine += line[-2:] enhancerToGeneTable.append(newEnhancerLine) #Now grab all overlapping and proximal genes for the gene ordered table overallGeneList +=overlappingGenes for refID in overlappingGenes: geneDict['overlapping'][refID].append(enhancerString) rankDict[refID].append(int(line[-2])) superDict[refID].append(int(line[-1])) overallGeneList+=proximalGenes for refID in proximalGenes: geneDict['proximal'][refID].append(enhancerString) rankDict[refID].append(int(line[-2])) superDict[refID].append(int(line[-1])) #End loop through #Make table by gene overallGeneList = utils.uniquify(overallGeneList) #use enhancer rank to order rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList]) usedNames = [] for i in rankOrder: refID = overallGeneList[i] geneName = startDict[refID]['name'] if usedNames.count(geneName) > 0 and uniqueGenes == True: continue else: usedNames.append(geneName) proxEnhancers = geneDict['overlapping'][refID]+geneDict['proximal'][refID] superStatus = max(superDict[refID]) enhancerRanks = join([str(x) for x in rankDict[refID]],',') newLine = [geneName,refID,join(proxEnhancers,','),enhancerRanks,superStatus] geneToEnhancerTable.append(newLine) #resort enhancerToGeneTable if noFormatTable: return enhancerToGeneTable,geneToEnhancerTable else: enhancerOrder = utils.order([int(line[-2]) for line in enhancerToGeneTable[1:]]) sortedTable = [enhancerToGeneTable[0]] for i in enhancerOrder: sortedTable.append(enhancerToGeneTable[(i+1)]) return sortedTable,geneToEnhancerTable
def _matrix_entry_maps(self): """Set of all mappings used in matrix arguments.""" return list(uniquify(m for arg in self.args if arg._is_mat for m in arg.map))
def _unique_matrix(self): return list(uniquify(a.data for a in self._matrix_args))
def main(): ''' main run call ''' debug = False from optparse import OptionParser usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]" parser = OptionParser(usage=usage) # required flags parser.add_option("-i", "--i", dest="input", nargs=1, default=None, help="Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers") parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None, help="Enter a comma separated list of bams to rank by") parser.add_option("-o", "--out", dest="out", nargs=1, default=None, help="Enter an output folder") parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None, help="Enter the genome build (MM9,MM8,HG18,HG19)") # optional flags parser.add_option("-n", "--name", dest="name", nargs=1, default=None, help="Provide a name for the analysis otherwise ROSE will guess") parser.add_option("-c", "--control", dest="control", nargs=1, default=None, help="Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam") parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='', help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter") parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0, help="Enter a distance from TSS to exclude. 0 = no TSS exclusion") parser.add_option("--mask", dest="mask", nargs=1, default=None, help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions") # RETRIEVING FLAGS (options, args) = parser.parse_args() if not options.input or not options.rankby or not options.out or not options.genome: print('hi there') parser.print_help() exit() # making the out folder if it doesn't exist outFolder = utils.formatFolder(options.out, True) # figuring out folder schema gffFolder = utils.formatFolder(outFolder + 'gff/', True) mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True) # GETTING INPUT FILE(s) inputList = [inputFile for inputFile in options.input.split(',') if len(inputFile) > 1] #converting all input files into GFFs and moving into the GFF folder inputGFFList = [] for inputFile in inputList: if inputFile.split('.')[-1] == 'bed': # CONVERTING A BED TO GFF inputGFFName = inputFile.split('/')[-1][0:-4] #strips the last 4 characters i.e. '.bed' inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName) utils.bedToGFF(inputFile, inputGFFFile) elif options.input.split('.')[-1] == 'gff': # COPY THE INPUT GFF TO THE GFF FOLDER os.system('cp %s %s' % (inputFile, gffFolder)) inputGFFFile = '%s%s' % (gffFolder,inputFile.split('/')[-1]) else: print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT') # COPY THE INPUT GFF TO THE GFF FOLDER os.system('cp %s %s' % (inputFile, gffFolder)) inputGFFFile = '%s%s' % (gffFolder,inputFile.split('/')[-1]) inputGFFList.append(inputGFFFile) # GETTING THE LIST OF BAMFILES TO PROCESS #either same number of bams for rankby and control #or only 1 control #or none! 
#bamlist should be all rankby bams followed by control bams bamFileList = [] if options.control: controlBamList = [bam for bam in options.control.split(',') if len(bam) >0] rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) >0] if len(controlBamList) == len(rankbyBamList): #case where an equal number of backgrounds are given bamFileList = rankbyBamList + controlBamList elif len(controlBamList) == 1: #case where a universal background is applied bamFileList = rankbyBamList + controlBamList*len(rankbyBamList) else: print('ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE') sys.exit() else: bamFileList = [bam for bam in options.rankby.split(',') if len(bam) > 0] # Stitch parameter if options.stitch == '': stitchWindow = '' else: stitchWindow = int(options.stitch) # tss options tssWindow = int(options.tss) if tssWindow != 0: removeTSS = True else: removeTSS = False # GETTING THE GENOME genome = string.upper(options.genome) print('USING %s AS THE GENOME' % (genome)) # GETTING THE CORRECT ANNOT FILE genomeDict = { 'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir), 'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir), 'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir), 'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir), 'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir), 'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir), } try: annotFile = genomeDict[genome.upper()] except KeyError: print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome)) sys.exit() #FINDING THE ANALYSIS NAME if options.name: inputName = options.name else: inputName = inputGFFList[0].split('/')[-1].split('.')[0] print('USING %s AS THE ANALYSIS NAME' % (inputName)) print('FORMATTING INPUT REGIONS') # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs #use a simpler unique region naming system if len(inputGFFList) == 1: inputGFF = utils.parseTable(inputGFFList[0],'\t') else: inputLoci = [] for gffFile in inputGFFList: print('\tprocessing %s' % (gffFile)) gff = utils.parseTable(gffFile,'\t') gffCollection = utils.gffToLocusCollection(gff,50) inputLoci += gffCollection.getLoci() inputCollection = utils.LocusCollection(inputLoci,50) inputCollection = inputCollection.stitchCollection() # stitches to produce unique regions inputGFF = utils.locusCollectionToGFF(inputCollection) formattedGFF = [] #now number things appropriately for i,line in enumerate(inputGFF): #use the coordinates to make a new id inputname_chr_sense_start_stop chrom = line[0] coords = [int(line[3]) ,int(line[4])] sense = line[6] lineID = '%s_%s' % (inputName,str(i+1)) #1 indexing newLine = [chrom,lineID,lineID,min(coords),max(coords),'',sense,'',lineID] formattedGFF.append(newLine) #name of the master input gff file masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder,string.upper(genome),inputName) utils.unParseTable(formattedGFF,masterGFFFile,'\t') print('USING %s AS THE INPUT GFF' % (masterGFFFile)) # MAKING THE START DICT print('MAKING START DICT') startDict = utils.makeStartDict(annotFile) #GET CHROMS FOUND IN THE BAMS print('GETTING CHROMS IN BAMFILES') bamChromList = getBamChromList(bamFileList) print("USING THE FOLLOWING CHROMS") print(bamChromList) #LOADING IN THE GFF AND FILTERING BY CHROM print('LOADING AND FILTERING THE GFF') inputGFF = filterGFF(masterGFFFile,bamChromList) # LOADING IN THE BOUND REGION REFERENCE COLLECTION print('LOADING IN GFF REGIONS') referenceCollection = utils.gffToLocusCollection(inputGFF) print('CHECKING REFERENCE COLLECTION:') 
checkRefCollection(referenceCollection) # MASKING REFERENCE COLLECTION # see if there's a mask if options.mask: maskFile = options.mask # if it's a bed file if maskFile.split('.')[-1].upper() == 'BED': maskGFF = utils.bedToGFF(maskFile) elif maskFile.split('.')[-1].upper() == 'GFF': maskGFF = utils.parseTable(maskFile, '\t') else: print("MASK MUST BE A .gff or .bed FILE") sys.exit() maskCollection = utils.gffToLocusCollection(maskGFF) # now mask the reference loci referenceLoci = referenceCollection.getLoci() filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0] print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile)) referenceCollection = utils.LocusCollection(filteredLoci, 50) # NOW STITCH REGIONS print('STITCHING REGIONS TOGETHER') stitchedCollection, debugOutput, stitchWindow = regionStitching(referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS) # NOW MAKE A STITCHED COLLECTION GFF print('MAKING GFF FROM STITCHED COLLECTION') stitchedGFF = utils.locusCollectionToGFF(stitchedCollection) print(stitchWindow) print(type(stitchWindow)) if not removeTSS: stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, str(stitchWindow / 1000)) stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow / 1000)) debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, str(stitchWindow / 1000)) else: stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (gffFolder, inputName, str(stitchWindow / 1000)) stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, str(stitchWindow / 1000)) debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (gffFolder, inputName, str(stitchWindow / 1000)) # WRITING DEBUG OUTPUT TO DISK if debug: print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile)) utils.unParseTable(debugOutput, debugOutFile, '\t') # WRITE THE GFF TO DISK print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile)) utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t') # SETTING UP THE OVERALL OUTPUT FILE outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt' print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1)) # MAPPING TO THE NON STITCHED (ORIGINAL GFF) # MAPPING TO THE STITCHED GFF # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail. bamFileListUnique = list(bamFileList) bamFileListUnique = utils.uniquify(bamFileListUnique) #prevent redundant mapping print("MAPPING TO THE FOLLOWING BAMS:") print(bamFileListUnique) for bamFile in bamFileListUnique: bamFileName = bamFile.split('/')[-1] # MAPPING TO THE STITCHED GFF mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName) mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, stitchedGFFName, bamFileName) if utils.checkOutput(mappedOut1File, 0.2, 0.2): print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File)) else: cmd1 = bamliquidator_path + " --sense . 
-e 200 --match_bamToGFF -r %s -o %s %s" % (stitchedGFFFile, mappedOut1Folder, bamFile) print(cmd1) os.system(cmd1) if utils.checkOutput(mappedOut1File,0.2,5): print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName)) else: print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName)) sys.exit() print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS') # CALCULATE DENSITY BY REGION # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName) print('FINDING AVERAGE SIGNAL AMONGST BAMS') metaOutputFile = collapseRegionMap(outputFile1,inputName + '_MERGED_SIGNAL',controlBams=options.control) #now try the merging print('CALLING AND PLOTTING SUPER-ENHANCERS') rankbyName = inputName + '_MERGED_SIGNAL' controlName = 'NONE' cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (pipeline_dir,outFolder, metaOutputFile, inputName, controlName) print(cmd) os.system(cmd) # calling the gene mapper print('CALLING GENE MAPPING') superTableFile = "%s_SuperEnhancers.table.txt" % (inputName) #for now don't use ranking bam to call top genes cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, superTableFile) print(cmd) os.system(cmd) stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName) cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, stretchTableFile) print(cmd) os.system(cmd) superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName) cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, superStretchTableFile) os.system(cmd)
def model_prediction(): model_path = 'networks/resnet-18_trained.t7' test_dir = 'Result/' cuda = True log_file = 'evalss_data' eval_rot = True eval_no_crop = True n_tries = 10 seed = 42 output_dir = 'Result' running = True print("loading model", model_path) torch.cuda.manual_seed_all(seed) torch.manual_seed(seed) np.random.seed(seed) log_file = utils.uniquify(os.path.join(output_dir, log_file), sep='-') eval_data = test_dir net = utils.load_model(model_path, cuda) image_array = cv2.imread(os.path.join(test_dir, 'data', 'image.png')) mean_of_image = np.mean(image_array, axis=(0, 1))/1000 std_of_image = np.std(image_array, axis=(0, 1))/100 ### Load the data into PyTorch using dataloader dataloader = utils.get_dataloader(test_dir, [0.6000, 0.3946, 0.6041], [0.2124, 0.2335, 0.2360], eval_no_crop, eval_rot, batch_size=1) #dataloader = utils.get_dataloader(test_dir, mean_of_image, std_of_image, eval_no_crop, eval_rot, batch_size=1) print(type(dataloader)) ''' # A function to get probabililties using only one iteration net = net.eval() for img, label in dataloader: print(type(img)) print(type(label)) img = img.cuda() print('IMAGE TYPE:',img) pred = net(img).data.cpu().numpy() print('checking', pred) probs = nn.functional.softmax(pred) print('PROBABILITIES:', probs) ''' if not running: net.eval() utils.set_strategy(net, 'sample') have_do = utils.set_do_to_train(net) res = utils.predict_proba(dataloader, net, n_classes=5, return_logits=True, ensembles=n_tries, cuda=cuda) print('Result', res) ''' eval_data['test'] = { 'ensemble/proba': res[0], 'ensemble/logits': res[2], 'eval/labels': res[1], 'ensemble/filenames': res[3] } ''' else: net.eval() utils.set_strategy(net, 'running') have_do = utils.set_do_to_train(net) res = utils.predict_proba(dataloader, net, n_classes=5, return_logits=True, ensembles=n_tries if have_do else 3) print('type(eval_data):', type(eval_data)) ''' eval_data['test'].update({ 'eval/proba': res[0], 'eval/logits': res[2], 'eval/labels': res[1], 'ensemble/filenames': res[3] }) ''' # Get the mean of predictions for n_tries iterations for each class prob_means_en = np.mean(res[0], axis=0) output_file_name = 'res_norotate' torch.save(res, output_dir + '/' + output_file_name) #print(res[2].shape) print('Created output file \'', output_file_name, ' \' ') torch.cuda.empty_cache() return (prob_means_en)
def finishRankOutput(dataFile, rankOutput, genome, mergeFolder, mergeName, name1, name2, cutOff=1.5, window=100000, superOnly=True, plotBam=True): ''' cleans up the rank output table makes a gff of all of the gained/lost supers beyond a certain cutoff w/ a window makes a list of gained genes and lost genes makes a bed of gained loss ''' dataDict = pipeline_dfci.loadDataTable(dataFile) #making sure window and cutoff are int/float cutOff = float(cutOff) window = int(window) genome = string.upper(genome) #make the output folder outputFolder = pipeline_dfci.formatFolder(mergeFolder + 'output/', True) #bring in the old rank table rankEnhancerTable = utils.parseTable(rankOutput, '\t') #make a new formatted table header = rankEnhancerTable[0] header[-4] = 'DELTA RANK' header[-3] = 'IS_SUPER' formattedRankTable = [header] #the gffs gainedGFF = [] lostGFF = [] gainedWindowGFF = [] lostWindowGFF = [] if superOnly: enhancerType = 'SUPERS' else: enhancerType = 'ENHANCERS' #the beds if superOnly: gainedTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % ( genome, name2, genome, name2, name1) gainedBed = [[gainedTrackHeader]] conservedTrackHeader = 'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % ( genome, name1, name2, genome, name1, name2) conservedBed = [[conservedTrackHeader]] lostTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % ( genome, name1, genome, name1, name2) lostBed = [[lostTrackHeader]] else: gainedTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % ( genome, name2, genome, name2, name1) gainedBed = [[gainedTrackHeader]] conservedTrackHeader = 'track name="%s %s and %s enhancers" description="%s enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % ( genome, name1, name2, genome, name1, name2) conservedBed = [[conservedTrackHeader]] lostTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % ( genome, name1, genome, name1, name2) lostBed = [[lostTrackHeader]] #the genes geneTable = [[ 'GENE', 'ENHANCER_ID', 'ENHANCER_CHROM', 'ENHANCER_START', 'ENHANCER_STOP', header[6], header[7], header[8], 'STATUS' ]] for line in rankEnhancerTable[1:]: #fixing the enhancer ID line[0] = line[0].replace('_lociStitched', '') formattedRankTable.append(line) #getting the genes geneList = [] geneList += line[9].split(',') geneList += line[10].split(',') geneList += line[11].split(',') geneList = [x for x in geneList if len(x) > 0] geneList = utils.uniquify(geneList) geneString = string.join(geneList, ',') bedLine = [line[1], line[2], line[3], line[0], line[-4]] #for gained if float(line[6]) > cutOff: gffLine = [ line[1], line[0], '', line[2], line[3], '', '.', '', geneString ] gffWindowLine = [ line[1], line[0], '', int(line[2]) - window, int(line[3]) + window, '', '.', '', geneString ] gainedGFF.append(gffLine) gainedWindowGFF.append(gffWindowLine) geneStatus = name2 gainedBed.append(bedLine) #for lost elif float(line[6]) < (-1 * cutOff): gffLine = [ line[1], line[0], '', line[2], line[3], '', '.', '', geneString ] gffWindowLine = [ line[1], line[0], '', int(line[2]) - window, int(line[3]) + window, '', '.', '', geneString ] lostGFF.append(gffLine) lostWindowGFF.append(gffWindowLine) geneStatus 
= name1 lostBed.append(bedLine) #for conserved else: geneStatus = 'CONSERVED' conservedBed.append(bedLine) #now fill in the gene Table for gene in geneList: geneTableLine = [ gene, line[0], line[1], line[2], line[3], line[6], line[7], line[8], geneStatus ] geneTable.append(geneTableLine) #concat the bed fullBed = gainedBed + conservedBed + lostBed #start writing the output #there's the two gffs, the bed,the formatted table, the gene table #formatted table formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % ( outputFolder, genome, mergeName, enhancerType) utils.unParseTable(formattedRankTable, formattedFilename, '\t') #gffs gffFolder = pipeline_dfci.formatFolder(outputFolder + 'gff/', True) gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % ( gffFolder, genome, mergeName, string.upper(name2), enhancerType) gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % ( gffFolder, genome, mergeName, string.upper(name2), enhancerType, window / 1000, window / 1000) gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % ( gffFolder, genome, mergeName, string.upper(name1), enhancerType) gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % ( gffFolder, genome, mergeName, string.upper(name1), enhancerType, window / 1000, window / 1000) utils.unParseTable(gainedGFF, gffFilename_gained, '\t') utils.unParseTable(gainedWindowGFF, gffFilenameWindow_gained, '\t') utils.unParseTable(lostGFF, gffFilename_lost, '\t') utils.unParseTable(lostWindowGFF, gffFilenameWindow_lost, '\t') #bed bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder, genome, mergeName, enhancerType) utils.unParseTable(fullBed, bedFilename, '\t') #geneTable geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % ( outputFolder, genome, mergeName, enhancerType) utils.unParseTable(geneTable, geneFilename, '\t') #finally, move all of the plots to the output folder cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_%s_DELTA.pdf" % ( mergeFolder, name1, outputFolder, genome, mergeName, enhancerType) os.system(cmd) cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_%s_RANK_PLOT.png" % ( mergeFolder, name1, outputFolder, genome, mergeName, enhancerType) os.system(cmd) #now execute the bamPlot_turbo.py commands if plotBam: bam1 = dataDict[name1]['bam'] bam2 = dataDict[name2]['bam'] bamString = "%s,%s" % (bam1, bam2) nameString = "%s,%s" % (name1, name2) colorString = "0,0,0:100,100,100" #change dir os.chdir(pipelineDir) if len(gainedGFF) > 0: #gained command plotTitle = "%s_ONLY_SE" % (name2) cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % ( genome, bamString, gffFilename_gained, outputFolder, nameString, colorString, plotTitle) os.system(cmd) #gained window command plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name2, window / 1000) cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % ( genome, bamString, gffFilenameWindow_gained, outputFolder, nameString, colorString, plotTitle) os.system(cmd) if len(lostGFF) > 0: #lost command plotTitle = "%s_ONLY_SE" % (name1) cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % ( genome, bamString, gffFilename_lost, outputFolder, nameString, colorString, plotTitle) os.system(cmd) #lost command plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name1, window / 1000) cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % ( genome, bamString, gffFilenameWindow_lost, outputFolder, nameString, colorString, plotTitle) 
os.system(cmd) return
def createExpressionDict(annotationFile, projectFolder, projectName, refseqToNameDict,expressionTable): ''' takes as input an activity table with refseq NMID in first column and expression or promoter acetylation level in a second column output a dictionary keyed by refseq containing activity ''' print 'CREATING EXPRESSION DICTIONARY' annotTable = utils.parseTable(annotationFile, '\t') for line in annotTable: gid = line[1] genename = upper(line[12]) refseqToNameDict[gid] = genename expresionFilename = projectFolder + 'matrix.gff' expressionTable = utils.parseTable(expresionFilename, '\t') expressionDictNM = {} expressionDictGene = {} for line in expressionTable[1:]: trid = line[0] geneName = refseqToNameDict[trid] if len(expressionTable[1]) == 3: #when expressionTable is an output from bamToGFF.py exp = float(line[2]) else: #when expressionTable is passed as an option (2 columns) exp = float(line[1]) # Store the expression value for each NMid in a dict, keep higher value if multiple identical NMIDs if trid in expressionDictNM and exp > expressionDictNM[trid]: expressionDictNM[trid] = exp elif trid not in expressionDictNM: expressionDictNM[trid] = exp # Store the highest value of transcript expression for each gene if geneName in expressionDictGene and exp > expressionDictGene[geneName]: expressionDictGene[geneName] = exp elif geneName not in expressionDictGene: expressionDictGene[geneName] = exp # Calculate the cutoff H3K27ac signal value to consider top 2/3 of genes expressed # or the percentile of genes considered expressed passed in option cutoff = numpy.percentile(expressionDictGene.values(), 33) print 'Expression cutoff: ' + str(cutoff) # Select all NMids that are above the computed cutoff expressedGenes = [] expressedNM = [] for trid in expressionDictNM: if float(expressionDictNM[trid]) >= cutoff: expressedGenes.append(refseqToNameDict[trid]) expressedNM.append(trid) expressedGenes = utils.uniquify(expressedGenes) # Output the list of transcripts considered expressed NMfilename = projectFolder + projectName + '_EXPRESSED_TRANSCRIPTS.txt' # Output the list of genes considered expressed Genefilename = projectFolder + projectName + '_EXPRESSED_GENES.txt' utils.unParseTable(expressedNM, NMfilename, '') utils.unParseTable(expressedGenes, Genefilename, '') return expressedNM
def mapBamToGFF(bamFile,gff,sense = '.',extension = 200,rpm = False,clusterGram = None,matrix = None): '''maps reads from a bam to a gff''' #creating a new gff to output newGFF = [] #reading in the bam bam = utils.Bam(bamFile) #getting RPM normalization if rpm: MMR= round(float(bam.getTotalReads('mapped'))/1000000,4) else: MMR = 1 print('using a MMR value of %s' % (MMR)) #creating a sense trans senseTrans = string.maketrans('-+.','+-+') #reading in the gff if type(gff) == str: gff = utils.parseTable(gff,'\t') #setting up a clustergram table if clusterGram: binSize = int(clusterGram) binSizeList = [] #now go through each line of the gff and make sure they're all the same length for i in range(0,len(gff),1): line = gff[i] gffLocus = utils.Locus(line[0],int(line[3]),int(line[4]),line[6],line[1]) binSizeList.append(gffLocus.len()/binSize) binSizeList = utils.uniquify(binSizeList) if len(binSizeList) > 1: print('WARNING: lines in gff are of different length. Output clustergram will have variable row length') newGFF.append(['GENE_ID','locusLine'] + [str(x*binSize)+'_'+bamFile.split('/')[-1] for x in range(1,max(binSizeList)+1,1)]) #setting up a maxtrix table if matrix: newGFF.append(['GENE_ID','locusLine'] + ['bin_'+str(n)+'_'+bamFile.split('/')[-1] for n in range(1,int(matrix)+1,1)]) nBin = int(matrix) # Try to use the bamliquidatior script on cluster, otherwise, failover to local (in path), otherwise fail. bamliquidatorString = '/opt/bamliquidator/pipeline/bamliquidator_internal/bamliquidator' if not os.path.isfile(bamliquidatorString): bamliquidatorString = './bamliquidator' if not os.path.isfile(bamliquidatorString): raise ValueError('bamliquidator not found in path') #getting and processing reads for gff lines ticker = 0 print('Number lines processed') for line in gff: line = line[0:9] if ticker%100 == 0: print(ticker) ticker+=1 gffLocus = utils.Locus(line[0],int(line[3]),int(line[4]),line[6],line[1]) #get the nBin and binSize if clusterGram: nBin =gffLocus.len()/int(clusterGram) binSize = int(clusterGram) if matrix: nBin = int(matrix) binSize = gffLocus.len()/nBin #some regions will be too short to get info on if binSize == 0: clusterLine = [gffLocus.ID(),gffLocus.__str__()] + ['NA']*nBin newGFF.append(clusterLine) continue #flippy flip if sense is negative if sense == '-': bamSense = string.translate(gffLocus.sense(),senseTrans) elif sense == '+': bamSense = gffLocus.sense() else: bamSense = '.' #using the bamLiquidator to get the readstring #print('using nBin of %s' % nBin) bamCommand = "%s %s %s %s %s %s %s %s" % (bamliquidatorString,bamFile,line[0],gffLocus.start(),gffLocus.end(),bamSense,nBin,extension) #print(bamCommand) getReads = subprocess.Popen(bamCommand,stdin = subprocess.PIPE,stderr = subprocess.PIPE,stdout = subprocess.PIPE,shell = True) readString, stderr = getReads.communicate() if stderr: print("STDERR out: %s" % (stderr)) denList = readString.split('\n')[:-1] #print("denlist is: %s" % denList) #flip the denList if the actual gff region is - if gffLocus.sense() == '-': denList = denList[::-1] #converting from units of total bp of read sequence per bin to rpm/bp denList = [round(float(x)/binSize/MMR,4) for x in denList] #if the gff region is - strand, flip the clusterLine = [gffLocus.ID(),gffLocus.__str__()] + denList newGFF.append(clusterLine) return newGFF
def main(): ''' main run call ''' debug = False from optparse import OptionParser usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]" parser = OptionParser(usage=usage) # required flags parser.add_option("-i", "--i", dest="input", nargs=1, default=None, help="Enter a .gff or .bed file of binding sites used to make enhancers") parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None, help="bamfile to rank enhancer by") parser.add_option("-o", "--out", dest="out", nargs=1, default=None, help="Enter an output folder") parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None, help="Enter the genome build (MM9,MM8,HG18,HG19)") # optional flags parser.add_option("-b", "--bams", dest="bams", nargs=1, default=None, help="Enter a comma separated list of additional bam files to map to") parser.add_option("-c", "--control", dest="control", nargs=1, default=None, help="bamfile to rank enhancer by") parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='', help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter") parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0, help="Enter a distance from TSS to exclude. 0 = no TSS exclusion") parser.add_option("--mask", dest="mask", nargs=1, default=None, help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions") # RETRIEVING FLAGS (options, args) = parser.parse_args() if not options.input or not options.rankby or not options.out or not options.genome: print('hi there') parser.print_help() exit() # making the out folder if it doesn't exist outFolder = utils.formatFolder(options.out, True) # figuring out folder schema gffFolder = utils.formatFolder(outFolder + 'gff/', True) mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True) # GETTING INPUT FILE if options.input.split('.')[-1] == 'bed': # CONVERTING A BED TO GFF inputGFFName = options.input.split('/')[-1][0:-4] inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName) utils.bedToGFF(options.input, inputGFFFile) elif options.input.split('.')[-1] == 'gff': # COPY THE INPUT GFF TO THE GFF FOLDER inputGFFFile = options.input os.system('cp %s %s' % (inputGFFFile, gffFolder)) else: print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. 
ASSUMING .gff FILE FORMAT') # COPY THE INPUT GFF TO THE GFF FOLDER inputGFFFile = options.input os.system('cp %s %s' % (inputGFFFile, gffFolder)) # GETTING THE LIST OF BAMFILES TO PROCESS if options.control: bamFileList = [options.rankby, options.control] else: bamFileList = [options.rankby] if options.bams: bamFileList += options.bams.split(',') bamFileList = utils.uniquify(bamFileList) # optional args # Stitch parameter if options.stitch == '': stitchWindow = '' else: stitchWindow = int(options.stitch) # tss options tssWindow = int(options.tss) if tssWindow != 0: removeTSS = True else: removeTSS = False # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS print('USING %s AS THE INPUT GFF' % (inputGFFFile)) inputName = inputGFFFile.split('/')[-1].split('.')[0] # GETTING THE GENOME genome = options.genome print('USING %s AS THE GENOME' % genome) # GETTING THE CORRECT ANNOT FILE cwd = os.getcwd() genomeDict = { 'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd), 'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd), 'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd), 'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd), 'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd), } annotFile = genomeDict[genome.upper()] # MAKING THE START DICT print('MAKING START DICT') startDict = utils.makeStartDict(annotFile) # LOADING IN THE BOUND REGION REFERENCE COLLECTION print('LOADING IN GFF REGIONS') referenceCollection = utils.gffToLocusCollection(inputGFFFile) # MASKING REFERENCE COLLECTION # see if there's a mask if options.mask: maskFile = options.mask # if it's a bed file if maskFile.split('.')[-1].upper() == 'BED': maskGFF = utils.bedToGFF(maskFile) elif maskFile.split('.')[-1].upper() == 'GFF': maskGFF = utils.parseTable(maskFile, '\t') else: print("MASK MUST BE A .gff or .bed FILE") sys.exit() maskCollection = utils.gffToLocusCollection(maskGFF) # now mask the reference loci referenceLoci = referenceCollection.getLoci() filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0] print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile)) referenceCollection = utils.LocusCollection(filteredLoci, 50) # NOW STITCH REGIONS print('STITCHING REGIONS TOGETHER') stitchedCollection, debugOutput, stitchWindow = regionStitching(inputGFFFile, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS) # NOW MAKE A STITCHED COLLECTION GFF print('MAKING GFF FROM STITCHED COLLECTION') stitchedGFF = utils.locusCollectionToGFF(stitchedCollection) # making sure start/stop ordering are correct for i in range(len(stitchedGFF)): line = stitchedGFF[i] start = int(line[3]) stop = int(line[4]) if start > stop: line[3] = stop line[4] = start print(stitchWindow) print(type(stitchWindow)) if not removeTSS: stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, str(stitchWindow / 1000)) stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow / 1000)) debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, str(stitchWindow / 1000)) else: stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (gffFolder, inputName, str(stitchWindow / 1000)) stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, str(stitchWindow / 1000)) debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (gffFolder, inputName, str(stitchWindow / 1000)) # WRITING DEBUG OUTPUT TO DISK if debug: print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile)) utils.unParseTable(debugOutput, debugOutFile, '\t') # WRITE THE GFF TO DISK 
print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile)) utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t') # SETTING UP THE OVERALL OUTPUT FILE outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt' print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1)) # MAPPING TO THE NON STITCHED (ORIGINAL GFF) # MAPPING TO THE STITCHED GFF # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail. bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py' if not os.path.isfile(bamliquidator_path): bamliquidator_path = 'bamliquidator_batch.py' if not os.path.isfile(bamliquidator_path): raise ValueError('bamliquidator_batch.py not found in path') for bamFile in bamFileList: bamFileName = bamFile.split('/')[-1] # MAPPING TO THE STITCHED GFF mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName) mappedOut1File = '%s%s_%s_MAPPED/matrix.gff' % (mappedFolder, stitchedGFFName, bamFileName) if utils.checkOutput(mappedOut1File, 0.2, 0.2): print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File)) else: cmd1 = "python " + bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (stitchedGFFFile, mappedOut1Folder, bamFile) print(cmd1) output1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE, shell=True) output1 = output1.communicate() if len(output1[0]) > 0: # test if mapping worked correctly print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName)) else: print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName)) sys.exit() # MAPPING TO THE ORIGINAL GFF mappedOut2Folder = '%s%s_%s_MAPPED' % (mappedFolder, inputName, bamFileName) mappedOut2File = '%s%s_%s_MAPPED/matrix.gff' % (mappedFolder, inputName, bamFileName) if utils.checkOutput(mappedOut2File, 0.2, 0.2): print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut2File)) else: cmd2 = "python " + bamliquidator_path + " --sense . 
-e 200 --match_bamToGFF -r %s -o %s %s" % (inputGFFFile, mappedOut2Folder, bamFile) print(cmd2) output2 = subprocess.Popen(cmd2, stdout=subprocess.PIPE, shell=True) output2 = output2.communicate() if len(output2[0]) > 0: # test if mapping worked correctly print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (inputGFFFile, bamFileName)) else: print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (inputGFFFile, bamFileName)) sys.exit() print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS') # CALCULATE DENSITY BY REGION # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName) print('CALLING AND PLOTTING SUPER-ENHANCERS') if options.control: rankbyName = options.rankby.split('/')[-1] controlName = options.control.split('/')[-1] cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (outFolder, outputFile1, inputName, controlName) else: rankbyName = options.rankby.split('/')[-1] controlName = 'NONE' cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (outFolder, outputFile1, inputName, controlName) print(cmd) os.system(cmd) # calling the gene mapper time.sleep(20) superTableFile = "%s_SuperEnhancers.table.txt" % (inputName) if options.control: cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, superTableFile) else: cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, superTableFile) os.system(cmd) stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName) if options.control: cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, stretchTableFile) else: cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, stretchTableFile) os.system(cmd) superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName) if options.control: cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, superStretchTableFile) else: cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, superStretchTableFile) os.system(cmd)
def make_mycn_stats_table(nb_all_chip_dataFile, outFile):
    '''
    making a table of conserved mycn peaks w/ some additional stats
    mycn and h3k27ac signal is avg. background normalized across 4 samples
    active tss defined as the union of all H3K27ac occupied promoters in NB
    active enhancers defined as the union of all H3K27ac sites outside of promoters
    '''
    dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)

    print('SETTING UP OUTPUT TABLE')
    outTable = [['PEAK_ID','CHROM','START','STOP','LENGTH','ACTIVE_TSS_OVERLAP','ENHANCER_OVERLAP','CPG_ISLAND_OVERLAP','CPG_ISLAND_FRACTION','GC_FREQ','MYCN_RANK','AVG_MYCN_SIGNAL','AVG_H3K27AC_SIGNAL','CANON_EBOX_COUNT','NONCANON_EBOX_COUNT','TOTAL_EBOX_COUNT','CANON_EXP','NON_CANON_EXP','GABPA_COUNT','GABPA_EXP','GATA_COUNT','GATA_EXP']]

    dinuc = nmers(2, ['A','T','G','C'])

    #input files
    mycnSignalFile = '%sHG19_NB_MYCN_CONSERVED_-0_+0_NB_ALL_SIGNAL.txt' % (signalFolder)
    h3k27acSignalFile = '%sHG19_NB_MYCN_CONSERVED_-500_+500_NB_ALL_SIGNAL.txt' % (signalFolder)
    mycnRankFile = '%smeta_rose/NB_MYCN/NB_MYCN_0KB_STITCHED_ENHANCER_REGION_RANK_CONSERVED.txt' % (projectFolder)
    activeGeneFile = '%sHG19_NB_H3K27AC_ACTIVE_UNION.txt' % (geneListFolder)
    #note, this is the ucsc hg19 cpg islands extended file
    #to download and format run ./beds/download_cpg.sh
    cpgFile = '%sbeds/hg19_cpg_islands.bed' % (projectFolder)
    enhancerFile = '%smeta_rose/NB_H3K27AC/NB_H3K27AC_AllEnhancers.table.txt' % (projectFolder)

    print('LOADING MYCN BINDING DATA')
    mycnSignalTable = utils.parseTable(mycnSignalFile, '\t')

    #making a signal dictionary for MYCN binding
    names_list = ['BE2C_MYCN','KELLY_MYCN','NGP_MYCN','SHEP21_0HR_MYCN_NOSPIKE']
    background_list = [dataDict[name]['background'] for name in names_list]
    header = mycnSignalTable[0]
    chip_columns = [header.index(name) for name in names_list]
    background_columns = [header.index(background_name) for background_name in background_list]

    mycn_sig_dict = {}
    #this only works if the first column contains unique identifiers
    if len(mycnSignalTable) != len(utils.uniquify([line[0] for line in mycnSignalTable])):
        print('Error: Column 1 of %s must contain unique identifiers.' % (mycnSignalFile))
        sys.exit()
    for line in mycnSignalTable[1:]:
        line_sig = []
        for i in range(len(names_list)):
            line_sig.append(float(line[chip_columns[i]]) - float(line[background_columns[i]]))
        mycn_sig_dict[line[0]] = numpy.mean(line_sig)

    print('LOADING MYCN RANK DATA')
    mycnRankTable = utils.parseTable(mycnRankFile, '\t')

    print('LOADING H3K27AC BINDING DATA')
    h3k27acSignalTable = utils.parseTable(h3k27acSignalFile, '\t')

    #making a signal dictionary for background subtracted H3K27ac binding
    names_list = ['BE2C_H3K27AC','KELLY_H3K27AC','NGP_H3K27AC','SHEP21_0HR_H3K27AC_NOSPIKE']
    background_list = [dataDict[name]['background'] for name in names_list]
    header = h3k27acSignalTable[0]
    chip_columns = [header.index(name) for name in names_list]
    background_columns = [header.index(background_name) for background_name in background_list]

    h3k27ac_sig_dict = {}
    #this only works if the first column contains unique identifiers
    if len(h3k27acSignalTable) != len(utils.uniquify([line[0] for line in h3k27acSignalTable])):
        print('Error: Column 1 of %s must contain unique identifiers.' % (h3k27acSignalFile))
        sys.exit()
    for line in h3k27acSignalTable[1:]:
        line_sig = []
        for i in range(len(names_list)):
            line_sig.append(float(line[chip_columns[i]]) - float(line[background_columns[i]]))
        h3k27ac_sig_dict[line[0]] = numpy.mean(line_sig)

    #making the cpg collection
    print('LOADING CPG ISLANDS')
    cpgBed = utils.parseTable(cpgFile, '\t')
    cpgLoci = []
    for line in cpgBed:
        cpgLoci.append(utils.Locus(line[0], line[1], line[2], '.', line[-1]))
    cpgCollection = utils.LocusCollection(cpgLoci, 50)

    #next make the tss collection of active promoters
    print('LOADING ACTIVE PROMOTERS')
    startDict = utils.makeStartDict(annotFile)
    activeTable = utils.parseTable(activeGeneFile, '\t')
    tss_1kb_loci = []
    for line in activeTable:
        tss_1kb_loci.append(utils.makeTSSLocus(line[1], startDict, 1000, 1000))
    tss_1kb_collection = utils.LocusCollection(tss_1kb_loci, 50)

    #enhancer file
    print("LOADING ACTIVE ENHANCERS")
    enhancerTable = utils.parseTable(enhancerFile, '\t')
    print('STARTING WITH THE FOLLOWING NUMBER OF ENHANCERS IN NB')
    print(len(enhancerTable) - 6)
    enhancerLoci = []
    for line in enhancerTable:
        if line[0][0] != '#' and line[0][0] != 'R':
            try:
                lineLocus = utils.Locus(line[1], int(line[2]), int(line[3]), '.', line[0])
                enhancerLoci.append(lineLocus)
            except IndexError:
                print(line)
                sys.exit()
    enhancerCollection = utils.LocusCollection(enhancerLoci, 50)

    print('CLASSIFYING MYCN PEAKS')
    ticker = 0
    for i in range(1, len(mycnSignalTable)):
        if ticker % 100 == 0:
            print(ticker)
        ticker += 1

        line = mycnSignalTable[i]

        mycn_signal = round(mycn_sig_dict[line[0]], 4)
        h3k27ac_signal = round(h3k27ac_sig_dict[line[0]], 4)

        peakID = line[0]
        locusString = line[1]
        chrom = locusString.split('(')[0]
        [start, stop] = [int(x) for x in line[1].split(':')[-1].split('-')]
        lineLocus = utils.Locus(chrom, start, stop, '.', peakID)

        tssOverlap = 0
        if tss_1kb_collection.getOverlap(lineLocus, 'both'):
            tssOverlap = 1

        enhancerOverlap = 0
        if enhancerCollection.getOverlap(lineLocus, 'both') and tssOverlap == 0:
            enhancerOverlap = 1

        cpgIslandOverlap = 0
        if cpgCollection.getOverlap(lineLocus, 'both'):
            cpgIslandOverlap = 1

        #now do fractional cpgOverlap
        overlappingCpGLoci = cpgCollection.getOverlap(lineLocus, 'both')
        overlappingBases = 0
        for locus in overlappingCpGLoci:
            cpgStart = max(locus.start(), lineLocus.start())
            cpgEnd = min(locus.end(), lineLocus.end())
            overlappingBases += (cpgEnd - cpgStart)
        overlapFraction = round(float(overlappingBases) / lineLocus.len(), 2)

        #now get the seq
        lineSeq = string.upper(utils.fetchSeq(genomeDirectory, chrom, start, stop, True))
        gcFreq = round(float(lineSeq.count('GC') + lineSeq.count('CG')) / len(lineSeq), 2)

        dinuc_dict = {}
        for nmer in dinuc:
            dinuc_dict[nmer] = float(lineSeq.count(nmer)) / len(lineSeq)

        mycnRankLine = mycnRankTable[i]
        mycnRank = numpy.mean([float(x) for x in mycnRankLine[6:]])

        canonMatchList = re.findall('CACGTG', lineSeq)
        canon_count = len(canonMatchList)

        eboxMatchList = re.findall('CA..TG', lineSeq)
        ebox_count = len(eboxMatchList)

        non_canon_count = ebox_count - canon_count

        #get the expected values
        canon_exp = dinuc_dict['CA'] * dinuc_dict['CG'] * dinuc_dict['TG'] * (len(lineSeq) - 5)
        canon_exp = round(canon_exp, 2)
        notCG = 1 - dinuc_dict['CG']
        non_exp = dinuc_dict['CA'] * notCG * dinuc_dict['TG'] * (len(lineSeq) - 5)
        non_exp = round(non_exp, 2)

        #for gata and GABPA
        gabpaMatchList = re.findall('CGGAAG', lineSeq) + re.findall('CTTCCG', lineSeq)
        gabpa_count = len(gabpaMatchList)
        gabpa_exp_f = dinuc_dict['CG'] * dinuc_dict['GA'] * dinuc_dict['AG'] * (len(lineSeq) - 5)
        gabpa_exp_r = dinuc_dict['CT'] * dinuc_dict['TC'] * dinuc_dict['CG'] * (len(lineSeq) - 5)
        gabpa_exp = round(gabpa_exp_f, 2) + round(gabpa_exp_r, 2)

        gataMatchList = re.findall('GATAA', lineSeq) + re.findall('TTATC', lineSeq)
        gata_count = len(gataMatchList)
        an_freq = 1 - dinuc_dict['AA'] - dinuc_dict['AT'] - dinuc_dict['AG'] - dinuc_dict['AC']
        cn_freq = 1 - dinuc_dict['CA'] - dinuc_dict['CT'] - dinuc_dict['CG'] - dinuc_dict['CC']
        gata_exp_f = dinuc_dict['GA'] * dinuc_dict['TA'] * an_freq * (len(lineSeq) - 5)
        gata_exp_r = dinuc_dict['TT'] * dinuc_dict['AT'] * cn_freq * (len(lineSeq) - 5)
        gata_exp = round(gata_exp_f, 2) + round(gata_exp_r, 2)

        newLine = [peakID, chrom, start, stop, lineLocus.len(), tssOverlap, enhancerOverlap, cpgIslandOverlap, overlapFraction, gcFreq, mycnRank, mycn_signal, h3k27ac_signal, canon_count, non_canon_count, ebox_count, canon_exp, non_exp, gabpa_count, gabpa_exp, gata_count, gata_exp]
        outTable.append(newLine)

    utils.unParseTable(outTable, outFile, '\t')
    return outFile
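# Toy illustration of the E-box counting and the dinucleotide-based expected
# count used in make_mycn_stats_table above (made-up sequence; dinucleotide
# frequencies here are computed with a sliding window, one reasonable
# normalization, while the function above uses str.count over the sequence).
import re

toy_seq = 'CACGTGAACAGTTGACACGTGTTCATTG'
dinuc_freq = {}
for i in range(len(toy_seq) - 1):
    nmer = toy_seq[i:i + 2]
    dinuc_freq[nmer] = dinuc_freq.get(nmer, 0) + 1
for nmer in dinuc_freq:
    dinuc_freq[nmer] = float(dinuc_freq[nmer]) / (len(toy_seq) - 1)

canon_count = len(re.findall('CACGTG', toy_seq))  # canonical E-boxes observed
ebox_count = len(re.findall('CA..TG', toy_seq))   # all E-boxes observed
# expected canonical count: product of the three component dinucleotide
# frequencies times the number of possible 6-mer start positions
canon_exp = dinuc_freq.get('CA', 0) * dinuc_freq.get('CG', 0) * dinuc_freq.get('TG', 0) * (len(toy_seq) - 5)
print(canon_count, ebox_count, round(canon_exp, 2))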
def make_exp_table(probe_gene_dict, exp_1_path, exp_2_path, ob_s1_path, exp_table_path, patient_table_path): ''' making gene expression table first use an intermediary dictionary exp_dict[gene][patient] = list <- in case multiple exp values present at probe or patient level ''' #check if this has already been done if utils.checkOutput(exp_table_path, 0, 0) and utils.checkOutput( patient_table_path, 0, 0): print( 'loading premade expression table at %s and formatted patient table at %s' % (exp_table_path, patient_table_path)) return exp_table_path, patient_table_path #first get a master list of patients print('formatting patient data') table_s1 = utils.parseTable(ob_s1_path, '\t') patient_list = utils.uniquify([line[0] for line in table_s1[1:]]) #make a dict w/ relevant patient info patient_dict = {} for line in table_s1[1:]: patient_line = [line[3]] + line[8:12] if line[12] == 'Amp': patient_line.append(1) else: patient_line.append(0) patient_dict[line[0]] = patient_line gene_list = utils.uniquify( [probe_gene_dict[probe_id] for probe_id in probe_gene_dict.keys()]) gene_list.sort() exp_dict = {} for gene in gene_list: exp_dict[gene] = {} for patient in patient_list: exp_dict[gene][patient] = [] #next load up both expression tables exp_1 = utils.parseTable(exp_1_path, '\t') exp_2 = utils.parseTable(exp_2_path, '\t') exp_1_header = exp_1[0] exp_1_cols = [ i for i in range(1, len(exp_1_header)) if patient_list.count(exp_1_header[i].split('_')[1]) > 0 ] exp_1_patients = [ exp_1_header[i].split('_')[1] for i in range(1, len(exp_1_header)) if patient_list.count(exp_1_header[i].split('_')[1]) > 0 ] exp_2_header = exp_2[0] exp_2_cols = [ i for i in range(1, len(exp_2_header)) if patient_list.count(exp_2_header[i].split('_')[1]) > 0 ] exp_2_patients = [ exp_2_header[i].split('_')[1] for i in range(1, len(exp_2_header)) if patient_list.count(exp_2_header[i].split('_')[1]) > 0 ] print('loading expression dataset %s' % exp_1_path) for line in exp_1: if probe_gene_dict.has_key(line[0]): for i in range(len(exp_1_cols)): patient_name = exp_1_patients[i] col = exp_1_cols[i] expression = float(line[col]) gene_name = probe_gene_dict[line[0]] exp_dict[gene_name][patient_name].append(expression) print('loading expression dataset %s' % exp_2_path) for line in exp_2: if probe_gene_dict.has_key(line[0]): for i in range(len(exp_2_cols)): patient_name = exp_2_patients[i] col = exp_2_cols[i] expression = float(line[col]) gene_name = probe_gene_dict[line[0]] exp_dict[gene_name][patient_name].append(expression) print('making gene expression table') exp_table = [['GENE_NAME'] + exp_1_patients + exp_2_patients] for gene in gene_list: if min([ len(exp_dict[gene][patient]) for patient in exp_1_patients + exp_2_patients ]) == 0: print(gene) continue exp_line = [gene] + [ numpy.mean(exp_dict[gene][patient]) for patient in exp_1_patients + exp_2_patients ] exp_table.append(exp_line) patient_table = [[ 'PATIENT', 'STAGE', 'EFS_D', 'EFS_STATUS', 'OS_D', 'OS_STATUS', 'MYCN_STATUS' ]] for patient in exp_1_patients + exp_2_patients: patient_line = [patient] + patient_dict[patient] patient_table.append(patient_line) utils.unParseTable(exp_table, exp_table_path, '\t') utils.unParseTable(patient_table, patient_table_path, '\t') return exp_table_path, patient_table_path
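# Minimal sketch of the probe-to-gene collapsing strategy used in
# make_exp_table: probe-level values are accumulated per gene and then
# averaged (toy data; probe and gene names are made up for illustration).
import numpy

probe_gene = {'p1': 'MYCN', 'p2': 'MYCN', 'p3': 'GATA2'}
probe_values = {'p1': 5.2, 'p2': 4.8, 'p3': 7.1}

gene_values = {}
for probe_id in probe_values:
    gene_values.setdefault(probe_gene[probe_id], []).append(probe_values[probe_id])

for gene in sorted(gene_values):
    print(gene, numpy.mean(gene_values[gene]))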
def geneToEnhancerDict(genome, enhancer_file, activity_path): ''' Assign each Super-Enhancer to the closest active TSS to its center Return a dictionary keyed by TF that points to a list of loci ''' print('Identifying enhancers and target genes from %s' % (enhancer_file)) #should this do gene assignment???? #for now assume gene assignment has been done #can later toggle to do gene assignment #first load the TF lists tf_table = utils.parseTable(genome.returnFeature('tf_file'), '\t') motif_table = utils.parseTable(genome.returnFeature('motif_convert'), '\t') #this gives all tfs that have a motif motif_tfs = utils.uniquify([line[1] for line in motif_table]) #intersect w/ the activity table if len(activity_path) > 0: activity_table = utils.parseTable(activity_path, '\t') #figure out the right column for actual gene names (basically not NM or NR and not a numeral) for i in range(len(activity_table[0])): # try: # foo = int(activity_table[0][i]) # except ValueError: # case where it is not an integer if activity_table[0][i][0:2] != 'NM' and activity_table[0][i][ 0:2] != 'NR': #assumes refseq gene_col = i break print('using column %s of %s gene activity table for common names' % (gene_col + 1, activity_path)) active_gene_list = [ string.upper(line[gene_col]) for line in activity_table ] tf_list_refseq = [ line[0] for line in tf_table if active_gene_list.count(line[1]) > 0 and motif_tfs.count(line[1]) > 0 ] tf_list_name = utils.uniquify([ line[1] for line in tf_table if active_gene_list.count(line[1]) > 0 and motif_tfs.count(line[1]) > 0 ]) else: tf_list_refseq = [ line[0] for line in tf_table if motif_tfs.count(line[1]) > 0 ] tf_list_name = [ line[1] for line in tf_table if motif_tfs.count(line[1]) > 0 ] print('Identified %s TFs from %s that have motifs' % (len(tf_list_name), genome.returnFeature('tf_file'))) #keyed by gene with loci objects in the list gene_to_enhancer_dict = defaultdict(list) enhancer_to_gene_dict = defaultdict(list) #assuming id,chrom,start,stop w/ gene names in the last 3 columns per standard ROSE output enhancer_table = utils.parseTable(enhancer_file, '\t') print('Analyzing %s cis-regulatory regions' % (len(enhancer_table))) #now let's make the enhancer table by region and then by gene enhancerTable = [['ENHANCER_ID', 'CHROM', 'START', 'STOP', 'GENE_LIST']] enhancerTFTable = [['ENHANCER_ID', 'CHROM', 'START', 'STOP', 'GENE_LIST']] geneTable = [['GENE', 'TF', 'CHROM', 'START', 'STOP', 'ENHANCER_ID']] geneTFTable = [['GENE', 'CHROM', 'START', 'STOP', 'ENHANCER_ID']] geneSummaryTable = [['GENE', 'TF', 'ENHANCER_LIST']] #will need to track which ones are TFs candidate_tf_list = [] #find the columns for gene assignment header = enhancer_table[0] header_length = len(enhancer_table[0]) closest_index = header.index('CLOSEST_GENE') proximal_index = header.index('PROXIMAL_GENES') overlap_index = header.index('OVERLAP_GENES') for line in enhancer_table[1:]: if len( line ) != header_length: #don't bother trying to figure out lines w/o target genes continue enhancer_locus = utils.Locus(line[1], line[2], line[3], '.', line[0]) closest_gene_list = line[closest_index].split(',') proximal_gene_list = line[proximal_index].split(',') overlap_gene_list = line[overlap_index].split(',') all_gene_list = closest_gene_list + proximal_gene_list + overlap_gene_list all_gene_list = [string.upper(gene) for gene in all_gene_list] #print(all_gene_list) #print(activity_path) #print(active_gene_list) #gets a unique list of all tfs if len(activity_path) > 0: all_gene_list = utils.uniquify([ gene for gene in 
all_gene_list if active_gene_list.count(gene) > 0 ]) else: all_gene_list = utils.uniquify(all_gene_list) candidate_gene_list = utils.uniquify( [gene for gene in all_gene_list if tf_list_name.count(gene) > 0]) if len(all_gene_list) > 0: for gene in all_gene_list: gene_to_enhancer_dict[gene].append(enhancer_locus) enhancer_to_gene_dict[enhancer_locus].append(gene) newLine = line[0:4] + [','.join(all_gene_list)] else: newLine = line[0:4] + [''] enhancerTable.append(newLine) if len(candidate_gene_list) > 0: tfLine = line[0:4] + [','.join(candidate_gene_list)] enhancerTFTable.append(tfLine) #now iterate through each gene and list the enhancers gene_list = gene_to_enhancer_dict.keys() print(gene_list) gene_list.sort() for gene in gene_list: if tf_list_name.count(gene) > 0: tf_status = 1 candidate_tf_list.append(gene) else: tf_status = 0 enhancer_loci = gene_to_enhancer_dict[gene] enhancerString = ','.join( [enhancer.ID() for enhancer in enhancer_loci]) geneSummaryTable.append([gene, tf_status, enhancerString]) for enhancer in enhancer_loci: newLine = [ gene, tf_status, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID() ] geneTable.append(newLine) if tf_status == 1: newLine = [ gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID() ] geneTFTable.append(newLine) return geneTable, geneTFTable, enhancerTable, enhancerTFTable, geneSummaryTable, candidate_tf_list, gene_to_enhancer_dict
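# Minimal sketch of the two-way gene/enhancer bookkeeping in
# geneToEnhancerDict, using plain strings instead of utils.Locus objects
# (toy IDs; illustrative only).
from collections import defaultdict

gene_to_enhancer = defaultdict(list)
enhancer_to_gene = defaultdict(list)
assignments = [('ENH_1', 'MYCN'), ('ENH_1', 'GATA2'), ('ENH_2', 'MYCN')]
for enhancer_id, gene in assignments:
    gene_to_enhancer[gene].append(enhancer_id)
    enhancer_to_gene[enhancer_id].append(gene)

print(dict(gene_to_enhancer))  # {'MYCN': ['ENH_1', 'ENH_2'], 'GATA2': ['ENH_1']}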
def main():
    from optparse import OptionParser

    usage = "usage: %prog [options] -e [ENHANCER_FILE] -b [BAM_FILE] -g [GENOME] -o [OUTPUTFOLDER] -n [NAME] -s [SUBPEAKS] -x [EXP_CUTOFF] -l [EXTENSION_LENGTH]"
    parser = OptionParser(usage=usage)

    # Required flags
    parser.add_option("-e", "--enhancer_file", dest="enhancers", nargs=1, default=None,
                      help="Provide a ROSE generated enhancer table (_AllEnhancers.table.txt)")
    parser.add_option("-b", "--bam_file", dest="bam", nargs=1, default=None,
                      help="Provide a sorted indexed bam file for H3K27ac sequencing reads")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Provide the build of the genome to be used for the analysis. Currently supports HG19, HG18 and MM9")
    parser.add_option("-f", "--fasta", dest="fasta", nargs=1, default=None,
                      help="Enter location of the fasta files for the genome version used")
    parser.add_option("-s", "--subpeaks", dest="subpeaks", nargs=1, default=None,
                      help="Enter a bedfile of peaks output from MACS used to identify SE constituents")
    parser.add_option("-x", "--exp_Cutoff", dest="expCutoff", nargs=1, default=33,
                      help="Enter the percentage of transcripts that are not considered expressed, default=33")
    parser.add_option("-l", "--extension_length", dest="extension", nargs=1, default=500,
                      help="Enter the length (in bp) to extend constituents for motif search, default=500")
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Enter the sample name")
    parser.add_option("-o", "--output", dest="output", nargs=1, default=None,
                      help="Enter directory to be used for storing output")

    # Options
    parser.add_option("-a", "--activity", dest="activity", nargs=1, default=None,
                      help="Enter a two column table with refseq in the first column and the associated activity (expression or promoter acetylation level) in the second column")
    parser.add_option("-E", "--enhancer_number", dest="Enumber", nargs=1, default='supers',
                      help="Enter the number of top ranked enhancers to include in the analysis, default = supers")

    (options, args) = parser.parse_args()
    print(options)

    if options.enhancers and options.bam and options.genome and options.fasta and options.subpeaks and options.expCutoff and options.extension and options.name and options.output:

        # Set parameters
        genomeDirectory = options.fasta

        genome = options.genome
        genome = upper(genome)
        if genome == 'HG19':
            annotationFile = './annotation/hg19_refseq.ucsc'
            TFfile = './TFlist_NMid_hg.txt'
        if genome == 'HG18':
            annotationFile = './annotation/hg18_refseq.ucsc'
            TFfile = './TFlist_NMid_hg.txt'
        if genome == 'MM9':
            annotationFile = './annotation/mm9_refseq.ucsc'
            TFfile = './TFlist_NMid_ms.txt'

        motifConvertFile = './MotifDictionary.txt'
        motifDatabaseFile = './VertebratePWMs.txt'

        TFtable = utils.parseTable(TFfile, '\t')
        TFlist = [line[0] for line in TFtable]
        TFlistGene = [line[1] for line in TFtable]

        superFile = options.enhancers
        superTable = utils.parseTable(superFile, '\t')

        bamFile = options.bam
        bam = utils.Bam(bamFile)

        subpeaks = options.subpeaks

        expCutoff = int(options.expCutoff)

        motifExtension = int(options.extension)

        projectName = options.name

        projectFolder = options.output

        refseqToNameDict = {}
        expressionFile = options.activity
        if expressionFile:
            expressionTable = utils.parseTable(expressionFile, '\t')
        else:
            calculatePromoterActivity(annotationFile, bamFile, projectName, projectFolder, refseqToNameDict)
            expressionFilename = projectFolder + 'matrix.gff'
            expressionTable = utils.parseTable(expressionFilename, '\t')

        if options.Enumber != 'supers':
            enhancerNumber = options.Enumber
        else:
            enhancerNumber = 'supers'

        # Run the program
        superLoci = createSuperLoci(superTable)
        expressedNM = createExpressionDict(annotationFile, projectFolder, projectName, refseqToNameDict, expressionTable)
        TFandSuperDict = findCanidateTFs(annotationFile, superLoci, expressedNM, TFlist, refseqToNameDict, projectFolder, projectName)
        formatOutput(TFandSuperDict, refseqToNameDict, projectName, projectFolder)
        candidateGenes = [upper(refseqToNameDict[x]) for x in TFandSuperDict.keys()]
        candidateGenes = utils.uniquify(candidateGenes)
        generateSubpeakFASTA(TFandSuperDict, subpeaks, genomeDirectory, projectName, projectFolder, motifExtension)
        findMotifs(candidateGenes, projectFolder, projectName, motifConvertFile, motifDatabaseFile)
        graph = buildNetwork(projectFolder, projectName, candidateGenes, refseqToNameDict, motifConvertFile)
        formatNetworkOutput(graph, projectFolder, projectName, candidateGenes)

    # Return help
    else:
        parser.print_help()
        sys.exit()
def tf_edge_delta_out(crc_folder, chip_dataFile, analysis_name, group1_list, group2_list, output=''): ''' calculates changes in brd4 out degree at each predicted motif occurrence this is by subpeaks ''' crc_folder = utils.formatFolder(crc_folder, False) edge_path = '%s%s_EDGE_TABLE.txt' % (crc_folder, analysis_name) #make a gff of the edge table edge_table = utils.parseTable(edge_path, '\t') edge_gff = [] for line in edge_table[1:]: gff_line = [ line[2], '%s_%s' % (line[0], line[1]), '', line[3], line[4], '', '.', '', '%s_%s' % (line[0], line[1]) ] edge_gff.append(gff_line) edge_gff_path = '%s%s_EDGE_TABLE.gff' % (crc_folder, analysis_name) utils.unParseTable(edge_gff, edge_gff_path, '\t') #direct the output to the crc folder signal_path = '%s%s_EDGE_TABLE_signal.txt' % (crc_folder, analysis_name) #get a list of all chip datasets all_chip_list = group1_list + group2_list if utils.checkOutput(signal_path, 0, 0) == False: signal_table_list = pipeline_dfci.map_regions(chip_dataFile, [edge_gff_path], mappedFolder, signalFolder, all_chip_list, True, signal_path, extendReadsTo=100) print(signal_table_list) else: print('Found previous signal table at %s' % (signal_path)) #now bring in the signal table as a dictionary using the locus line as the id print('making log2 group1 over group2 signal table at edges') signal_table = utils.parseTable(signal_path, '\t') signal_dict = defaultdict(float) #figure out columns for group1 and group2 group2_columns = [signal_table[0].index(name) for name in group2_list] group1_columns = [signal_table[0].index(name) for name in group1_list] group2_signal_vector = [] group1_signal_vector = [] for line in signal_table[1:]: group2_signal = numpy.mean( [float(line[col]) for col in group2_columns]) group1_signal = numpy.mean( [float(line[col]) for col in group1_columns]) group2_signal_vector.append(group2_signal) group1_signal_vector.append(group1_signal) group2_median = numpy.median(group2_signal_vector) group1_median = numpy.median(group1_signal_vector) print('group2 median signal (rpm/bp)') print(group2_median) print('group1 median signal (rpm/bp)') print(group1_median) #now that we have the median, we can take edges where at least 1 edge is above the median #and both are above zero and generate a new table w/ the fold change signal_filtered_path = string.replace(signal_path, '.txt', '_filtered.txt') if utils.checkOutput(signal_filtered_path, 0, 0): print('Found filtered signal table for edges at %s' % (signal_filtered_path)) signal_table_filtered = utils.parseTable(signal_filtered_path, '\t') else: signal_table_filtered = [ signal_table[0] + ['GROUP2_MEAN', 'GROUP1_MEAN', 'LOG2_GROUP1_OVER_GROUP2'] ] for line in signal_table[1:]: group2_signal = numpy.mean( [float(line[col]) for col in group2_columns]) group1_signal = numpy.mean( [float(line[col]) for col in group1_columns]) if (group2_signal > group2_median or group1_signal > group1_median ) and min(group2_signal, group1_signal) > 0: delta = numpy.log2(group1_signal / group2_signal) new_line = line + [group2_signal, group1_signal, delta] signal_table_filtered.append(new_line) utils.unParseTable(signal_table_filtered, signal_filtered_path, '\t') #now get a list of all TFs in the system tf_list = utils.uniquify( [line[0].split('_')[0] for line in signal_table_filtered[1:]]) tf_list.sort() print(tf_list) out_degree_table = [[ 'TF_NAME', 'EDGE_COUNT', 'DELTA_MEAN', 'DELTA_MEDIAN', 'DELTA_STD', 'DELTA_SEM' ]] for tf_name in tf_list: print(tf_name) edge_vector = [ float(line[-1]) for line in signal_table_filtered[1:] if 
line[0].split('_')[0] == tf_name ] edge_count = len(edge_vector) delta_mean = round(numpy.mean(edge_vector), 4) delta_median = round(numpy.median(edge_vector), 4) delta_std = round(numpy.std(edge_vector), 4) delta_sem = round(stats.sem(edge_vector), 4) tf_out_line = [ tf_name, edge_count, delta_mean, delta_median, delta_std, delta_sem ] out_degree_table.append(tf_out_line) if output == '': #set final output output_path = '%s%s_EDGE_DELTA_OUT.txt' % (crc_folder, analysis_name) else: output_path = output utils.unParseTable(out_degree_table, output_path, '\t') print(output_path) return (output_path)
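# Minimal sketch of the per-TF summary statistics computed above from the
# filtered edge table: mean/median/std/SEM of the per-edge log2 fold changes
# (toy numbers; scipy.stats.sem is the same call the function uses).
import numpy
from scipy import stats

edge_vector = [0.4, -0.1, 0.9, 0.3, 0.6]  # log2(group1/group2) per edge for one TF
print(len(edge_vector),
      round(numpy.mean(edge_vector), 4),
      round(numpy.median(edge_vector), 4),
      round(numpy.std(edge_vector), 4),
      round(stats.sem(edge_vector), 4))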
def regionStitching(referenceCollection, name, outFolder, stitchWindow, tssWindow, annotFile, removeTSS=True):
    print('PERFORMING REGION STITCHING')
    # first have to turn bound region file into a locus collection
    # need to make sure this names correctly... each region should have a unique name
    #referenceCollection

    debugOutput = []
    # filter out all bound regions that overlap the TSS of an ACTIVE GENE
    if removeTSS:
        print('REMOVING TSS FROM REGIONS USING AN EXCLUSION WINDOW OF %sBP' % (tssWindow))
        # first make a locus collection of TSS
        startDict = utils.makeStartDict(annotFile)

        # now makeTSS loci for active genes
        removeTicker = 0
        # this loop makes a locus centered around +/- tssWindow of transcribed genes
        # then adds it to the list tssLoci
        tssLoci = []
        for geneID in startDict.keys():
            tssLoci.append(utils.makeTSSLocus(geneID, startDict, tssWindow, tssWindow))

        # this turns the tssLoci list into a LocusCollection
        # 50 is the internal parameter for LocusCollection and doesn't really matter
        tssCollection = utils.LocusCollection(tssLoci, 50)

        # gives all the loci in referenceCollection
        boundLoci = referenceCollection.getLoci()

        # this loop will check if each bound region is contained by the TSS exclusion zone
        # this will drop out a lot of the promoter only regions that are tiny
        # typical exclusion window is around 2kb
        for locus in boundLoci:
            if len(tssCollection.getContainers(locus, 'both')) > 0:
                # if true, the bound locus overlaps an active gene
                referenceCollection.remove(locus)
                debugOutput.append([locus.__str__(), locus.ID(), 'CONTAINED'])
                removeTicker += 1
        print('REMOVED %s LOCI BECAUSE THEY WERE CONTAINED BY A TSS' % (removeTicker))

    # referenceCollection is now all enriched region loci that don't overlap an active TSS
    if stitchWindow == '':
        print('DETERMINING OPTIMUM STITCHING PARAMETER')
        optCollection = copy.deepcopy(referenceCollection)
        stitchWindow = optimizeStitching(optCollection, name, outFolder, stepSize=500)
    print('USING A STITCHING PARAMETER OF %s' % stitchWindow)
    stitchedCollection = referenceCollection.stitchCollection(stitchWindow, 'both')

    if removeTSS:
        # now replace any stitched region that overlaps more than 2 distinct genes
        # with the original loci that were there
        fixedLoci = []
        tssLoci = []
        for geneID in startDict.keys():
            tssLoci.append(utils.makeTSSLocus(geneID, startDict, 50, 50))

        # this turns the tssLoci list into a LocusCollection
        # 50 is the internal parameter for LocusCollection and doesn't really matter
        tssCollection = utils.LocusCollection(tssLoci, 50)
        removeTicker = 0
        originalTicker = 0
        for stitchedLocus in stitchedCollection.getLoci():
            overlappingTSSLoci = tssCollection.getOverlap(stitchedLocus, 'both')
            tssNames = [startDict[tssLocus.ID()]['name'] for tssLocus in overlappingTSSLoci]
            tssNames = utils.uniquify(tssNames)
            if len(tssNames) > 2:
                # stitchedCollection.remove(stitchedLocus)
                originalLoci = referenceCollection.getOverlap(stitchedLocus, 'both')
                originalTicker += len(originalLoci)
                fixedLoci += originalLoci
                debugOutput.append([stitchedLocus.__str__(), stitchedLocus.ID(), 'MULTIPLE_TSS'])
                removeTicker += 1
            else:
                fixedLoci.append(stitchedLocus)

        print('REMOVED %s STITCHED LOCI BECAUSE THEY OVERLAPPED MULTIPLE TSSs' % (removeTicker))
        print('ADDED BACK %s ORIGINAL LOCI' % (originalTicker))
        fixedCollection = utils.LocusCollection(fixedLoci, 50)
        return fixedCollection, debugOutput, stitchWindow
    else:
        return stitchedCollection, debugOutput, stitchWindow
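# Toy illustration of the stitching idea delegated above to
# referenceCollection.stitchCollection: merge intervals whose gap is at most
# stitchWindow (a simplified stand-in for the utils implementation; single
# chromosome, sense ignored).
def stitch_intervals(intervals, stitch_window):
    '''intervals: list of (start, stop) tuples on one chromosome, any order'''
    stitched = []
    for start, stop in sorted(intervals):
        if stitched and start - stitched[-1][1] <= stitch_window:
            stitched[-1][1] = max(stitched[-1][1], stop)
        else:
            stitched.append([start, stop])
    return [tuple(pair) for pair in stitched]

print(stitch_intervals([(100, 200), (250, 300), (1000, 1100)], 100))
# -> [(100, 300), (1000, 1100)]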
def makePeakTable(paramDict, splitGFFPath, averageTablePath, startDict, geneList, genomeDirectory, tads_path=''):
    '''
    makes the final peak table with ebox info
    '''
    peakTable = [['REGION_ID', 'CHROM', 'START', 'STOP', 'LENGTH', 'TSS', 'CPG', 'CPG_FRACTION', 'GC_FREQ', 'SIGNAL', 'CANON_EBOX_COUNT', 'NON_CANON_EBOX_COUNT', 'TOTAL_EBOX_COUNT', 'OVERLAPPING_GENES', 'PROXIMAL_GENES']]

    print('LOADING PEAK REGIONS')
    peakGFF = utils.parseTable(splitGFFPath, '\t')

    print('LOADING BINDING DATA')
    signalTable = utils.parseTable(averageTablePath, '\t')

    print('LOADING CPG ISLANDS')
    cpgBed = utils.parseTable(paramDict['cpgPath'], '\t')
    cpgLoci = []
    for line in cpgBed:
        cpgLoci.append(utils.Locus(line[0], line[1], line[2], '.', line[-1]))
    cpgCollection = utils.LocusCollection(cpgLoci, 50)

    print("MAKING TSS COLLECTIONS")
    if len(geneList) == 0:
        geneList = startDict.keys()

    tss_1kb_loci = []
    tss_50kb_loci = []
    for refID in geneList:
        tss_1kb_loci.append(utils.makeTSSLocus(refID, startDict, 1000, 1000))
        tss_50kb_loci.append(utils.makeTSSLocus(refID, startDict, 50000, 50000))

    #make a 1kb flanking and 50kb flanking collection
    tss_1kb_collection = utils.LocusCollection(tss_1kb_loci, 50)
    tss_50kb_collection = utils.LocusCollection(tss_50kb_loci, 50)

    if len(tads_path) > 0:
        print('LOADING TADS FROM %s' % (tads_path))
        tad_collection = utils.importBoundRegion(tads_path, 'tad')
        use_tads = True

        #building a tad dict keyed by tad ID w/ genes in that tad provided
        tad_dict = defaultdict(list)
        for tss_locus in tss_1kb_loci:
            overlapping_tads = tad_collection.getOverlap(tss_locus, 'both')
            for tad_locus in overlapping_tads:
                tad_dict[tad_locus.ID()].append(tss_locus.ID())
    else:
        use_tads = False

    print('CLASSIFYING PEAKS')
    ticker = 0
    no_tad_count = 0
    for i in range(len(peakGFF)):
        if ticker % 1000 == 0:
            print(ticker)
        ticker += 1

        #getting the particulars of the region
        gffLine = peakGFF[i]
        peakID = gffLine[1]
        chrom = gffLine[0]
        start = int(gffLine[3])
        stop = int(gffLine[4])
        lineLocus = utils.Locus(chrom, start, stop, '.', peakID)

        #getting the mapped signal
        signalLine = signalTable[(i + 1)]
        signalVector = [float(x) for x in signalLine[2:]]

        #setting up the new line
        newLine = [peakID, chrom, start, stop, lineLocus.len()]

        #get the tss status from the gff itself (we are able to do this nicely from the split gff code earlier)
        newLine.append(gffLine[7])

        #check cpg status
        if cpgCollection.getOverlap(lineLocus, 'both'):
            newLine.append(1)
        else:
            newLine.append(0)

        #now do fractional cpgOverlap
        overlappingCpGLoci = cpgCollection.getOverlap(lineLocus, 'both')
        overlappingBases = 0
        for locus in overlappingCpGLoci:
            cpgStart = max(locus.start(), lineLocus.start())
            cpgEnd = min(locus.end(), lineLocus.end())
            overlappingBases += (cpgEnd - cpgStart)
        overlapFraction = float(overlappingBases) / lineLocus.len()
        newLine.append(round(overlapFraction, 2))

        #now get the seq
        lineSeq = string.upper(utils.fetchSeq(genomeDirectory, chrom, start, stop, True))
        if len(lineSeq) == 0:
            print('UH OH')
            print(lineSeq)
            print(gffLine)
            print(i)
            print(chrom)
            print(start)
            print(stop)
            sys.exit()

        gcFreq = float(lineSeq.count('GC') + lineSeq.count('CG')) / len(lineSeq)
        newLine.append(gcFreq)

        #this is where we add the ChIP-Seq signal
        newLine += signalVector

        eboxMatchList = re.findall('CA..TG', lineSeq)
        if len(eboxMatchList) == 0:
            newLine += [0] * 3
        else:
            totalCount = len(eboxMatchList)
            canonCount = eboxMatchList.count('CACGTG')
            otherCount = totalCount - canonCount
            newLine += [canonCount, otherCount, totalCount]

        #now find the overlapping and proximal genes
        #here each overlapping gene has its tss 1kb locus overlapping the peak
        if use_tads:
            tad_loci = tad_collection.getOverlap(lineLocus, 'both')
            tad_id_list = [tad_locus.ID() for tad_locus in tad_loci]
            tad_genes = []
            for tad_id in tad_id_list:
                tad_genes += tad_dict[tad_id]
            if len(tad_genes) == 0:
                #print('no tad for this region')
                #print(gffLine)
                no_tad_count += 1
        else:
            tad_genes = []

        if len(tad_genes) > 0:
            overlappingGenes = [startDict[locus.ID()]['name'] for locus in tss_1kb_collection.getOverlap(lineLocus, 'both') if tad_genes.count(locus.ID()) > 0]
            proximalGenes = [startDict[locus.ID()]['name'] for locus in tss_50kb_collection.getOverlap(lineLocus, 'both') if tad_genes.count(locus.ID()) > 0]
            # print('linked peak to tad genes')
            # print([startDict[x]['name'] for x in tad_genes])
            # print(tad_id_list)
            # print(gffLine)
            # print(overlappingGenes)
            # print(proximalGenes)
        else:
            overlappingGenes = [startDict[locus.ID()]['name'] for locus in tss_1kb_collection.getOverlap(lineLocus, 'both')]
            proximalGenes = [startDict[locus.ID()]['name'] for locus in tss_50kb_collection.getOverlap(lineLocus, 'both')]

        overlappingGenes = utils.uniquify(overlappingGenes)
        #here the tss 50kb locus overlaps the peak
        #overlap takes priority over proximal
        proximalGenes = [gene for gene in proximalGenes if overlappingGenes.count(gene) == 0]
        proximalGenes = utils.uniquify(proximalGenes)

        overlappingString = string.join(overlappingGenes, ',')
        proximalString = string.join(proximalGenes, ',')

        newLine += [overlappingString, proximalString]
        peakTable.append(newLine)

    print('Out of %s regions, %s could not be assigned to a TAD' % (len(peakTable) - 1, no_tad_count))
    return peakTable
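# Minimal sketch of the fractional CpG overlap computed in makePeakTable:
# each overlapping island is clipped to the peak and the clipped bases are
# summed (toy coordinates).
peak = (1000, 2000)
cpg_islands = [(900, 1200), (1500, 1600)]

overlapping_bases = 0
for cpg_start, cpg_end in cpg_islands:
    clipped = min(cpg_end, peak[1]) - max(cpg_start, peak[0])
    if clipped > 0:
        overlapping_bases += clipped

print(round(float(overlapping_bases) / (peak[1] - peak[0]), 2))  # 0.3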
def mapBamToGFF(bamFile, gff, sense='.', extension=200, rpm=False, clusterGram=None, matrix=None):
    '''maps reads from a bam to a gff'''

    #creating a new gff to output
    newGFF = []

    #reading in the bam
    bam = utils.Bam(bamFile)

    #getting RPM normalization
    if rpm:
        MMR = round(float(bam.getTotalReads('mapped')) / 1000000, 4)
    else:
        MMR = 1
    print('using a MMR value of %s' % (MMR))

    #creating a sense trans
    senseTrans = string.maketrans('-+.', '+-+')

    #reading in the gff
    if type(gff) == str:
        gff = utils.parseTable(gff, '\t')

    #setting up a clustergram table
    if clusterGram:
        binSize = int(clusterGram)
        binSizeList = []
        #now go through each line of the gff and make sure they're all the same length
        for i in range(0, len(gff), 1):
            line = gff[i]
            gffLocus = utils.Locus(line[0], int(line[3]), int(line[4]), line[6], line[1])
            binSizeList.append(gffLocus.len() / binSize)
        binSizeList = utils.uniquify(binSizeList)
        if len(binSizeList) > 1:
            print('WARNING: lines in gff are of different length. Output clustergram will have variable row length')
        newGFF.append(['GENE_ID', 'locusLine'] + [str(x * binSize) + '_' + bamFile.split('/')[-1] for x in range(1, max(binSizeList) + 1, 1)])

    #setting up a matrix table
    if matrix:
        newGFF.append(['GENE_ID', 'locusLine'] + ['bin_' + str(n) + '_' + bamFile.split('/')[-1] for n in range(1, int(matrix) + 1, 1)])
        nBin = int(matrix)

    # Try to use the bamliquidator script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamliquidatorString = '/usr/bin/bamliquidator'
    if not os.path.isfile(bamliquidatorString):
        bamliquidatorString = './bamliquidator'
        if not os.path.isfile(bamliquidatorString):
            raise ValueError('bamliquidator not found in path')

    #getting and processing reads for gff lines
    ticker = 0
    print('Number of lines processed')
    for line in gff:
        line = line[0:9]
        if ticker % 100 == 0:
            print(ticker)
        ticker += 1
        gffLocus = utils.Locus(line[0], int(line[3]), int(line[4]), line[6], line[1])

        #get the nBin and binSize
        if clusterGram:
            nBin = gffLocus.len() / int(clusterGram)
            binSize = int(clusterGram)
        if matrix:
            nBin = int(matrix)
            binSize = gffLocus.len() / nBin

        #some regions will be too short to get info on
        if binSize == 0:
            clusterLine = [gffLocus.ID(), gffLocus.__str__()] + ['NA'] * nBin
            newGFF.append(clusterLine)
            continue

        #flippy flip if sense is negative
        if sense == '-':
            bamSense = string.translate(gffLocus.sense(), senseTrans)
        elif sense == '+':
            bamSense = gffLocus.sense()
        else:
            bamSense = '.'

        #using the bamLiquidator to get the readstring
        #print('using nBin of %s' % nBin)
        bamCommand = "%s %s %s %s %s %s %s %s" % (bamliquidatorString, bamFile, line[0], gffLocus.start(), gffLocus.end(), bamSense, nBin, extension)
        #print(bamCommand)
        getReads = subprocess.Popen(bamCommand, stdin=subprocess.PIPE, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
        readString, stderr = getReads.communicate()
        if stderr:
            print("STDERR out: %s" % (stderr))
        denList = readString.split('\n')[:-1]
        #print("denlist is: %s" % denList)

        #flip the denList if the actual gff region is - strand
        if gffLocus.sense() == '-':
            denList = denList[::-1]

        #converting from units of total bp of read sequence per bin to rpm/bp
        denList = [round(float(x) / binSize / MMR, 4) for x in denList]

        clusterLine = [gffLocus.ID(), gffLocus.__str__()] + denList
        newGFF.append(clusterLine)

    return newGFF
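# Minimal sketch of the density normalization applied to the bamliquidator
# output in mapBamToGFF: per-bin totals of read bp are converted to rpm/bp by
# dividing by the bin size and by the million-mapped-reads factor (toy numbers).
bin_counts = [12000.0, 8000.0, 0.0]  # total read bp per bin
bin_size = 200                       # bp per bin
mmr = 25.5                           # mapped reads / 1,000,000

density = [round(x / bin_size / mmr, 4) for x in bin_counts]
print(density)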
def loadAnnotFile(genome, window, geneList=[], skip_cache=False):
    """
    load in the annotation and create a startDict and tss collection
    for a set of refseq IDs of a given genome
    """
    genomeDict = {
        'HG18': 'annotation/hg18_refseq.ucsc',
        'MM9': 'annotation/mm9_refseq.ucsc',
        'MM10': 'annotation/mm10_refseq.ucsc',
        'HG19': 'annotation/hg19_refseq.ucsc',
        'HG19_RIBO': 'annotation/hg19_refseq.ucsc',
        'RN4': 'annotation/rn4_refseq.ucsc',
        'RN6': 'annotation/rn6_refseq.ucsc',
        'HG38': 'annotation/hg38_refseq.ucsc',
    }

    genomeDirectoryDict = {
        'HG19': '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/',
        'RN6': '/storage/cylin/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Chromosomes/',
        'MM9': '/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm9/Sequence/Chromosomes/',
        'MM10': '/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm10/Sequence/Chromosomes/',
        'HG38': '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg38/Sequence/Chromosomes/',
    }

    mouse_convert_file = '%s/annotation/HMD_HumanPhenotype.rpt' % (whereAmI)

    #making a dictionary for mouse to human conversion
    mouse_convert_dict = defaultdict(str)
    mouse_convert_table = utils.parseTable(mouse_convert_file, '\t')
    for line in mouse_convert_table:
        mouse_convert_dict[line[4]] = line[0]

    genomeDirectory = genomeDirectoryDict[string.upper(genome)]

    #making a chrom_list of all chroms with sequence
    chrom_list = utils.uniquify([name.split('.')[0] for name in os.listdir(genomeDirectory) if len(name) > 0])

    annotFile = whereAmI + '/' + genomeDict[string.upper(genome)]

    if not skip_cache:
        # Try loading from a cache, if the crc32 matches
        annotPathHash = zlib.crc32(annotFile) & 0xFFFFFFFF  # hash the location of the annotation file
        annotFileHash = zlib.crc32(open(annotFile, "rb").read()) & 0xFFFFFFFF

        cache_file_name = "%s.%s.%s.cache" % (genome, annotPathHash, annotFileHash)
        cache_file_path = '%s/%s' % (tempfile.gettempdir(), cache_file_name)

        if os.path.isfile(cache_file_path):
            # Cache exists! Load it!
            try:
                print('\tLoading genome data from cache.')
                with open(cache_file_path, 'rb') as cache_fh:
                    startDict, tssCollection = cPickle.load(cache_fh)
                print('\tCache loaded.')
                # return the full tuple so cache hits and misses are interchangeable
                return startDict, tssCollection, genomeDirectory, chrom_list, mouse_convert_dict
            except (IOError, cPickle.UnpicklingError):
                # Pickle corrupt? Let's get rid of it.
                print('\tWARNING: Cache corrupt or unreadable. Ignoring.')
        else:
            print('\tNo cache exists: Loading annotation (slow).')

    # We're still here, so either caching was disabled, or the cache doesn't exist
    startDict = utils.makeStartDict(annotFile, geneList)
    tssLoci = []
    if geneList == []:
        geneList = startDict.keys()
    for gene in geneList:
        tssLoci.append(utils.makeTSSLocus(gene, startDict, window, window))
    tssCollection = utils.LocusCollection(tssLoci, 50)

    if not skip_cache:
        print('Writing cache for the first time.')
        with open(cache_file_path, 'wb') as cache_fh:
            cPickle.dump((startDict, tssCollection), cache_fh, cPickle.HIGHEST_PROTOCOL)

    return startDict, tssCollection, genomeDirectory, chrom_list, mouse_convert_dict
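# Minimal sketch of the cache-key construction used in loadAnnotFile: a CRC32
# of the annotation file path plus a CRC32 of its contents, so the cache is
# invalidated when either changes (illustrative path; Python 2 style to match
# the surrounding code).
import zlib

annot_file = 'annotation/hg19_refseq.ucsc'  # illustrative path
path_hash = zlib.crc32(annot_file) & 0xFFFFFFFF
content_hash = zlib.crc32(open(annot_file, 'rb').read()) & 0xFFFFFFFF
cache_name = 'HG19.%s.%s.cache' % (path_hash, content_hash)
print(cache_name)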
def main(): ''' using argparse ''' parser = argparse.ArgumentParser(usage='%(prog)s -d DATA_TABLE -1 GROUP1_NAMES -2 GROUP2_NAMES') # required flags parser.add_argument("-d", "--data_table", dest="data_table", type=str, help="input a data table with all datasets to be analyzed", required=True) parser.add_argument("-1", "--group1", dest="group1", type=str, help="input a comma separated list of all datasets in group1", required=True) parser.add_argument("-2", "--group2", dest="group2", type=str, help="input a comma separated list of all datasets in group2", required=True) #optional input override parser.add_argument("-i", "--input", dest="input", type=str, help="input a gff of regions to analyze", required=False) #optional arguments parser.add_argument("-n", "--name", dest="name", type=str, help="specify a name for the analysis. Default is drawn from the data table name", required=False) parser.add_argument("--group1-name", dest="group1_name", default='GROUP1',type=str, help="Enter a name for group1. Default is 'GROUP1'", required=False) parser.add_argument("--group2-name", dest="group2_name", default='GROUP2',type=str, help="Enter a name for group2. Default is 'GROUP2'", required=False) parser.add_argument("-a", "--activity", dest="activity", type=str,default='', help="a table with active gene names in the first column", required=False) parser.add_argument("-t", "--tss", dest="tss", type=int,default=2500, help="Specify a TSS exclusion distance. Default is 2500", required=False) parser.add_argument("-s", "--stitch", dest="stitch", type=int,default=None, help="Specify a stitching distance. Default is auto stitching", required=False) parser.add_argument("-o", "--output", dest="output", default='./',type=str, help="Enter the output folder. Default is the current working directory", required=False) parser.add_argument("--log", dest="log", default='',type=str, help="Enter a path to log output", required=False) # # DEBUG OPTION TO SAVE TEMP FILES # parser.add_argument("--scale", dest="scale", default='', # help="Enter a comma separated list of scaling factors for your bams. Default is none") # parser.add_argument("--save-temp", dest="save", action='store_true', default=False, # help="If flagged will save temporary files made by bamPlot") # parser.add_argument("--bed", dest="bed", # help="Add a space-delimited list of bed files to plot") # parser.add_argument("--multi-page", dest="multi", action='store_true', default=False, # help="If flagged will create a new pdf for each region") args = parser.parse_args() #now we can begin to parse the arguments #===================================================================================== #===============================I. 
PARSING ARGUMENTS================================== #===================================================================================== #pulling in the data table data_file = os.path.abspath(args.data_table) dataDict = pipeline_dfci.loadDataTable(data_file) #setting naming conventions if not args.name: analysis_name = data_file.split('/')[-1].split('.')[0] else: analysis_name = args.name #getting the optional input gff if args.input: inputGFF = args.input else: inputGFF = '' #getting group names group1_name = args.group1_name group2_name = args.group2_name #getting group1 group1_string = args.group1 group1_list = [name for name in string.split(group1_string,',') if len(name) > 0] #getting group2 group2_string = args.group2 group2_list = [name for name in string.split(group2_string,',') if len(name) > 0] #checking that all datasets are in the data table for name in group1_list + group2_list: if name not in dataDict: print('ERROR: DATASET %s NOT FOUND IN DATA TABLE %s. EXITING NOW' % (name,data_file)) sys.exit() #loading in the genome object from the data table genome_list = utils.uniquify([dataDict[name]['genome'] for name in group1_list + group2_list]) if len(genome_list) > 1: print('ERROR: ATTEMPTING TO ANALYZE DATASETS FROM MULTIPLE GENOME BUILDS. EXITING NOW.') sys.exit() #the load genome function has an assertion test to make sure the genome is supported genome = loadGenome(genome_list[0]) parent_folder = utils.formatFolder(args.output,True) output_folder = utils.formatFolder(parent_folder + analysis_name,True) #these are the user defined optional arguments tss = int(args.tss) stitch = args.stitch print('stitch') print(stitch) #list of active genes to constrain analysis if len(args.activity) == 0: #assumes all genes are active unless told otherwise #activity_path,activity_table = getActivity() # fix this function activity_path = '' activity_table = [] print('using all active genes') else: activity_path = args.activity activity_table = utils.parseTable(activity_path,'\t') print('\n\n#======================================\n#===========I. DATA SUMMARY============\n#======================================\n') print('Analyzing datasets described in %s\n' % (data_file)) print('Name for the analysis: %s\n' % (analysis_name)) print('Using genome: %s\n' % (genome.name())) print('%s datasets: %s\n' % (group1_name,group1_string)) print('%s datasets: %s\n' % (group2_name,group2_string)) if len(activity_path) > 0: print('Identified %s active genes in the analysis using %s as a list of active genes' % (len(activity_table),activity_path)) else: print('Identified %s active genes in the analysis using aggregate data from %s and %s' % (len(activity_table),group1_name,group2_name)) print('Writing output to: %s\n' % (output_folder)) #===================================================================================== #======================II. DEFINING CIS-REGULATORY ELEMENTS=========================== #===================================================================================== print('\n\n#======================================\n#=II. MAPPING CIS-REGULATORY ELEMENTS==\n#======================================\n') #crc_wrapper will act at the group level and not consider individual datasets #since a data table is used as the input, the code will rely heavily on pipeline_dfci #embedded tools #1. 
first we need to run meta rose using default parameters and check the output #exists for each group meta_rose_folder = utils.formatFolder(output_folder + 'meta_rose/',True) group1_output = '%s%s/%s_AllEnhancers.table.txt' % (meta_rose_folder,group1_name,group1_name) group2_output = '%s%s/%s_AllEnhancers.table.txt' % (meta_rose_folder,group2_name,group2_name) #print(group1_output) #print(group2_output) #for each output check to see if they exist #if not launch try: foo = open(group1_output,'r') except IOError: print('No META_ROSE output found for %s. Running META_ROSE now' % (group1_name)) launchMetaRose(group1_name,group1_list,meta_rose_folder,genome,data_file,stitch,tss) try: foo = open(group2_output,'r') except IOError: print('No META_ROSE output found for %s. Running META_ROSE now' % (group2_name)) launchMetaRose(group2_name,group2_list,meta_rose_folder,genome,data_file,stitch,tss) #now check for completion if utils.checkOutput(group1_output,1,10): print('META_ROSE finished for %s' % (group1_name)) else: print('META_ROSE timed out for %s. EXITING NOW.' % (group1_name)) sys.exit() if utils.checkOutput(group2_output,1,10): print('META_ROSE finished for %s' % (group2_name)) else: print('META_ROSE timed out for %s. EXITING NOW.' % (group2_name)) sys.exit() #Meta rose does not give all regions that are SE in at least one sample #and can be blown out by amplicons etc... #sooo we need to run clustering to generate a good input gff #ideally we just rewrite dynamic meta to run off of clustering output #until we do that let's just overwrite w/ an input gff print('Comparing cis-regulatory landscapes of %s and %s' % (group1_name,group2_name)) dynamic_rose_folder = utils.formatFolder(output_folder + 'dynamic_meta_rose/',True) #here we will use the rank table as the primary output dynamic_rose_output = '%soutput/%s_%s_%s_merged_MERGED_SUPERS_RANK_TABLE.txt' % (dynamic_rose_folder,genome.name(),group1_name,group2_name) try: foo = open(dynamic_rose_output,'r') except IOError: print('No DYNAMIC_ROSE output found for %s. Running DYNAMIC_ROSE now' % (analysis_name)) launchDynamicRose(analysis_name,group1_name,group2_name,group1_list,group2_list,meta_rose_folder,dynamic_rose_folder,genome,data_file,activity_path,inputGFF) if utils.checkOutput(dynamic_rose_output,1,10): print('DYNAMIC_ROSE finished for %s' % (analysis_name)) else: print('DYNAMIC_ROSE analysis timed out for %s. EXITING NOW.' % (analysis_name)) sys.exit() #===================================================================================== #======================III. IDENTIFYING TF NODES IN NETWORK=========================== #===================================================================================== print('\n\n#======================================\n#===III. RUNNING CIRCUITRY ANALYSIS====\n#======================================\n') #now we want to call circuitry on each group... 
ok to have different subpeaks and motif calls #if as a first approximation we weight by the overall enhancer crc_folder = utils.formatFolder('%scrc/' % (output_folder),True) #for all all_crc_folder = utils.formatFolder('%s%s' % (crc_folder,analysis_name),True) launchCRC(data_file,genome,dynamic_rose_output,analysis_name,group1_list+group2_list,all_crc_folder,activity_path) #for group1 group1_crc_folder = utils.formatFolder('%s%s' % (crc_folder,group1_name),True) launchCRC(data_file,genome,dynamic_rose_output,group1_name,group1_list,group1_crc_folder,activity_path) #for group2 group2_crc_folder = utils.formatFolder('%s%s' % (crc_folder,group2_name),True) launchCRC(data_file,genome,dynamic_rose_output,group2_name,group2_list,group2_crc_folder,activity_path)
def main(): ''' main run call ''' debug = False from optparse import OptionParser usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]" parser = OptionParser(usage=usage) # required flags parser.add_option( "-i", "--i", dest="input", nargs=1, default=None, help="Enter a .gff or .bed file of binding sites used to make enhancers" ) parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None, help="bamfile to rank enhancer by") parser.add_option("-o", "--out", dest="out", nargs=1, default=None, help="Enter an output folder") parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None, help="Enter the genome build (MM9,MM8,HG18,HG19)") # optional flags parser.add_option( "-b", "--bams", dest="bams", nargs=1, default=None, help="Enter a comma separated list of additional bam files to map to") parser.add_option("-c", "--control", dest="control", nargs=1, default=None, help="bamfile to rank enhancer by") parser.add_option( "-s", "--stitch", dest="stitch", nargs=1, default='', help= "Enter a max linking distance for stitching. Default will determine optimal stitching parameter" ) parser.add_option( "-t", "--tss", dest="tss", nargs=1, default=0, help="Enter a distance from TSS to exclude. 0 = no TSS exclusion") parser.add_option( "--mask", dest="mask", nargs=1, default=None, help= "Mask a set of regions from analysis. Provide a .bed or .gff of masking regions" ) # RETRIEVING FLAGS (options, args) = parser.parse_args() if not options.input or not options.rankby or not options.out or not options.genome: print('hi there') parser.print_help() exit() # making the out folder if it doesn't exist outFolder = utils.formatFolder(options.out, True) # figuring out folder schema gffFolder = utils.formatFolder(outFolder + 'gff/', True) mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True) # GETTING INPUT FILE if options.input.split('.')[-1] == 'bed': # CONVERTING A BED TO GFF inputGFFName = options.input.split('/')[-1][0:-4] inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName) utils.bedToGFF(options.input, inputGFFFile) elif options.input.split('.')[-1] == 'gff': # COPY THE INPUT GFF TO THE GFF FOLDER inputGFFFile = options.input os.system('cp %s %s' % (inputGFFFile, gffFolder)) else: print( 'WARNING: INPUT FILE DOES NOT END IN .gff or .bed. 
ASSUMING .gff FILE FORMAT' ) # COPY THE INPUT GFF TO THE GFF FOLDER inputGFFFile = options.input os.system('cp %s %s' % (inputGFFFile, gffFolder)) # GETTING THE LIST OF BAMFILES TO PROCESS if options.control: bamFileList = [options.rankby, options.control] else: bamFileList = [options.rankby] if options.bams: bamFileList += options.bams.split(',') #bamFileList = utils.uniquify(bamFileList) # makes sad when you have the same control bam over and over again # optional args # Stitch parameter if options.stitch == '': stitchWindow = '' else: stitchWindow = int(options.stitch) # tss options tssWindow = int(options.tss) if tssWindow != 0: removeTSS = True else: removeTSS = False # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS print('USING %s AS THE INPUT GFF' % (inputGFFFile)) inputName = inputGFFFile.split('/')[-1].split('.')[0] # GETTING THE GENOME genome = options.genome print('USING %s AS THE GENOME' % genome) # GETTING THE CORRECT ANNOT FILE cwd = os.getcwd() genomeDict = { 'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd), 'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd), 'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd), 'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd), 'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd), 'RN4': '%s/annotation/rn4_refseq.ucsc' % (cwd), 'RN6': '%s/annotation/rn6_refseq.ucsc' % (cwd), } annotFile = genomeDict[genome.upper()] # MAKING THE START DICT print('MAKING START DICT') startDict = utils.makeStartDict(annotFile) #GET CHROMS FOUND IN THE BAMS print('GETTING CHROMS IN BAMFILES') bamChromList = getBamChromList(bamFileList) print("USING THE FOLLOWING CHROMS") print(bamChromList) #LOADING IN THE GFF AND FILTERING BY CHROM print('LOADING AND FILTERING THE GFF') inputGFF = filterGFF(inputGFFFile, bamChromList) # LOADING IN THE BOUND REGION REFERENCE COLLECTION print('LOADING IN GFF REGIONS') referenceCollection = utils.gffToLocusCollection(inputGFF) print('CHECKING REFERENCE COLLECTION:') checkRefCollection(referenceCollection) # MASKING REFERENCE COLLECTION # see if there's a mask if options.mask: maskFile = options.mask # if it's a bed file if maskFile.split('.')[-1].upper() == 'BED': maskGFF = utils.bedToGFF(maskFile) elif maskFile.split('.')[-1].upper() == 'GFF': maskGFF = utils.parseTable(maskFile, '\t') else: print("MASK MUST BE A .gff or .bed FILE") sys.exit() maskCollection = utils.gffToLocusCollection(maskGFF) # now mask the reference loci referenceLoci = referenceCollection.getLoci() filteredLoci = [ locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0 ] print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile)) referenceCollection = utils.LocusCollection(filteredLoci, 50) # NOW STITCH REGIONS print('STITCHING REGIONS TOGETHER') stitchedCollection, debugOutput, stitchWindow = regionStitching( referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS) # NOW MAKE A STITCHED COLLECTION GFF print('MAKING GFF FROM STITCHED COLLECTION') stitchedGFF = utils.locusCollectionToGFF(stitchedCollection) # making sure start/stop ordering are correct for i in range(len(stitchedGFF)): line = stitchedGFF[i] start = int(line[3]) stop = int(line[4]) if start > stop: line[3] = stop line[4] = start print(stitchWindow) print(type(stitchWindow)) if not removeTSS: stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, str(stitchWindow / 1000)) stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow / 1000)) debugOutFile = 
'%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, str(stitchWindow / 1000)) else: stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % ( gffFolder, inputName, str(stitchWindow / 1000)) stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % ( inputName, str(stitchWindow / 1000)) debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % ( gffFolder, inputName, str(stitchWindow / 1000)) # WRITING DEBUG OUTPUT TO DISK if debug: print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile)) utils.unParseTable(debugOutput, debugOutFile, '\t') # WRITE THE GFF TO DISK print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile)) utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t') # SETTING UP THE OVERALL OUTPUT FILE outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt' print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1)) # MAPPING TO THE NON STITCHED (ORIGINAL GFF) # MAPPING TO THE STITCHED GFF # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail. bamliquidator_path = 'bamliquidator_batch.py' bamFileListUnique = list(bamFileList) bamFileListUnique = utils.uniquify(bamFileListUnique) #prevent redundant mapping print("MAPPING TO THE FOLLOWING BAMS:") print(bamFileListUnique) for bamFile in bamFileListUnique: bamFileName = bamFile.split('/')[-1] # MAPPING TO THE STITCHED GFF mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName) mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % ( mappedFolder, stitchedGFFName, bamFileName) if utils.checkOutput(mappedOut1File, 0.2, 0.2): print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File)) else: cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % ( stitchedGFFFile, mappedOut1Folder, bamFile) print(cmd1) os.system(cmd1) if utils.checkOutput(mappedOut1File, 0.2, 5): print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName)) else: print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName)) sys.exit() print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS') # CALCULATE DENSITY BY REGION # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName) print('CALLING AND PLOTTING SUPER-ENHANCERS') if options.control: rankbyName = options.rankby.split('/')[-1] controlName = options.control.split('/')[-1] cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % ( outFolder, outputFile1, inputName, controlName) else: rankbyName = options.rankby.split('/')[-1] controlName = 'NONE' cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % ( outFolder, outputFile1, inputName, controlName) print(cmd) os.system(cmd) # calling the gene mapper time.sleep(20) superTableFile = "%s_SuperEnhancers.table.txt" % (inputName) if options.control: cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s &" % ( genome, options.rankby, options.control, outFolder, superTableFile) else: cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s &" % ( genome, options.rankby, outFolder, superTableFile) os.system(cmd) stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName) if options.control: cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s &" % ( genome, options.rankby, options.control, outFolder, stretchTableFile) else: cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s &" % ( genome, options.rankby, outFolder, stretchTableFile) os.system(cmd) 
superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName) if options.control: cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s &" % ( genome, options.rankby, options.control, outFolder, superStretchTableFile) else: cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s &" % ( genome, options.rankby, outFolder, superStretchTableFile) os.system(cmd)
def findCanidateTFs(annotationFile, enhancerLoci, expressedNM, expressionDictNM, bamFile, TFlist, refseqToNameDict, projectFolder, projectName, promoter): ''' Assign each Super-Enhancer to the closest active TSS to its center Return a dictionary keyed by TF that points to a list of loci ''' print 'FINDING CANIDATE TFs' enhancerAssignment = [] TFtoEnhancerDict = defaultdict(list) startDict = utils.makeStartDict(annotationFile) tssLoci = [] for gene in expressedNM: tssLoci.append(utils.makeTSSLocus(gene,startDict,1000,1000)) tssCollection = utils.LocusCollection(tssLoci,50) # Loop through enhancers for enhancer in enhancerLoci: # If the enhancer overlaps a TSS, save it overlappingLoci = tssCollection.getOverlap(enhancer, 'both') overlappingGenes =[] for overlapLocus in overlappingLoci: overlappingGenes.append(overlapLocus.ID()) # Find all gene TSS within 100 kb proximalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer,100000,100000),'both') proximalGenes =[] for proxLocus in proximalLoci: proximalGenes.append(proxLocus.ID()) # If no genes are within 100 kb, find the closest active gene closestGene = '' if len(overlappingGenes) == 0 and len(proximalGenes) == 0: distalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer,1000000,1000000),'both') distalGenes =[] for distalLocus in distalLoci: distalGenes.append(distalLocus.ID()) enhancerCenter = (int(enhancer.start()) + int(enhancer.end())) / 2 distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in distalGenes] if distList: closestGene = distalGenes[distList.index(min(distList))] overlappingGenes = utils.uniquify(overlappingGenes) proximalGenes = utils.uniquify(proximalGenes) for refID in overlappingGenes: if proximalGenes.count(refID) == 1: proximalGenes.remove(refID) # If a TSS overlaps an enhancer, assign them together if overlappingGenes: for gene in overlappingGenes: if gene in TFlist: TFtoEnhancerDict[gene].append(enhancer) enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()]) # Otherwise, assign the enhancer to the most active gene in 100 kb elif not overlappingGenes and proximalGenes: highestGene = '' highestActivity = 0 for gene in proximalGenes: if expressionDictNM[gene] > highestActivity: highestActivity = expressionDictNM[gene] highestGene = gene if highestGene in TFlist: TFtoEnhancerDict[gene].append(enhancer) enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()]) elif not overlappingGenes and not proximalGenes and closestGene: if closestGene in TFlist: gene = closestGene TFtoEnhancerDict[gene].append(enhancer) enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()]) # Add promoter is it's not contained in the super if promoter: for gene in TFtoEnhancerDict.keys(): promoter = utils.Locus(startDict[gene]['chr'], int(startDict[gene]['start'][0]) - 2000, int(startDict[gene]['start'][0]) + 2000, startDict[gene]['sense']) overlapBool = False for enhancer in TFtoEnhancerDict[gene]: if promoter.overlaps(enhancer): overlapBool = True if not overlapBool: TFtoEnhancerDict[gene].append(promoter) seAssignmentFile = projectFolder + projectName + '_ENHANCER_ASSIGNMENT.txt' utils.unParseTable(enhancerAssignment, seAssignmentFile, '\t') return TFtoEnhancerDict
def main(): ''' main run call ''' debug = False from optparse import OptionParser usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]" parser = OptionParser(usage=usage) # required flags parser.add_option( "-i", "--i", dest="input", nargs=1, default=None, help= "Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers" ) parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None, help="Enter a comma separated list of bams to rank by") parser.add_option("-o", "--out", dest="out", nargs=1, default=None, help="Enter an output folder") parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None, help="Enter the genome build (MM9,MM8,HG18,HG19)") # optional flags parser.add_option( "-n", "--name", dest="name", nargs=1, default=None, help="Provide a name for the analysis otherwise ROSE will guess") parser.add_option( "-c", "--control", dest="control", nargs=1, default=None, help= "Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam" ) parser.add_option( "-s", "--stitch", dest="stitch", nargs=1, default='', help= "Enter a max linking distance for stitching. Default will determine optimal stitching parameter" ) parser.add_option( "-t", "--tss", dest="tss", nargs=1, default=0, help="Enter a distance from TSS to exclude. 0 = no TSS exclusion") parser.add_option( "--mask", dest="mask", nargs=1, default=None, help= "Mask a set of regions from analysis. Provide a .bed or .gff of masking regions" ) # RETRIEVING FLAGS (options, args) = parser.parse_args() if not options.input or not options.rankby or not options.out or not options.genome: print('hi there') parser.print_help() exit() # making the out folder if it doesn't exist outFolder = utils.formatFolder(options.out, True) # figuring out folder schema gffFolder = utils.formatFolder(outFolder + 'gff/', True) mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True) # GETTING INPUT FILE(s) inputList = [ inputFile for inputFile in options.input.split(',') if len(inputFile) > 1 ] #converting all input files into GFFs and moving into the GFF folder inputGFFList = [] for inputFile in inputList: if inputFile.split('.')[-1] == 'bed': # CONVERTING A BED TO GFF inputGFFName = inputFile.split('/')[-1][ 0:-4] #strips the last 4 characters i.e. '.bed' inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName) utils.bedToGFF(inputFile, inputGFFFile) elif options.input.split('.')[-1] == 'gff': # COPY THE INPUT GFF TO THE GFF FOLDER os.system('cp %s %s' % (inputFile, gffFolder)) inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1]) else: print( 'WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT' ) # COPY THE INPUT GFF TO THE GFF FOLDER os.system('cp %s %s' % (inputFile, gffFolder)) inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1]) inputGFFList.append(inputGFFFile) # GETTING THE LIST OF BAMFILES TO PROCESS #either same number of bams for rankby and control #or only 1 control #or none! 
#bamlist should be all rankby bams followed by control bams bamFileList = [] if options.control: controlBamList = [ bam for bam in options.control.split(',') if len(bam) > 0 ] rankbyBamList = [ bam for bam in options.rankby.split(',') if len(bam) > 0 ] if len(controlBamList) == len(rankbyBamList): #case where an equal number of backgrounds are given bamFileList = rankbyBamList + controlBamList elif len(controlBamList) == 1: #case where a universal background is applied bamFileList = rankbyBamList + controlBamList * len(rankbyBamList) else: print( 'ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE' ) sys.exit() else: bamFileList = [ bam for bam in options.rankby.split(',') if len(bam) > 0 ] # Stitch parameter if options.stitch == '': stitchWindow = '' else: stitchWindow = int(options.stitch) # tss options tssWindow = int(options.tss) if tssWindow != 0: removeTSS = True else: removeTSS = False # GETTING THE GENOME genome = string.upper(options.genome) print('USING %s AS THE GENOME' % (genome)) # GETTING THE CORRECT ANNOT FILE genomeDict = { 'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir), 'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir), 'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir), 'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir), 'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir), 'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir), } try: annotFile = genomeDict[genome.upper()] except KeyError: print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome)) sys.exit() #FINDING THE ANALYSIS NAME if options.name: inputName = options.name else: inputName = inputGFFList[0].split('/')[-1].split('.')[0] print('USING %s AS THE ANALYSIS NAME' % (inputName)) print('FORMATTING INPUT REGIONS') # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs #use a simpler unique region naming system if len(inputGFFList) == 1: inputGFF = utils.parseTable(inputGFFList[0], '\t') else: inputLoci = [] for gffFile in inputGFFList: print('\tprocessing %s' % (gffFile)) gff = utils.parseTable(gffFile, '\t') gffCollection = utils.gffToLocusCollection(gff, 50) inputLoci += gffCollection.getLoci() inputCollection = utils.LocusCollection(inputLoci, 50) inputCollection = inputCollection.stitchCollection( ) # stitches to produce unique regions inputGFF = utils.locusCollectionToGFF(inputCollection) formattedGFF = [] #now number things appropriately for i, line in enumerate(inputGFF): #use the coordinates to make a new id inputname_chr_sense_start_stop chrom = line[0] coords = [int(line[3]), int(line[4])] sense = line[6] lineID = '%s_%s' % (inputName, str(i + 1)) #1 indexing newLine = [ chrom, lineID, lineID, min(coords), max(coords), '', sense, '', lineID ] formattedGFF.append(newLine) #name of the master input gff file masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder, string.upper(genome), inputName) utils.unParseTable(formattedGFF, masterGFFFile, '\t') print('USING %s AS THE INPUT GFF' % (masterGFFFile)) # MAKING THE START DICT print('MAKING START DICT') startDict = utils.makeStartDict(annotFile) #GET CHROMS FOUND IN THE BAMS print('GETTING CHROMS IN BAMFILES') bamChromList = getBamChromList(bamFileList) print("USING THE FOLLOWING CHROMS") print(bamChromList) #LOADING IN THE GFF AND FILTERING BY CHROM print('LOADING AND FILTERING THE GFF') inputGFF = filterGFF(masterGFFFile, bamChromList) # LOADING IN THE BOUND REGION REFERENCE COLLECTION print('LOADING IN GFF REGIONS') referenceCollection = utils.gffToLocusCollection(inputGFF) print('CHECKING 
REFERENCE COLLECTION:') checkRefCollection(referenceCollection) # MASKING REFERENCE COLLECTION # see if there's a mask if options.mask: maskFile = options.mask # if it's a bed file if maskFile.split('.')[-1].upper() == 'BED': maskGFF = utils.bedToGFF(maskFile) elif maskFile.split('.')[-1].upper() == 'GFF': maskGFF = utils.parseTable(maskFile, '\t') else: print("MASK MUST BE A .gff or .bed FILE") sys.exit() maskCollection = utils.gffToLocusCollection(maskGFF) # now mask the reference loci referenceLoci = referenceCollection.getLoci() filteredLoci = [ locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0 ] print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile)) referenceCollection = utils.LocusCollection(filteredLoci, 50) # NOW STITCH REGIONS print('STITCHING REGIONS TOGETHER') stitchedCollection, debugOutput, stitchWindow = regionStitching( referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS) # NOW MAKE A STITCHED COLLECTION GFF print('MAKING GFF FROM STITCHED COLLECTION') stitchedGFF = utils.locusCollectionToGFF(stitchedCollection) print(stitchWindow) print(type(stitchWindow)) if not removeTSS: stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, str(stitchWindow / 1000)) stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow / 1000)) debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, str(stitchWindow / 1000)) else: stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % ( gffFolder, inputName, str(stitchWindow / 1000)) stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % ( inputName, str(stitchWindow / 1000)) debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % ( gffFolder, inputName, str(stitchWindow / 1000)) # WRITING DEBUG OUTPUT TO DISK if debug: print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile)) utils.unParseTable(debugOutput, debugOutFile, '\t') # WRITE THE GFF TO DISK print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile)) utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t') # SETTING UP THE OVERALL OUTPUT FILE outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt' print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1)) # MAPPING TO THE NON STITCHED (ORIGINAL GFF) # MAPPING TO THE STITCHED GFF # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail. bamFileListUnique = list(bamFileList) bamFileListUnique = utils.uniquify(bamFileListUnique) #prevent redundant mapping print("MAPPING TO THE FOLLOWING BAMS:") print(bamFileListUnique) for bamFile in bamFileListUnique: bamFileName = bamFile.split('/')[-1] # MAPPING TO THE STITCHED GFF mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName) mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % ( mappedFolder, stitchedGFFName, bamFileName) if utils.checkOutput(mappedOut1File, 0.2, 0.2): print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File)) else: cmd1 = bamliquidator_path + " --sense . 
-e 200 --match_bamToGFF -r %s -o %s %s" % ( stitchedGFFFile, mappedOut1Folder, bamFile) print(cmd1) os.system(cmd1) if utils.checkOutput(mappedOut1File, 0.2, 5): print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName)) else: print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName)) sys.exit() print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS') # CALCULATE DENSITY BY REGION # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName) print('FINDING AVERAGE SIGNAL AMONGST BAMS') metaOutputFile = collapseRegionMap(outputFile1, inputName + '_MERGED_SIGNAL', controlBams=options.control) #now try the merging print('CALLING AND PLOTTING SUPER-ENHANCERS') rankbyName = inputName + '_MERGED_SIGNAL' controlName = 'NONE' cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % ( pipeline_dir, outFolder, metaOutputFile, inputName, controlName) print(cmd) os.system(cmd) # calling the gene mapper print('CALLING GENE MAPPING') superTableFile = "%s_SuperEnhancers.table.txt" % (inputName) #for now don't use ranking bam to call top genes cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % ( pipeline_dir, genome, outFolder, superTableFile) print(cmd) os.system(cmd) stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName) cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % ( pipeline_dir, genome, outFolder, stretchTableFile) print(cmd) os.system(cmd) superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName) cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % ( pipeline_dir, genome, outFolder, superStretchTableFile) os.system(cmd)
def main(): from optparse import OptionParser usage = "usage: %prog [options] -e [ENHANCER_FILE] -b [BAM_FILE] -g [GENOME] -o [OUTPUTFOLDER] -n [NAME]" parser = OptionParser(usage = usage) #required flags parser.add_option("-e","--enhancer_file", dest="enhancers",nargs = 1, default=None, help = "Provide a ROSE generated enhancer table (_AllEnhancers.table.txt)") parser.add_option("-b","--bam",dest="bam",nargs =1, default = None, help = "Provide a bam that corresponds to the super enhancer table") parser.add_option("-g","--genome",dest="genome",nargs =1, default = None, help = "Provide the build of the genome to be used for the analysis. Currently supports HG19, HG18 and MM9") parser.add_option("-o","--output",dest="output",nargs =1, default = None, help = "Enter an output folder") parser.add_option("-n","--name",dest="name",nargs =1, default = None, help = "Provide a name for the job") #additional options parser.add_option("-s","--subpeaks", dest="subpeaks",nargs=1,default=None, help = "Enter a BED file of regions to search for motifs") parser.add_option("-x","--expCutoff", dest="expCutoff",nargs=1,default=33, help = "Enter the expression cutoff to be used to define canidate TFs") parser.add_option("-l","--extension-length", dest="extension",nargs = 1, default=100, help = "Enter the length to extend subpeak regions for motif finding") parser.add_option("-B","--background", dest="background",nargs = 1, default=None, help = "Provide a background BAM file") parser.add_option("-a","--activity", dest="activity",nargs = 1, default=None, help = "A table with refseq in the first column and activity (expression or promoter acetylation) in second") parser.add_option("-E","--enhancer_number", dest="Enumber",nargs = 1, default='super', help = "Enter the number of top ranked enhancers to include in the anlaysis. 
Default is all super-enhancers") parser.add_option("-N", "--number", dest="number",nargs = 1, default=2, help = "Enter the number of motifs required to assign a binding event") #I have modified the destination of -N option so that it is different from the destination of -E option parser.add_option("--promoter", dest="promoter",nargs = 1, default=False, help = "Enter True if the promoters should be included in the analysis") parser.add_option("--motifs", dest="motifs",nargs = 1, default=False, help = "Enter an alternative PWM file for the analysis") parser.add_option("-t","--tfs", dest="tfs",nargs=1,default=None, help = "Enter additional TFs (comma separated) to be used in the bindinf analysis") parser.add_option("-u","--ucsc", dest="is_ucsc", action='store_true', default=False, help = "If set, use the ucsc folders or files with chromosome names as chr1, chr2, etc.") (options,args) = parser.parse_args() print(options) if options.enhancers and options.genome and options.output and options.name: ### # Define all global file names ### if options.motifs: motifDatabaseFile = options.motifs else: motifConvertFile = '/home/rad/users/gaurav/projects/ctrc/scripts/CLL_TFnetworks_2018/annotations/MotifDictionary.txt' motifDatabaseFile = '/home/rad/users/gaurav/projects/ctrc/scripts/CLL_TFnetworks_2018/annotations/VertebratePWMs.txt' # User input files enhancerFile = options.enhancers enhancerTable = utils.parseTable(enhancerFile, '\t') if options.bam: bamFile = options.bam bam = utils.Bam(bamFile) if options.background: background = options.background else: background = None genome = options.genome genome = upper(genome) if genome == 'HG19': genomeDirectory = '/home/rad/packages/data/fasta/human/hg19/chromosomes/' annotationFile = '/home/rad/users/gaurav/projects/ctrc/scripts/pipeline/annotation/hg19_refseq.ucsc' TFfile = '/home/rad/users/gaurav/projects/ctrc/scripts/CLL_TFnetworks_2018/annotations/TFlist_NMid_hg19.txt' if genome == 'HG18': genomeDirectory = '/grail/genomes/Homo_sapiens/human_gp_mar_06_no_random/fasta/' annotationFile = '/ark/home/cl512/src/pipeline/annotation/hg18_refseq.ucsc' TFfile = '/home/rad/users/gaurav/projects/ctrc/scripts/CLL_TFnetworks_2018/annotations/TFlist_NMid_hg19.txt' if genome == 'MM9': genomeDirectory = '/grail/genomes/Mus_musculus/UCSC/mm9/Sequence/Chromosomes/' annotationFile = '/home/rad/users/gaurav/projects/ctrc/scripts/pipeline/annotation/mm9_refseq.ucsc' TFfile = '/home/rad/users/gaurav/projects/ctrc/scripts/CLL_TFnetworks_2018/annotations/TFlist_NMid_mm9.txt' if genome == 'MM10': TFfile = '/home/rad/users/gaurav/projects/ctrc/scripts/CLL_TFnetworks_2018/annotations/TFlist_NMid_mm10.txt' if options.is_ucsc: genomeDirectory = '/home/rad/packages/data/fasta/mouse/mm10/ucsc_chromosomes/' annotationFile = '/home/rad/users/gaurav/projects/ctrc/scripts/pipeline/annotation/ucsc/mm10_refseq.ucsc' else: genomeDirectory = '/home/rad/packages/data/fasta/mouse/mm10/chromosomes/' annotationFile = '/home/rad/users/gaurav/projects/ctrc/scripts/pipeline/annotation/mm10_refseq.ucsc' TFtable = utils.parseTable(TFfile, '\t') TFlist = [line[0] for line in TFtable] TFlistGene = [line[1] for line in TFtable] projectFolder = options.output projectName = options.name if options.subpeaks: subpeakFile = options.subpeaks else: subpeakFile = None refseqToNameDict = {} expressionFile = options.activity if expressionFile: expressionTable = utils.parseTable(expressionFile, '\t') else: expressionTable = calculatePromoterActivity(annotationFile, bamFile, projectName, projectFolder, 
refseqToNameDict, background) expCutoff = int(options.expCutoff) constExtension = int(options.extension) enhancerNumber = options.Enumber if options.Enumber != 'super': enhancerNumber = options.Enumber else: enhancerNumber = 'super' promoter = options.promoter additionalTFs = options.tfs number = options.number annotTable = utils.parseTable(annotationFile, '\t') for line in annotTable: gid = line[1] genename = upper(line[12]) refseqToNameDict[gid] = genename ### # Now run all the functions ### enhancerLoci = createEnhancerLoci(enhancerTable, enhancerNumber) expressedNM, expressionDictNM = createExpressionDict(annotationFile, projectFolder, projectName, refseqToNameDict, expCutoff,expressionFile) TFtoEnhancerDict = findCanidateTFs(annotationFile, enhancerLoci, expressedNM, expressionDictNM, bamFile, TFlist, refseqToNameDict, projectFolder, projectName, promoter) # print TFtoEnhancerDict # sys.exit() formatOutput(TFtoEnhancerDict, refseqToNameDict, projectName, projectFolder) canidateGenes = [upper(refseqToNameDict[x]) for x in TFtoEnhancerDict.keys()] if additionalTFs: for tf in additionalTFs.split(','): canidateGenes.append(tf) canidateGenes = utils.uniquify(canidateGenes) print canidateGenes if subpeakFile == None: subpeakFile = findValleys(TFtoEnhancerDict, bamFile, projectName, projectFolder, cutoff = 0.2) generateSubpeakFASTA(TFtoEnhancerDict, subpeakFile, genomeDirectory, projectName, projectFolder, constExtension) subpeakFile = projectFolder + projectName + '_SUBPEAKS.fa' findMotifs(canidateGenes, projectFolder, projectName, motifConvertFile, motifDatabaseFile) graph = buildGraph(projectFolder, projectName, motifConvertFile, refseqToNameDict, canidateGenes) formatNetworkOutput(graph, projectFolder, projectName, canidateGenes) else: parser.print_help() sys.exit()
def geneToEnhancerDict(genome, enhancer_file, activity_path): ''' Assign each Super-Enhancer to the closest active TSS to its center Return a dictionary keyed by TF that points to a list of loci ''' print('Identifying enhancers and target genes from %s' %(enhancer_file)) #should this do gene assignment???? #for now assume gene assignment has been done #can later toggle to do gene assignment #first load the TF lists tf_table = utils.parseTable(genome.returnFeature('tf_file'), '\t') motif_table = utils.parseTable(genome.returnFeature('motif_convert'),'\t') #this gives all tfs that have a motif motif_tfs = utils.uniquify([line[1] for line in motif_table]) #intersect w/ the activity table if len(activity_path) > 0: activity_table = utils.parseTable(activity_path,'\t') #figure out the right column for actual gene names (basically not NM or NR and not a numeral) for i in range(len(activity_table[0])): # try: # foo = int(activity_table[0][i]) # except ValueError: # case where it is not an integer if activity_table[0][i][0:2] != 'NM' and activity_table[0][i][0:2] != 'NR': #assumes refseq gene_col = i break print('using column %s of %s gene activity table for common names' % (gene_col + 1, activity_path)) active_gene_list = [string.upper(line[gene_col]) for line in activity_table] tf_list_refseq = [line[0] for line in tf_table if active_gene_list.count(line[1]) > 0 and motif_tfs.count(line[1]) > 0] tf_list_name = utils.uniquify([line[1] for line in tf_table if active_gene_list.count(line[1]) > 0 and motif_tfs.count(line[1]) > 0]) else: tf_list_refseq = [line[0] for line in tf_table if motif_tfs.count(line[1]) >0] tf_list_name = [line[1] for line in tf_table if motif_tfs.count(line[1]) >0] print('Identified %s TFs from %s that have motifs' % (len(tf_list_name),genome.returnFeature('tf_file'))) #keyed by gene with loci objects in the list gene_to_enhancer_dict = defaultdict(list) enhancer_to_gene_dict = defaultdict(list) #assuming id,chrom,start,stop w/ gene names in the last 3 columns per standard ROSE output enhancer_table = utils.parseTable(enhancer_file,'\t') print('Analyzing %s cis-regulatory regions' % (len(enhancer_table))) #now let's make the enhancer table by region and then by gene enhancerTable = [['ENHANCER_ID','CHROM','START','STOP','GENE_LIST']] enhancerTFTable = [['ENHANCER_ID','CHROM','START','STOP','GENE_LIST']] geneTable = [['GENE','TF','CHROM','START','STOP','ENHANCER_ID']] geneTFTable = [['GENE','CHROM','START','STOP','ENHANCER_ID']] geneSummaryTable = [['GENE','TF','ENHANCER_LIST']] #will need to track which ones are TFs candidate_tf_list = [] #find the columns for gene assignment header = enhancer_table[0] header_length = len(enhancer_table[0]) closest_index = header.index('CLOSEST_GENE') proximal_index = header.index('PROXIMAL_GENES') overlap_index = header.index('OVERLAP_GENES') for line in enhancer_table[1:]: if len(line) != header_length: #don't bother trying to figure out lines w/o target genes continue enhancer_locus = utils.Locus(line[1],line[2],line[3],'.',line[0]) closest_gene_list = line[closest_index].split(',') proximal_gene_list = line[proximal_index].split(',') overlap_gene_list = line[overlap_index].split(',') all_gene_list = closest_gene_list + proximal_gene_list + overlap_gene_list all_gene_list = [string.upper(gene) for gene in all_gene_list] #print(all_gene_list) #print(activity_path) #print(active_gene_list) #gets a unique list of all tfs if len(activity_path) > 0: all_gene_list = utils.uniquify([gene for gene in all_gene_list if active_gene_list.count(gene) 
> 0]) else: all_gene_list = utils.uniquify(all_gene_list) candidate_gene_list = utils.uniquify([gene for gene in all_gene_list if tf_list_name.count(gene) > 0]) if len(all_gene_list) > 0: for gene in all_gene_list: gene_to_enhancer_dict[gene].append(enhancer_locus) enhancer_to_gene_dict[enhancer_locus].append(gene) newLine = line[0:4] + [','.join(all_gene_list)] else: newLine = line[0:4] + [''] enhancerTable.append(newLine) if len(candidate_gene_list) > 0: tfLine = line[0:4] + [','.join(candidate_gene_list)] enhancerTFTable.append(tfLine) #now iterate through each gene and list the enhancers gene_list = gene_to_enhancer_dict.keys() print(gene_list) gene_list.sort() for gene in gene_list: if tf_list_name.count(gene) > 0: tf_status = 1 candidate_tf_list.append(gene) else: tf_status = 0 enhancer_loci = gene_to_enhancer_dict[gene] enhancerString =','.join([enhancer.ID() for enhancer in enhancer_loci]) geneSummaryTable.append([gene,tf_status,enhancerString]) for enhancer in enhancer_loci: newLine = [gene,tf_status,enhancer.chr(),enhancer.start(),enhancer.end(),enhancer.ID()] geneTable.append(newLine) if tf_status == 1: newLine = [gene,enhancer.chr(),enhancer.start(),enhancer.end(),enhancer.ID()] geneTFTable.append(newLine) return geneTable,geneTFTable,enhancerTable,enhancerTFTable,geneSummaryTable,candidate_tf_list,gene_to_enhancer_dict
def finishRankOutput(dataFile, rankOutput, genome, mergeFolder, mergeName, name1, name2, cutOff=1.5, window=100000): """ cleans up the rank output table makes a gff of all of the gained/lost supers beyond a certain cutoff w/ a window makes a list of gained genes and lost genes makes a bed of gained loss """ dataDict = pipeline_dfci.loadDataTable(dataFile) # making sure window and cutoff are int/float cutOff = float(cutOff) window = int(window) genome = string.upper(genome) # make the output folder outputFolder = pipeline_dfci.formatFolder(mergeFolder + "output/", True) # bring in the old rank table rankEnhancerTable = utils.parseTable(rankOutput, "\t") # make a new formatted table header = rankEnhancerTable[0] header[-4] = "DELTA RANK" header[-3] = "IS_SUPER" formattedRankTable = [header] # the gffs gainedGFF = [] lostGFF = [] gainedWindowGFF = [] lostWindowGFF = [] # the beds gainedTrackHeader = ( 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome, name2, genome, name2, name1) ) gainedBed = [[gainedTrackHeader]] conservedTrackHeader = ( 'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome, name1, name2, genome, name1, name2) ) conservedBed = [[conservedTrackHeader]] lostTrackHeader = ( 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome, name1, genome, name1, name2) ) lostBed = [[lostTrackHeader]] # the genes geneTable = [ [ "GENE", "ENHANCER_ID", "ENHANCER_CHROM", "ENHANCER_START", "ENHANCER_STOP", header[6], header[7], header[8], "STATUS", ] ] for line in rankEnhancerTable[1:]: # fixing the enhancer ID line[0] = line[0].replace("_lociStitched", "") formattedRankTable.append(line) # getting the genes geneList = [] geneList += line[9].split(",") geneList += line[10].split(",") geneList += line[11].split(",") geneList = [x for x in geneList if len(x) > 0] geneList = utils.uniquify(geneList) geneString = string.join(geneList, ",") bedLine = [line[1], line[2], line[3], line[0], line[-4]] # for gained if float(line[6]) > cutOff: gffLine = [line[1], line[0], "", line[2], line[3], "", ".", "", geneString] gffWindowLine = [ line[1], line[0], "", int(line[2]) - window, int(line[3]) + window, "", ".", "", geneString, ] gainedGFF.append(gffLine) gainedWindowGFF.append(gffWindowLine) geneStatus = name2 gainedBed.append(bedLine) # for lost elif float(line[6]) < (-1 * cutOff): gffLine = [line[1], line[0], "", line[2], line[3], "", ".", "", geneString] gffWindowLine = [ line[1], line[0], "", int(line[2]) - window, int(line[3]) + window, "", ".", "", geneString, ] lostGFF.append(gffLine) lostWindowGFF.append(gffWindowLine) geneStatus = name1 lostBed.append(bedLine) # for conserved else: geneStatus = "CONSERVED" conservedBed.append(bedLine) # now fill in the gene Table for gene in geneList: geneTableLine = [gene, line[0], line[1], line[2], line[3], line[6], line[7], line[8], geneStatus] geneTable.append(geneTableLine) # concat the bed fullBed = gainedBed + conservedBed + lostBed # start writing the output # there's the two gffs, the bed,the formatted table, the gene table # formatted table formattedFilename = "%s%s_%s_MERGED_SUPERS_RANK_TABLE.txt" % (outputFolder, genome, mergeName) utils.unParseTable(formattedRankTable, formattedFilename, "\t") # gffs gffFolder = pipeline_dfci.formatFolder(outputFolder + "gff/", True) gffFilename_gained = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff" 
% (gffFolder, genome, mergeName, string.upper(name2)) gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff" % ( gffFolder, genome, mergeName, string.upper(name2), window / 1000, window / 1000, ) gffFilename_lost = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff" % (gffFolder, genome, mergeName, string.upper(name1)) gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff" % ( gffFolder, genome, mergeName, string.upper(name1), window / 1000, window / 1000, ) utils.unParseTable(gainedGFF, gffFilename_gained, "\t") utils.unParseTable(gainedWindowGFF, gffFilenameWindow_gained, "\t") utils.unParseTable(lostGFF, gffFilename_lost, "\t") utils.unParseTable(lostWindowGFF, gffFilenameWindow_lost, "\t") # bed bedFilename = "%s%s_%s_MERGED_SUPERS.bed" % (outputFolder, genome, mergeName) utils.unParseTable(fullBed, bedFilename, "\t") # geneTable geneFilename = "%s%s_%s_MERGED_SUPERS_GENE_TABLE.txt" % (outputFolder, genome, mergeName) utils.unParseTable(geneTable, geneFilename, "\t") # finally, move all of the plots to the output folder cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_SUPERS_DELTA.pdf" % (mergeFolder, name1, outputFolder, genome, mergeName) os.system(cmd) cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_SUPERS_RANK_PLOT.png" % ( mergeFolder, name1, outputFolder, genome, mergeName, ) os.system(cmd) # now execute the bamPlot_turbo.py commands bam1 = dataDict[name1]["bam"] bam2 = dataDict[name2]["bam"] bamString = "%s,%s" % (bam1, bam2) nameString = "%s,%s" % (name1, name2) colorString = "0,0,0:100,100,100" # change dir os.chdir("/ark/home/cl512/pipeline/") if len(gainedGFF) > 0: # gained command plotTitle = "%s_ONLY_SE" % (name2) cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % ( genome, bamString, gffFilename_gained, outputFolder, nameString, colorString, plotTitle, ) os.system(cmd) # gained window command plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name2, window / 1000) cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % ( genome, bamString, gffFilenameWindow_gained, outputFolder, nameString, colorString, plotTitle, ) os.system(cmd) if len(lostGFF) > 0: # lost command plotTitle = "%s_ONLY_SE" % (name1) cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % ( genome, bamString, gffFilename_lost, outputFolder, nameString, colorString, plotTitle, ) os.system(cmd) # lost command plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name1, window / 1000) cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % ( genome, bamString, gffFilenameWindow_lost, outputFolder, nameString, colorString, plotTitle, ) os.system(cmd) return
def findCanidateTFs(genome, enhancer_gff, expressedNM, expressionDictNM, bamFile, TFlist, refseqToNameDict, projectFolder, projectName, promoter): ''' Assign each Super-Enhancer to the closest active TSS to its center Return a dictionary keyed by TF that points to a list of loci ''' #loading in the enhancer gff regions enhancer_collection = utils.gffToLocusCollection(enhancer_gff) enhancer_loci = enhancer_collection.getLoci() #loading in the genome and TF info annot_file = genome.returnFeature('annot_file') startDict = utils.makeStartDict(annot_file) tf_table = utils.parseTable(genome.returnFeature('tf_file'),'\t') refID_list = [line[0] for line in tf_table] #creates a list of all NM IDs for TFs #make a collection of all TF TSSs tssLoci = [] for refID in refID_list: tssLoci.append(utils.makeTSSLocus(refID,startDict,0,0)) #this is a precise 1 coordinate TSS locus tssCollection = utils.LocusCollection(tssLoci,50) enhancerTable = [['ENHANCER_ID','CHROM','START','STOP','GENE_LIST']] gene_to_enhancer_dict = defaultdict(list) # Loop through enhancers #all gene nnames stored by refID for enhancer in enhancer_loci: # If the enhancer overlaps a TSS, save it overlapping_loci = tssCollection.getOverlap(enhancer, 'both') overlapping_refIDs =[locus.ID() for locus in overlapping_loci] # Find all gene TSS within 100 kb proximal_loci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer,100000,100000),'both') proximal_refIDs =[locus.ID() for locus in proximal_loci] # If no genes are within 100 kb, find the closest active gene within 1 million bp closest_refID = [] if len(overlapping_refIDs) == 0 and len(proximal_refIDs) == 0: distal_loci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer,1000000,1000000),'both') distal_refIDs =[locus.ID() for locus in distal_loci] enhancerCenter = (int(enhancer.start()) + int(enhancer.end())) / 2 distance_list = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in distal_refIDs] if len(distance_list) > 0: closest_refID = [distalGenes[distance_list.index(min(distance_list))]] #now we have all potential gene cases all_refIDs = overlappingGenes + proximalGenes + closest_refID #now we get all names and refIDs all_refIDs = utils.uniquify([refID for refID in all_refIDs if len(refID) > 0 ]) all_names = utils.uniquify([startDict[refID]['name'] for refID in all_refIDs]) #first do enhancer level assignment names_string = ','.join(all_names) enhancer_table.append([enhancer.ID(),enhancer.chr(),enhancer.start(),enhancer.end(),names_string]) #now do gene level assignment for refID in all_refIDs: gene_to_enhancer_dict[refID].append(enhancer.ID()) #an enhancer can be assigned to multiple genes #a promoter can only be assigned to 1 gene #promoters don't have enhancerIDs so don't add them yet #this should just be an enhancer level table #followed by a gene level table overlappingGenes = utils.uniquify(overlappingGenes) proximalGenes = utils.uniquify(proximalGenes) for refID in overlappingGenes: if proximalGenes.count(refID) == 1: proximalGenes.remove(refID) # If a TSS overlaps an enhancer, assign them together if overlappingGenes: for gene in overlappingGenes: if gene in tf_list: TFtoEnhancerDict[gene].append(enhancer) enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()]) # Otherwise, assign the enhancer to the most active gene in 100 kb elif not overlappingGenes and proximalGenes: highestGene = '' highestActivity = 0 for gene in proximalGenes: if expressionDictNM[gene] > highestActivity: highestActivity = 
expressionDictNM[gene] highestGene = gene if highestGene in TFlist: TFtoEnhancerDict[gene].append(enhancer) enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()]) elif not overlappingGenes and not proximalGenes and closestGene: if closestGene in TFlist: gene = closestGene TFtoEnhancerDict[gene].append(enhancer) enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()]) # Add promoter is it's not contained in the super if promoter: for gene in TFtoEnhancerDict.keys(): promoter = utils.Locus(startDict[gene]['chr'], int(startDict[gene]['start'][0]) - 2000, int(startDict[gene]['start'][0]) + 2000, startDict[gene]['sense']) overlapBool = False for enhancer in TFtoEnhancerDict[gene]: if promoter.overlaps(enhancer): overlapBool = True if not overlapBool: TFtoEnhancerDict[gene].append(promoter) seAssignmentFile = projectFolder + projectName + '_ENHANCER_ASSIGNMENT.txt' utils.unParseTable(enhancerAssignment, seAssignmentFile, '\t') return TFtoEnhancerDict
def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False): ''' maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq ''' startDict = utils.makeStartDict(annotFile) enhancerName = enhancerFile.split('/')[-1].split('.')[0] enhancerTable = utils.parseTable(enhancerFile, '\t') # internal parameter for debugging byRefseq = False if len(transcribedFile) > 0: transcribedTable = utils.parseTable(transcribedFile, '\t') transcribedGenes = [line[1] for line in transcribedTable] else: transcribedGenes = startDict.keys() print('MAKING TRANSCRIPT COLLECTION') transcribedCollection = utils.makeTranscriptCollection( annotFile, 0, 0, 500, transcribedGenes) print('MAKING TSS COLLECTION') tssLoci = [] for geneID in transcribedGenes: tssLoci.append(utils.makeTSSLocus(geneID, startDict, 0, 0)) # this turns the tssLoci list into a LocusCollection # 50 is the internal parameter for LocusCollection and doesn't really # matter tssCollection = utils.LocusCollection(tssLoci, 50) geneDict = {'overlapping': defaultdict( list), 'proximal': defaultdict(list)} # dictionaries to hold ranks and superstatus of gene nearby enhancers rankDict = defaultdict(list) superDict = defaultdict(list) # list of all genes that appear in this analysis overallGeneList = [] # find the damn header for line in enhancerTable: if line[0][0] == '#': continue else: header = line break if noFormatTable: # set up the output tables # first by enhancer enhancerToGeneTable = [ header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']] else: # set up the output tables # first by enhancer enhancerToGeneTable = [ header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]] # next by gene geneToEnhancerTable = [ ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']] # next make the gene to enhancer table geneToEnhancerTable = [ ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']] for line in enhancerTable: if line[0][0] == '#' or line[0][0] == 'R': continue enhancerString = '%s:%s-%s' % (line[1], line[2], line[3]) enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0]) # overlapping genes are transcribed genes whose transcript is directly # in the stitchedLocus overlappingLoci = transcribedCollection.getOverlap( enhancerLocus, 'both') overlappingGenes = [] for overlapLocus in overlappingLoci: overlappingGenes.append(overlapLocus.ID()) # proximalGenes are transcribed genes where the tss is within 50kb of # the boundary of the stitched loci proximalLoci = tssCollection.getOverlap( utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both') proximalGenes = [] for proxLocus in proximalLoci: proximalGenes.append(proxLocus.ID()) distalLoci = tssCollection.getOverlap( utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both') distalGenes = [] for proxLocus in distalLoci: distalGenes.append(proxLocus.ID()) overlappingGenes = utils.uniquify(overlappingGenes) proximalGenes = utils.uniquify(proximalGenes) distalGenes = utils.uniquify(distalGenes) allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes # these checks make sure each gene list is unique. 
# technically it is possible for a gene to be overlapping, but not proximal since the # gene could be longer than the 50kb window, but we'll let that slide # here for refID in overlappingGenes: if proximalGenes.count(refID) == 1: proximalGenes.remove(refID) for refID in proximalGenes: if distalGenes.count(refID) == 1: distalGenes.remove(refID) # Now find the closest gene if len(allEnhancerGenes) == 0: closestGene = '' else: # get enhancerCenter enhancerCenter = (int(line[2]) + int(line[3])) / 2 # get absolute distance to enhancer center distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes] # get the ID and convert to name closestGene = startDict[ allEnhancerGenes[distList.index(min(distList))]]['name'] # NOW WRITE THE ROW FOR THE ENHANCER TABLE if noFormatTable: newEnhancerLine = list(line) newEnhancerLine.append( join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ',')) newEnhancerLine.append( join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ',')) newEnhancerLine.append(closestGene) else: newEnhancerLine = line[0:9] newEnhancerLine.append( join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ',')) newEnhancerLine.append( join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ',')) newEnhancerLine.append(closestGene) newEnhancerLine += line[-2:] enhancerToGeneTable.append(newEnhancerLine) # Now grab all overlapping and proximal genes for the gene ordered # table overallGeneList += overlappingGenes for refID in overlappingGenes: geneDict['overlapping'][refID].append(enhancerString) rankDict[refID].append(int(line[-2])) superDict[refID].append(int(line[-1])) overallGeneList += proximalGenes for refID in proximalGenes: geneDict['proximal'][refID].append(enhancerString) rankDict[refID].append(int(line[-2])) superDict[refID].append(int(line[-1])) # End loop through # Make table by gene print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION') overallGeneList = utils.uniquify(overallGeneList) enhancerGeneCollection = utils.makeTranscriptCollection( annotFile, 5000, 5000, 500, overallGeneList) enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection) # dump the gff to file enhancerFolder = utils.getParentFolder(enhancerFile) gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome) enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName,gffRootName) utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t') # now we need to run bamToGFF # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail. bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py' if not os.path.isfile(bamliquidator_path): bamliquidator_path = 'bamliquidator_batch.py' if not os.path.isfile(bamliquidator_path): raise ValueError('bamliquidator_batch.py not found in path') print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS') # map density at genes in the +/- 5kb tss region # first on the rankBy bam bamName = rankByBamFile.split('/')[-1] mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName,gffRootName, bamName) mappedRankByFile = "%s%s_%s_%s/matrix.gff" % (enhancerFolder,enhancerName, gffRootName, bamName) cmd = 'python ' + bamliquidator_path + ' --sense . 
-e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedRankByFolder,rankByBamFile) print("Mapping rankby bam %s" % (rankByBamFile)) print(cmd) outputRank = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) outputRank = outputRank.communicate() if len(outputRank[0]) > 0: # test if mapping worked correctly print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile)) else: print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile)) sys.exit() # next on the control bam if it exists if len(controlBamFile) > 0: controlName = controlBamFile.split('/')[-1] mappedControlFolder = "%s%s_%s_%s/" % ( enhancerFolder, enhancerName,gffRootName, controlName) mappedControlFile = "%s%s_%s_%s/matrix.gff" % ( enhancerFolder, enhancerName,gffRootName, controlName) cmd = 'python ' + bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedControlFolder,controlBamFile) print("Mapping control bam %s" % (controlBamFile)) print(cmd) outputControl = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) outputControl = outputControl.communicate() if len(outputControl[0]) > 0: # test if mapping worked correctly print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile)) else: print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile)) sys.exit() # now get the appropriate output files if len(controlBamFile) > 0: print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" % (mappedRankByFile, mappedControlFile)) if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1): print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES') signalDict = makeSignalDict(mappedRankByFile, mappedControlFile) else: print("NO MAPPING OUTPUT DETECTED") sys.exit() else: print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile)) if utils.checkOutput(mappedRankByFile, 1, 30): print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES') signalDict = makeSignalDict(mappedRankByFile) else: print("NO MAPPING OUTPUT DETECTED") sys.exit() # use enhancer rank to order rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList]) usedNames = [] # make a new dict to hold TSS signal by max per geneName geneNameSigDict = defaultdict(list) print('MAKING GENE TABLE') for i in rankOrder: refID = overallGeneList[i] geneName = startDict[refID]['name'] if usedNames.count(geneName) > 0 and uniqueGenes == True: continue else: usedNames.append(geneName) proxEnhancers = geneDict['overlapping'][ refID] + geneDict['proximal'][refID] superStatus = max(superDict[refID]) enhancerRanks = join([str(x) for x in rankDict[refID]], ',') enhancerSignal = signalDict[refID] geneNameSigDict[geneName].append(enhancerSignal) newLine = [geneName, refID, join( proxEnhancers, ','), enhancerRanks, superStatus, enhancerSignal] geneToEnhancerTable.append(newLine) #utils.unParseTable(geneToEnhancerTable,'/grail/projects/newRose/geneMapper/foo.txt','\t') print('MAKING ENHANCER TO TOP GENE TABLE') if noFormatTable: enhancerToTopGeneTable = [ enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']] else: enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [ 'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]] for line in enhancerToGeneTable[1:]: geneList = [] if noFormatTable: geneList += line[-3].split(',') geneList += line[-2].split(',') else: geneList += line[10].split(',') geneList += line[11].split(',') geneList = utils.uniquify([x for x in geneList if len(x) > 
0]) if len(geneList) > 0: try: sigVector = [max(geneNameSigDict[x]) for x in geneList] maxIndex = sigVector.index(max(sigVector)) maxGene = geneList[maxIndex] maxSig = sigVector[maxIndex] if maxSig == 0.0: maxGene = 'NONE' maxSig = 'NONE' except ValueError: if len(geneList) == 1: maxGene = geneList[0] maxSig = 'NONE' else: maxGene = 'NONE' maxSig = 'NONE' else: maxGene = 'NONE' maxSig = 'NONE' if noFormatTable: newLine = line + [maxGene, maxSig] else: newLine = line[0:12] + [maxGene, maxSig] + line[-2:] enhancerToTopGeneTable.append(newLine) # resort enhancerToGeneTable if noFormatTable: return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable else: enhancerOrder = utils.order([int(line[-2]) for line in enhancerToGeneTable[1:]]) sortedTable = [enhancerToGeneTable[0]] sortedTopGeneTable = [enhancerToTopGeneTable[0]] for i in enhancerOrder: sortedTable.append(enhancerToGeneTable[(i + 1)]) sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)]) return sortedTable, sortedTopGeneTable, geneToEnhancerTable
def makeGeneTable(peakTable, analysisName): ''' takes the peak table and makes a gene centric table ''' geneDict = {} geneTable = [[ 'GENE', '%s_TSS_SIGNAL' % (analysisName), '%s_DISTAL_SIGNAL' % (analysisName) ]] #now iterate through the table for line in peakTable[1:]: regionLength = int(line[4]) signal = float(line[9]) * regionLength #genes where this particular peak overlaps the tss 1kb window #where there are both overlap and proximal meet if len(line) == 15: overlapGeneList = [ gene for gene in line[-2].split(',') if len(gene) > 0 ] if overlapGeneList.count('107'): print(line) sys.exit() for overlapGene in overlapGeneList: if geneDict.has_key(overlapGene) == False: geneDict[overlapGene] = {'tss': 0.0, 'distal': 0.0} #there can be a nasty 1 overlap case where the region might overlap by the overlapping gene list, but not be real if int(line[5]) == 1: geneDict[overlapGene]['tss'] += signal else: #this is the case where the mycn site is just outside of the promoter or overlapping the gene locus/body these are rar geneDict[overlapGene]['distal'] += signal proximalGeneList = [ gene for gene in line[-1].split(',') if len(gene) > 0 ] for proximalGene in proximalGeneList: if geneDict.has_key(proximalGene) == False: geneDict[proximalGene] = {'tss': 0.0, 'distal': 0.0} if int(line[5]) == 0: geneDict[proximalGene]['distal'] += signal #where there's just overlap if len(line) == 14: overlapGeneList = [ gene for gene in line[-1].split(',') if len(gene) > 0 ] if overlapGeneList.count('107'): print(line) sys.exit() for overlapGene in overlapGeneList: if geneDict.has_key(overlapGene) == False: geneDict[overlapGene] = {'tss': 0.0, 'distal': 0.0} #there can be a nasty 1 overlap case where the region might overlap by the overlapping gene list, but not be real if int(line[5]) == 1: geneDict[overlapGene]['tss'] += signal else: #this is the case where the mycn site is just outside of the promoter or overlapping the gene locus/body these are rar geneDict[overlapGene]['distal'] += signal geneList = geneDict.keys() geneList = utils.uniquify(geneList) geneList.sort() for gene in geneList: newLine = [gene] newLine.append(geneDict[gene]['tss']) newLine.append(geneDict[gene]['distal']) geneTable.append(newLine) return geneTable
def makeNESTable(nes_path_list,names_list,output =''): ''' combines the GSEA NES output from the enhancerPromoter analysis creates a table of all represented gene sets ''' if len(nes_path_list) != len(names_list): print('please provide the same number of nes table paths and sample names') sys.exit() #nested dictionaries gsea['pathway']['name][nes,fdr] gsea_dict = defaultdict(dict) pathway_list = [] #iterate once to just get all potential pathways for i in range(len(nes_path_list)): nes_path = nes_path_list[i] nes = utils.parseTable(nes_path,'\t') for line in nes[1:]: pathway_list.append(line[0]) pathway_list = utils.uniquify(pathway_list) pathway_list.sort() #now blank the dictionary w defaul NES of 0 and FDR of 1 for pathway in pathway_list: for name in names_list: gsea_dict[pathway][name] = [0.0,1.0] #now loop again to fill out properly for i in range(len(nes_path_list)): nes_path = nes_path_list[i] name = names_list[i] nes = utils.parseTable(nes_path,'\t') for line in nes[1:]: if line[2] == 'NA': continue try: nes_vector = [float(line[2]),float(line[3])] except ValueError: print(line) print(nes_path) print(name) sys.exit() pathway = line[0] gsea_dict[pathway][name] = nes_vector #set up the output table header = ['PATHWAY'] for name in names_list: header += ['%s_NES' % (name),'%s_FDR' % (name)] nes_table = [header] for pathway in pathway_list: nes_line = [pathway] for name in names_list: nes_line += gsea_dict[pathway][name] nes_table.append(nes_line) if len(output) != 0: utils.unParseTable(nes_table,output,'\t') else: return nes_table
def finishRankOutput(dataFile, rankOutput, genome, mergeFolder, mergeName, name1, name2, cutOff=1.5, window=100000, superOnly=True, plotBam=True):

    '''
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained/lost regions
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    # making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    genome = string.upper(genome)

    # make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder + 'output/', True)

    # bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput, '\t')

    # make a new formatted table
    header = rankEnhancerTable[0]
    header[-4] = 'DELTA RANK'
    header[-3] = 'IS_SUPER'
    formattedRankTable = [header]

    # the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    if superOnly:
        enhancerType = 'SUPERS'
    else:
        enhancerType = 'ENHANCERS'

    # the beds
    if superOnly:
        gainedTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome, name2, genome, name2, name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome, name1, name2, genome, name1, name2)
        conservedBed = [[conservedTrackHeader]]
        lostTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome, name1, genome, name1, name2)
        lostBed = [[lostTrackHeader]]
    else:
        gainedTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome, name2, genome, name2, name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s enhancers" description="%s enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome, name1, name2, genome, name1, name2)
        conservedBed = [[conservedTrackHeader]]
        lostTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome, name1, genome, name1, name2)
        lostBed = [[lostTrackHeader]]

    # the genes
    geneTable = [['GENE', 'ENHANCER_ID', 'ENHANCER_CHROM', 'ENHANCER_START', 'ENHANCER_STOP', header[6], header[7], header[8], 'STATUS']]

    for line in rankEnhancerTable[1:]:
        # fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched', '')
        formattedRankTable.append(line)

        # getting the genes
        geneList = []
        geneList += line[9].split(',')
        geneList += line[10].split(',')
        geneList += line[11].split(',')
        geneList = [x for x in geneList if len(x) > 0]
        geneList = utils.uniquify(geneList)
        geneString = string.join(geneList, ',')

        bedLine = [line[1], line[2], line[3], line[0], line[-4]]

        # for gained
        if float(line[6]) > cutOff:
            gffLine = [line[1], line[0], '', line[2], line[3], '', '.', '', geneString]
            gffWindowLine = [line[1], line[0], '', int(line[2]) - window, int(line[3]) + window, '', '.', '', geneString]
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)
        # for lost
        elif float(line[6]) < (-1 * cutOff):
            gffLine = [line[1], line[0], '', line[2], line[3], '', '.', '', geneString]
            gffWindowLine = [line[1], line[0], '', int(line[2]) - window, int(line[3]) + window, '', '.', '', geneString]
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)
        # for conserved
        else:
            geneStatus = 'CONSERVED'
            conservedBed.append(bedLine)

        # now fill in the gene table
        for gene in geneList:
            geneTableLine = [gene, line[0], line[1], line[2], line[3], line[6], line[7], line[8], geneStatus]
            geneTable.append(geneTableLine)

    # concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    # start writing the output
    # there are two gffs, the bed, the formatted table, and the gene table

    # formatted table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (outputFolder, genome, mergeName, enhancerType)
    utils.unParseTable(formattedRankTable, formattedFilename, '\t')

    # gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder + 'gff/', True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder, genome, mergeName, string.upper(name2), enhancerType)
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder, genome, mergeName, string.upper(name2), enhancerType, window / 1000, window / 1000)

    gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder, genome, mergeName, string.upper(name1), enhancerType)
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder, genome, mergeName, string.upper(name1), enhancerType, window / 1000, window / 1000)

    utils.unParseTable(gainedGFF, gffFilename_gained, '\t')
    utils.unParseTable(gainedWindowGFF, gffFilenameWindow_gained, '\t')
    utils.unParseTable(lostGFF, gffFilename_lost, '\t')
    utils.unParseTable(lostWindowGFF, gffFilenameWindow_lost, '\t')

    # bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder, genome, mergeName, enhancerType)
    utils.unParseTable(fullBed, bedFilename, '\t')

    # geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (outputFolder, genome, mergeName, enhancerType)
    utils.unParseTable(geneTable, geneFilename, '\t')

    # finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_%s_DELTA.pdf" % (mergeFolder, name1, outputFolder, genome, mergeName, enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_%s_RANK_PLOT.png" % (mergeFolder, name1, outputFolder, genome, mergeName, enhancerType)
    os.system(cmd)

    # now execute the bamPlot_turbo.py commands
    if plotBam:
        bam1 = dataDict[name1]['bam']
        bam2 = dataDict[name2]['bam']
        bamString = "%s,%s" % (bam1, bam2)
        nameString = "%s,%s" % (name1, name2)
        colorString = "0,0,0:100,100,100"

        # change into the pipeline directory so bamPlot_turbo.py can be found
        os.chdir(pipelineDir)

        if len(gainedGFF) > 0:
            # gained command
            plotTitle = "%s_ONLY_SE" % (name2)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome, bamString, gffFilename_gained, outputFolder, nameString, colorString, plotTitle)
            os.system(cmd)

            # gained window command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name2, window / 1000)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome, bamString, gffFilenameWindow_gained, outputFolder, nameString, colorString, plotTitle)
            os.system(cmd)

        if len(lostGFF) > 0:
            # lost command
            plotTitle = "%s_ONLY_SE" % (name1)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome, bamString, gffFilename_lost, outputFolder, nameString, colorString, plotTitle)
            os.system(cmd)

            # lost window command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name1, window / 1000)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome, bamString, gffFilenameWindow_lost, outputFolder, nameString, colorString, plotTitle)
            os.system(cmd)

    return
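# Usage sketch (illustration only, not part of the original pipeline): a
# minimal example of how finishRankOutput might be called after a two-sample
# ROSE merge. Every path and sample name below is a hypothetical placeholder;
# real values come from the surrounding pipeline's data table and merge step.
#
# finishRankOutput(
#     dataFile='/path/to/data_table.txt',           # hypothetical pipeline_dfci data table
#     rankOutput='/path/to/merge/rank_output.txt',  # hypothetical rank table produced upstream
#     genome='hg18',
#     mergeFolder='/path/to/merge/',
#     mergeName='K27AC_MERGE',
#     name1='CONTROL_K27AC',
#     name2='TREATED_K27AC',
#     cutOff=1.5,                                   # delta-rank cutoff for gained/lost calls
#     window=100000,                                # +/-100kb window gffs for plotting
#     superOnly=True,
#     plotBam=False,                                # skip the bamPlot_turbo.py calls in this sketch
# )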
def finishRankOutput(dataFile, statOutput, diffOutput, genome, mergeFolder, mergeName, name1, name2, namesList1, namesList2, cutOff=1.0, window=100000, superOnly=True, plotBam=True):

    '''
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained/lost regions
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    # making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    genome = string.upper(genome)

    # make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder + 'output/', True)

    # bring in the old rank table
    rankEnhancerTable = utils.parseTable(statOutput, '\t')

    # make a new formatted table
    header = rankEnhancerTable[0]
    formattedRankTable = [header]

    # the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    if superOnly:
        enhancerType = 'SUPERS'
    else:
        enhancerType = 'ENHANCERS'

    # the beds
    if superOnly:
        gainedTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome, name2, genome, name2, name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome, name1, name2, genome, name1, name2)
        conservedBed = [[conservedTrackHeader]]
        lostTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome, name1, genome, name1, name2)
        lostBed = [[lostTrackHeader]]
    else:
        gainedTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome, name2, genome, name2, name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s enhancers" description="%s enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome, name1, name2, genome, name1, name2)
        conservedBed = [[conservedTrackHeader]]
        lostTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome, name1, genome, name1, name2)
        lostBed = [[lostTrackHeader]]

    # the genes
    geneTable = [['GENE', 'ENHANCER_ID', 'ENHANCER_CHROM', 'ENHANCER_START', 'ENHANCER_STOP', header[6], header[7], header[8], 'STATUS']]

    headerLength = len(rankEnhancerTable[0])
    for line in rankEnhancerTable[1:]:
        # pad short lines out to the full header length
        if len(line) != headerLength:
            line += [''] * (headerLength - len(line))

        # fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched', '')
        formattedRankTable.append(line)

        # getting the genes
        geneList = []
        geneList += line[-1].split(',')
        geneList += line[-2].split(',')
        geneList += line[-3].split(',')
        geneList = [x for x in geneList if len(x) > 0]
        geneList = utils.uniquify(geneList)
        geneString = string.join(geneList, ',')

        bedLine = [line[1], line[2], line[3], line[0], line[-4]]

        # for gained
        # this applies both the statistical test chosen (default fdr <= 0.05) and the cutoff
        # the cutoff is hard wired, but we could add an option to change the test
        # stats are done in the R script; FDR normalization can behave poorly when no regions are considered differential
        # print(line)
        if float(line[-8]) > cutOff and int(line[-4]) == 1:
            gffLine = [line[1], line[0], '', line[2], line[3], '', '.', '', geneString]
            gffWindowLine = [line[1], line[0], '', int(line[2]) - window, int(line[3]) + window, '', '.', '', geneString]
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)
        # for lost
        elif float(line[-8]) < (-1 * cutOff) and int(line[-4]) == 1:
            gffLine = [line[1], line[0], '', line[2], line[3], '', '.', '', geneString]
            gffWindowLine = [line[1], line[0], '', int(line[2]) - window, int(line[3]) + window, '', '.', '', geneString]
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)
        # for conserved
        else:
            geneStatus = 'UNCHANGED'
            conservedBed.append(bedLine)

        # now fill in the gene table
        for gene in geneList:
            geneTableLine = [gene, line[0], line[1], line[2], line[3], line[6], line[7], line[8], geneStatus]
            geneTable.append(geneTableLine)

    # concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    # start writing the output
    # there are two gffs, the bed, the formatted table, and the gene table

    # formatted table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (outputFolder, genome, mergeName, enhancerType)
    utils.unParseTable(formattedRankTable, formattedFilename, '\t')

    # formatted diff table
    # possible that no genes are differential
    rankEnhancerDiffTable = utils.parseTable(diffOutput, '\t')

    # make a new formatted table
    header = rankEnhancerDiffTable[0]
    formattedRankDiffTable = [header]

    for line in rankEnhancerDiffTable[1:]:
        # fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched', '')
        formattedRankDiffTable.append(line)

    formattedDiffFilename = "%s%s_%s_MERGED_%s_RANK_DIFF_TABLE.txt" % (outputFolder, genome, mergeName, enhancerType)
    utils.unParseTable(formattedRankDiffTable, formattedDiffFilename, '\t')

    # gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder + 'gff/', True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder, genome, mergeName, string.upper(name2), enhancerType)
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder, genome, mergeName, string.upper(name2), enhancerType, window / 1000, window / 1000)

    gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder, genome, mergeName, string.upper(name1), enhancerType)
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder, genome, mergeName, string.upper(name1), enhancerType, window / 1000, window / 1000)

    utils.unParseTable(gainedGFF, gffFilename_gained, '\t')
    utils.unParseTable(gainedWindowGFF, gffFilenameWindow_gained, '\t')
    utils.unParseTable(lostGFF, gffFilename_lost, '\t')
    utils.unParseTable(lostWindowGFF, gffFilenameWindow_lost, '\t')

    # bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder, genome, mergeName, enhancerType)
    utils.unParseTable(fullBed, bedFilename, '\t')

    # geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (outputFolder, genome, mergeName, enhancerType)
    utils.unParseTable(geneTable, geneFilename, '\t')

    # finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*DELTA*.pdf %s%s_%s_MERGED_%s_DELTA.pdf" % (mergeFolder, namesList1[0], outputFolder, genome, mergeName, enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*REGION_GAINED*.pdf %s%s_%s_MERGED_%s_REGION_GAINED.pdf" % (mergeFolder, namesList1[0], outputFolder, genome, mergeName, enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*REGION_LOST*.pdf %s%s_%s_MERGED_%s_REGION_LOST.pdf" % (mergeFolder, namesList1[0], outputFolder, genome, mergeName, enhancerType)
    os.system(cmd)

    # NOTE: the original glob here repeated *REGION_LOST*.pdf, which looks like a
    # copy-paste slip; corrected to match the unchanged-region plot
    cmd = "cp %s%s_ROSE/*REGION_UNCHANGED*.pdf %s%s_%s_MERGED_%s_REGION_UNCHANGED.pdf" % (mergeFolder, namesList1[0], outputFolder, genome, mergeName, enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_%s_RANK_PLOT.png" % (mergeFolder, namesList1[0], outputFolder, genome, mergeName, enhancerType)
    os.system(cmd)

    # now execute the bamPlot_turbo.py commands
    if plotBam:
        bamList1 = [dataDict[name]['bam'] for name in namesList1]
        bamList2 = [dataDict[name]['bam'] for name in namesList2]
        bamList = bamList1 + bamList2
        bamString = string.join(bamList, ',')

        nameList = [name1] * len(namesList1) + [name2] * len(namesList2)
        nameString = string.join(nameList, ',')

        # debug output for sample names and colors
        print(namesList1[0])
        print(namesList2[0])
        print(namesList1)
        print(namesList2)
        print(dataDict[namesList1[0]]['color'])

        # use the data table colors when the two groups differ; otherwise default to black/grey
        if dataDict[namesList1[0]]['color'] != dataDict[namesList2[0]]['color']:
            colorList = [dataDict[namesList1[0]]['color']] * len(namesList1) + [dataDict[namesList2[0]]['color']] * len(namesList2)
        else:
            colorList = ['0,0,0'] * len(namesList1) + ['100,100,100'] * len(namesList2)
        colorString = string.join(colorList, ':')

        # bamPlot_turbo.py is invoked via its full path, so no chdir is needed here
        if len(gainedGFF) > 0:
            # gained command
            plotTitle = "%s_ONLY_SE" % (name2)
            cmd = 'python %sbamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MERGE' % (pipelineDir, genome, bamString, gffFilename_gained, outputFolder, nameString, colorString, plotTitle)
            os.system(cmd)

            # gained window command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name2, window / 1000)
            cmd = 'python %sbamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MERGE' % (pipelineDir, genome, bamString, gffFilenameWindow_gained, outputFolder, nameString, colorString, plotTitle)
            os.system(cmd)

        if len(lostGFF) > 0:
            # lost command
            plotTitle = "%s_ONLY_SE" % (name1)
            cmd = 'python %sbamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MERGE' % (pipelineDir, genome, bamString, gffFilename_lost, outputFolder, nameString, colorString, plotTitle)
            os.system(cmd)

            # lost window command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name1, window / 1000)
            cmd = 'python %sbamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MERGE' % (pipelineDir, genome, bamString, gffFilenameWindow_lost, outputFolder, nameString, colorString, plotTitle)
            os.system(cmd)

    return
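# Usage sketch (illustration only, with hypothetical file names): the
# stat-aware variant takes replicate name lists plus the stat and diff tables
# written by the R script; a region is called gained/lost only when the delta
# column exceeds cutOff AND the significance flag column equals 1.
#
# finishRankOutput(
#     dataFile='/path/to/data_table.txt',
#     statOutput='/path/to/merge/stat_output.txt',  # hypothetical per-region stats table from R
#     diffOutput='/path/to/merge/diff_output.txt',  # hypothetical differential-region table from R
#     genome='hg18',
#     mergeFolder='/path/to/merge/',
#     mergeName='K27AC_MERGE',
#     name1='CONTROL_K27AC',
#     name2='TREATED_K27AC',
#     namesList1=['CONTROL_K27AC_R1', 'CONTROL_K27AC_R2'],  # hypothetical replicate names
#     namesList2=['TREATED_K27AC_R1', 'TREATED_K27AC_R2'],
#     cutOff=1.0,
#     window=100000,
#     superOnly=True,
#     plotBam=False,
# )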