def makeFoldTable(annotFile,analysisName,testName,controlName,testMMR,controlMMR,testIdxFile,controlIdxFile,outputFolder,epsilon = 1):

    '''
    makes the fold table and writes to disk
    first column is guideID, second column is gene name, third is the log2 fold change
    '''

    guideDict,geneDict = makeAnnotDict(annotFile)

    testIdx = utils.parseTable(testIdxFile,'\t')
    controlIdx = utils.parseTable(controlIdxFile,'\t')

    #for each guide, divide the count by the MMR then add epsilon then take the log2 ratio
    outTable = [['GUIDE_ID','GENE','LOG2_RATIO',testName,controlName]]
    for i in range(len(testIdx)):
        guideID = testIdx[i][0]
        gene = guideDict[guideID]

        testCount = float(testIdx[i][2])/testMMR + epsilon
        controlCount = float(controlIdx[i][2])/controlMMR + epsilon

        log2Ratio = numpy.log2(testCount/controlCount)

        newLine = [guideID,gene,log2Ratio,round(testCount,4),round(controlCount,4)]
        outTable.append(newLine)

    outputFile = '%s%s_log2Ratio.txt' % (outputFolder,analysisName)
    utils.unParseTable(outTable,outputFile,'\t')
    return outputFile

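# A worked example of the normalization above, with made-up numbers (not from
# any real dataset): counts are scaled by the MMR normalization factor
# (reads per million), offset by epsilon to avoid log2 of zero, then compared
# as a log2 ratio.
#
#   testCount    = 500/10.0 + 1 = 51.0   (10 million mapped reads -> MMR = 10)
#   controlCount = 200/20.0 + 1 = 11.0   (20 million mapped reads -> MMR = 20)
#   log2Ratio    = numpy.log2(51.0/11.0) ~= 2.21
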
def mapBams(bamFileList,splitGFFPath,analysisName,mappedFolder):

    print("MAPPING TO THE FOLLOWING BAMS:")
    for bamFile in bamFileList:
        print(bamFile)
        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, analysisName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, analysisName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (splitGFFPath, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (splitGFFPath, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File,0.2,5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (splitGFFPath, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (splitGFFPath, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    #now we make a signal table
    #set up the table using the first bam
    if len(bamFileList) > 1:

        #set up the first pass at the table
        signalTable = [['REGION_ID','locusLine'] + [name.split('/')[-1] for name in bamFileList]]
        bamFileName = bamFileList[0].split('/')[-1]
        mappedTable = utils.parseTable('%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, analysisName, bamFileName),'\t')
        for i in range(1,len(mappedTable)):
            signalTable.append(mappedTable[i])

        for bamFile in bamFileList[1:]:
            bamFileName = bamFile.split('/')[-1]

            mappedTable = utils.parseTable('%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, analysisName, bamFileName),'\t')

            #walk down the rows of the mapped matrix and append each signal column
            for i in range(1,len(mappedTable)):
                mapSignal = mappedTable[i][2]
                signalTable[i].append(mapSignal)
    else:
        bamFileName = bamFileList[0].split('/')[-1]
        signalTable = utils.parseTable('%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, analysisName, bamFileName),'\t')

    return(signalTable)

def addLengths(gene_table_path, peak_table_path):
    '''
    add tss and distal lengths to a gene table using the peak table
    '''

    output_path = string.replace(gene_table_path, 'GENE_TABLE', 'GENE_TABLE_LENGTH')
    print(output_path)

    tss_dict = defaultdict(int)
    distal_dict = defaultdict(int)

    peak_table = utils.parseTable(peak_table_path, '\t')
    for line in peak_table[1:]:
        #get the genes
        gene_list = []
        if len(line) == 15:
            gene_list += line[-1].split(',')
            gene_list += line[-2].split(',')
        elif len(line) == 14:
            gene_list += line[-1].split(',')
        else:
            continue
        gene_list = utils.uniquify([gene for gene in gene_list if len(gene) > 0])

        for gene in gene_list:
            if int(line[5]) == 1:
                tss_dict[gene] += int(line[4])
            else:
                distal_dict[gene] += int(line[4])

    #now fill out the gene table
    gene_table = utils.parseTable(gene_table_path, '\t')

    output_table = [gene_table[0] + ['TSS_LENGTH', 'DISTAL_LENGTH']]
    for line in gene_table[1:]:
        gene = line[0]
        new_line = line + [tss_dict[gene], distal_dict[gene]]
        output_table.append(new_line)

    utils.unParseTable(output_table, output_path, '\t')
    return output_path

def callMergeSupers(dataFile,superFile1,superFile2,name1,name2,mergeName,genome,parentFolder):
    '''
    this is the main run function for the script
    all of the work should occur here, but no functions should be defined here
    '''
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (parentFolder,string.upper(genome),mergeName)

    #check to make sure this hasn't been done yet
    roseOutput = "%s%s_ROSE/%s_%s_MERGED_REGIONS_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt" % (parentFolder,name1,string.upper(genome),mergeName)

    try:
        foo = utils.parseTable(roseOutput,'\t')
        print "ROSE OUTPUT ALREADY FOUND HERE %s" % (roseOutput)
        return roseOutput
    except IOError:
        print "MERGING ENHANCER REGIONS FROM %s and %s" % (superFile1,superFile2)
        mergedGFF = mergeCollections(superFile1,superFile2,name1,name2,mergedGFFFile)

        #call rose on the merged regions
        roseBashFile = callRoseMerged(dataFile,mergedGFF,name1,name2,parentFolder)
        print('i can has rose bash file %s' % (roseBashFile))

        #run the bash command
        os.system('bash %s' % (roseBashFile))

        #check for and return output
        if utils.checkOutput(roseOutput,1,30):
            return roseOutput
        else:
            print "ERROR: ROSE CALL ON MERGED REGIONS FAILED"
            sys.exit()

def makeSEDict(enhancerFile, name, superOnly=True):
    """
    makes an attribute dict for enhancers keyed by uniqueID
    """

    seDict = {}
    enhancerTable = utils.parseTable(enhancerFile, "\t")

    for line in enhancerTable:
        if line[0][0] == "#":
            continue
        if line[0][0] == "R":
            header = line
            supColumn = header.index("isSuper")
            continue
        if superOnly:
            if int(line[supColumn]) == 1:
                signal = float(line[6]) - float(line[7])
                rank = int(line[-2])
                enhancerID = name + "_" + line[0]
                seDict[enhancerID] = {"rank": rank, "signal": signal}
        else:
            signal = float(line[6]) - float(line[7])
            rank = int(line[-2])
            enhancerID = name + "_" + line[0]
            seDict[enhancerID] = {"rank": rank, "signal": signal}

    return seDict

def makeEnhancerSignalTable(mergedRegionMap,medianDict,analysisName,genome,outputFolder):
    '''
    makes a table where each row is an enhancer and each column is the
    background corrected signal divided by the dataset median
    '''

    #load in the region map
    regionMap = utils.parseTable(mergedRegionMap,'\t')
    namesList = medianDict.keys()
    signalTable = [['REGION_ID','CHROM','START','STOP','NUM_LOCI','CONSTITUENT_SIZE'] + namesList]

    for line in regionMap[1:]:
        newLine = line[0:6]
        for i in range(len(namesList)):
            enhancerIndex = (i*2) + 6
            controlIndex = (i*2) + 7
            enhancerSignal = float(line[enhancerIndex]) - float(line[controlIndex])
            if enhancerSignal < 0:
                enhancerSignal = 0
            enhancerSignal = enhancerSignal/medianDict[namesList[i]]
            newLine.append(enhancerSignal)
        signalTable.append(newLine)

    outputFile = "%s%s_%s_signalTable.txt" % (outputFolder,genome,analysisName)
    print "WRITING MEDIAN NORMALIZED SIGNAL TABLE TO %s" % (outputFile)
    utils.unParseTable(signalTable,outputFile,'\t')
    return outputFile

def collapseRegionMap(regionMapFile, name='', controlBams=False):
    '''
    takes a regionMap file and collapses signal into a single column
    also fixes any start/stop sorting issues
    needs to take into account whether or not controls were used
    '''

    regionMap = utils.parseTable(regionMapFile, '\t')

    for n, line in enumerate(regionMap):
        if n == 0:
            #new header
            if len(name) == 0:
                name = 'MERGED_SIGNAL'
            regionMap[n] = line[0:6] + [name]
        else:
            newLine = list(line[0:6])
            if controlBams:
                signalLine = [float(x) for x in line[6:]]
                rankbyIndexes = range(0, len(signalLine) / 2, 1)
                controlIndexes = range(len(signalLine) / 2, len(signalLine), 1)
                metaVector = []
                for i, j in zip(rankbyIndexes, controlIndexes):
                    #min signal is 0
                    metaVector.append(max(0, signalLine[i] - signalLine[j]))
                metaSignal = numpy.mean(metaVector)
            else:
                metaSignal = numpy.mean([float(x) for x in line[6:]])
            regionMap[n] = newLine + [metaSignal]

    outputFile = string.replace(regionMapFile, 'REGION', 'META')
    utils.unParseTable(regionMap, outputFile, '\t')
    return (outputFile)

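# Sketch of the column layout collapseRegionMap assumes when controlBams is
# True (inferred from the index math above, not from a documented spec):
# after the 6 region columns, the first half of the signal columns are the
# rankby bams and the second half are their matched controls, e.g.
#
#   [..., rankby1, rankby2, control1, control2]
#
# so zip(rankbyIndexes, controlIndexes) pairs rankby1 with control1 and
# rankby2 with control2 before background subtraction.
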
def makeBedCollection(bedFileList):
    '''
    takes in a list of bedFiles and makes a single huge collection
    each locus has as its ID the name of the bed file
    '''

    bedLoci = []
    print("MAKING BED COLLECTION FOR:")
    for bedFile in bedFileList:

        bedName = bedFile.split('/')[-1].split('.')[0]
        print(bedName)
        bed = utils.parseTable(bedFile, '\t')
        for line in bed:
            if len(line) >= 3:
                #check that line[0] looks like a chromosome
                if line[0][0:3] == 'chr':
                    try:
                        coords = [int(line[1]), int(line[2])]
                        bedLocus = utils.Locus(line[0], min(coords), max(coords), '.', bedName)
                        bedLoci.append(bedLocus)
                    except ValueError:
                        pass

    print("IDENTIFIED %s BED REGIONS" % (len(bedLoci)))

    return utils.LocusCollection(bedLoci, 50)

def loadGenome(genome_build,config_file = ''):
    '''
    loads annotation for a genome into a genome object
    '''

    #this nested dictionary has all of the useful information and likely will have to be
    #edited so it can be configured any time
    genome_build = string.upper(genome_build)

    genomeDict = {
        'HG19':{'annot_file':'%sannotation/hg19_refseq.ucsc' % (pipeline_dir),
                'genome_directory':'/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/',
                'tf_file':'%s/annotation/TFlist_NMid_hg19.txt' % (whereAmI),
                'mask_file':'/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Annotation/Masks/hg19_encode_blacklist.bed',
                'motif_convert':'%s/annotation/MotifDictionary.txt' % (whereAmI),
                'motif_database':'%s/annotation/VertebratePWMs.txt' % (whereAmI),
                },
        'RN6':{'annot_file':'%sannotation/rn6_refseq.ucsc' % (pipeline_dir),
               'genome_directory':'/storage/cylin/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Chromosomes/',
               'tf_file':'%s/annotation/TFlist_NMid_rn6.txt' % (whereAmI),
               'motif_convert':'%s/annotation/MotifDictionary.txt' % (whereAmI),
               'motif_database':'%s/annotation/VertebratePWMs.txt' % (whereAmI),
               },
        'MM10':{'annot_file':'%sannotation/mm10_refseq.ucsc' % (pipeline_dir),
                'genome_directory':'/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm10/Sequence/Chromosomes/',
                'tf_file':'%s/annotation/TFlist_NMid_mm10.txt' % (whereAmI),
                'motif_convert':'%s/annotation/MotifDictionary.txt' % (whereAmI),
                'motif_database':'%s/annotation/VertebratePWMs.txt' % (whereAmI),
                }
        }

    #allow an optional config file to overwrite default paths
    if len(config_file) > 0:
        config_table = utils.parseTable(config_file,'\t')
        for line in config_table[1:]:
            (build,field,feature_path) = line[0].split(':')
            genomeDict[string.upper(build)][string.lower(field)] = feature_path

    if genome_build not in genomeDict:
        print('ERROR: UNSUPPORTED GENOME BUILD %s. EXITING NOW' % (genome_build))
        sys.exit()
    else:
        print('USING BUILD %s WITH FOLLOWING FIELDS:' % (genome_build))
        print(genomeDict[genome_build])

    #now attempt to load the genome
    genome = Genome(genome_build,genomeDict[genome_build]['genome_directory'],genomeDict[genome_build]['annot_file'])

    #adding additional optional features
    genome.addFeature('tf_file',genomeDict[genome_build]['tf_file'])
    if genome_build == 'HG19':
        genome.addFeature('mask',genomeDict[genome_build]['mask_file'])
    genome.addFeature('motif_convert',genomeDict[genome_build]['motif_convert'])
    genome.addFeature('motif_database',genomeDict[genome_build]['motif_database'])

    return genome

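# Example of the optional config_file format parsed above: rows after the
# header pack build, field, and path into the first tab column, separated by
# colons (a hypothetical file for illustration):
#
#   BUILD:FIELD:PATH
#   HG19:mask_file:/alternate/path/hg19_blacklist.bed
#   MM10:annot_file:/alternate/path/mm10_refseq.ucsc
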
def createExpressionDict(annotationFile, projectFolder, projectName, refseqToNameDict, expCutoff, expressionFile=''):
    '''
    input: an activity table with refseq in first column and expression or
    promoter acetylation in second column
    output: a dictionary keyed by refseq that points to activity
    '''

    print 'CREATING EXPRESSION DICTIONARY'

    if not expressionFile:
        expressionFilename = projectFolder + 'bamliquidator/matrix.txt'
    else:
        expressionFilename = expressionFile
    expressionTable = utils.parseTable(expressionFilename, '\t')

    expressionDictNM = {}
    expressionDictGene = {}

    for line in expressionTable[1:]:
        trid = line[0]
        geneName = refseqToNameDict[trid]
        try:
            exp = float(line[2])
        except IndexError:
            exp = float(line[1])

        # Save the expression value of each NMid in a dict, keep the higher value if multiple
        if trid in expressionDictNM and exp > expressionDictNM[trid]:
            expressionDictNM[trid] = exp
        elif trid not in expressionDictNM:
            expressionDictNM[trid] = exp

        # Save the value of the expression if it's the highest for that gene
        if geneName in expressionDictGene and exp > expressionDictGene[geneName]:
            expressionDictGene[geneName] = exp
        elif geneName not in expressionDictGene:
            expressionDictGene[geneName] = exp

    cutoff = numpy.percentile(expressionDictGene.values(), expCutoff)
    print 'Expression cutoff: ' + str(cutoff)

    expressedGenes = []
    expressedNM = []
    for nmid in expressionDictNM:
        if float(expressionDictNM[nmid]) > cutoff:
            expressedGenes.append(refseqToNameDict[nmid])
            expressedNM.append(nmid)

    expressedGenes = utils.uniquify(expressedGenes)
    Genefilename = projectFolder + projectName + '_EXPRESSED_GENES.txt'
    utils.unParseTable(expressedGenes, Genefilename, '')

    expressedNM = utils.uniquify(expressedNM)
    NMfilename = projectFolder + projectName + '_EXPRESSED_NM.txt'
    utils.unParseTable(expressedNM, NMfilename, '')

    return expressedNM, expressionDictNM

def scoreValley(locus, bamFile, projectName, projectFolder):
    '''
    calculate valley scores for a locus
    based on this reference:
    http://bioinformatics.oxfordjournals.org/content/26/17/2071.full
    '''

    nbins = locus.len() / 10

    #call bamliquidator on the region and store in a temp file
    os.system('bamliquidator ' + bamFile + ' ' + locus.chr() + ' ' + str(locus.start())
              + ' ' + str(locus.end()) + ' . ' + str(nbins) + ' 0 > ' + projectFolder
              + 'tempBamliquidator_' + projectName + '.txt')

    x = utils.parseTable(projectFolder + 'tempBamliquidator_' + projectName + '.txt', '\t')
    density = [int(y[0]) for y in x]
    smoothDensity = gaussianSmooth(density, 5)

    scoreArray = []
    regionMax = max(smoothDensity)

    #now take the smoothed reads and calculate a valley score
    for i in range(len(smoothDensity)):
        score = 0

        #shoulders are the maxima of windows 10-25 bins to either side;
        #an empty window means the point is too close to the edge
        try:
            leftmax = max(smoothDensity[max(0, i - 25):max(0, i - 10)])
        except ValueError:
            leftmax = 'edge'
        try:
            rightmax = max(smoothDensity[i + 10:i + 25])
        except ValueError:
            rightmax = 'edge'

        if rightmax == 'edge' and leftmax == 'edge':
            shoulderHeightMin = 0
            shoulderHeightMax = 0
        elif leftmax == 'edge':
            shoulderHeightMin = rightmax
            shoulderHeightMax = rightmax
        elif rightmax == 'edge':
            shoulderHeightMin = leftmax
            shoulderHeightMax = leftmax
        else:
            shoulderHeightMin = min(leftmax, rightmax)
            shoulderHeightMax = max(leftmax, rightmax)

        ratio = (shoulderHeightMax - float(smoothDensity[i])) / regionMax
        if ratio > 0.3:
            score = 1

        scoreArray.append(score)

    return scoreArray

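# gaussianSmooth is called above but not defined in this section. A minimal
# sketch of one plausible implementation (a Gaussian-weighted sliding window;
# the real helper may differ in padding and kernel details):
import numpy

def gaussianSmoothSketch(readList, degree=5):
    '''smooths a vector of read densities with a Gaussian-weighted window'''
    window = degree * 2 - 1
    weights = numpy.array([numpy.exp(-(4.0 * (i - degree + 1) ** 2) / float(window ** 2))
                           for i in range(window)])
    weights = weights / weights.sum()
    #each output point is the weighted average of a full window, so the
    #smoothed vector is window-1 points shorter than the input
    smoothed = []
    for i in range(len(readList) - window + 1):
        segment = numpy.array(readList[i:i + window])
        smoothed.append(float(numpy.dot(segment, weights)))
    return smoothed
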
def buildGraph(projectFolder, projectName, motifConvertFile, refseqToNameDict, candidateGenes):
    '''
    import the FIMO output once it's finished
    build the networkX directed graph
    '''

    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = {}
    motifNames = [line[1] for line in motifDatabase]
    # the reverse of the other dict, from motif name to gene name
    for line in motifDatabase:
        motifDatabaseDict[line[0]] = line[1]

    fimoFile = projectFolder + 'FIMO/fimo.txt'
    fimoTable = utils.parseTable(fimoFile, '\t')

    graph = nx.DiGraph(name=projectName)
    graph.add_nodes_from(candidateGenes)

    motifDict = defaultdict(list)
    for line in fimoTable[1:]:

        source = motifDatabaseDict[line[0]]  #motif ID
        # region = line[1].split('|')
        region = line[2].split('|')
        target = refseqToNameDict[region[0]]  #gene name corresponding to the NMid
        graph.add_edge(source, target)

        # motifDict[source].append((region[1], int(region[2]) + int(line[2]), int(region[2]) + int(line[3])))
        motifDict[source].append((region[1], int(region[2]) + int(line[3]), int(region[2]) + int(line[4])))

    utils.formatFolder(projectFolder + 'motifBED/', True)
    for gene in motifDict.keys():
        if motifDict[gene]:
            bed = []
            for loc in motifDict[gene]:
                bed.append([loc[0], loc[1], loc[2]])

            filename = projectFolder + 'motifBED/' + gene + '_' + projectName + '_motifs.bed'
            utils.unParseTable(bed, filename, '\t')

    return graph

def makeEnhancerSignalTable(nameDict, mergedRegionMap, medianDict, analysisName, genome, outputFolder):
    '''
    makes a table where each row is an enhancer and each column is the
    background corrected signal divided by the dataset median
    '''

    #load in the region map
    regionMap = utils.parseTable(mergedRegionMap, '\t')
    namesList = nameDict.keys()
    namesList.sort()
    signalTable = [['REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE'] + namesList]

    print("len of %s for namesList" % (len(namesList)))
    print(namesList)

    for line in regionMap[1:]:
        newLine = line[0:6]

        #a little tricky here to add datasets sequentially
        i = 6  #start w/ the first column w/ data
        for name in namesList:
            if nameDict[name]['background'] == True:
                enhancerIndex = int(i)
                i += 1
                controlIndex = int(i)
                i += 1
                try:
                    enhancerSignal = float(line[enhancerIndex]) - float(line[controlIndex])
                except IndexError:
                    print line
                    print len(line)
                    print enhancerIndex
                    print controlIndex
                    sys.exit()
            else:
                enhancerIndex = int(i)
                i += 1
                enhancerSignal = float(line[enhancerIndex])

            if enhancerSignal < 0:
                enhancerSignal = 0
            enhancerSignal = enhancerSignal / medianDict[name]
            newLine.append(enhancerSignal)

        signalTable.append(newLine)

    outputFile = "%s%s_%s_signalTable.txt" % (outputFolder, genome, analysisName)
    print "WRITING MEDIAN NORMALIZED SIGNAL TABLE TO %s" % (outputFile)
    utils.unParseTable(signalTable, outputFile, '\t')
    return outputFile

def summarizeVenn(mapped_path, group_list=['CG', 'THMYCN'], output=''):
    '''
    summarizes binary occupancy across groups to make a venn diagram
    '''

    group_table = [['GFF_LINE', 'ID'] + group_list]
    mapped_table = utils.parseTable(mapped_path, '\t')

    group_cols = []
    for group in group_list:
        group_names = [name for name in mapped_table[0] if name.count(group) > 0]
        group_cols.append([mapped_table[0].index(name) for name in group_names])
    print(group_cols)

    for line in mapped_table[1:]:
        binary_vector = []  #a 1/0 vector to hold mapping by group
        for i in range(len(group_list)):
            cols = group_cols[i]
            signal = max([int(line[x]) for x in cols])
            binary_vector.append(signal)
        new_line = line[0:2] + binary_vector
        group_table.append(new_line)
    print(group_table[0:5])

    #now add up the stats
    #permute all possible binary combinations given the vector length
    #(this generalizes beyond 2 groups)
    binary_combinations = [[0], [1]]
    for i in range(len(group_list) - 1):
        new_combinations = []
        for x in binary_combinations:
            print(x)
            x1 = list(x) + [1]
            x0 = list(x) + [0]
            new_combinations.append(x1)
            new_combinations.append(x0)
        binary_combinations = list(new_combinations)

    print(binary_combinations)
    count_table = [group_list + ['count']]
    for combo in binary_combinations:
        count = len([line for line in group_table[1:] if line[2:] == combo])
        count_table.append(combo + [count])

    print(count_table)
    if len(output) > 0:
        utils.unParseTable(count_table, output, '\t')
    else:
        return count_table

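# The hand-rolled permutation above can equivalently be written with
# itertools (an alternative sketch, not the code this script uses):
import itertools

def allBinaryCombinations(n):
    '''returns every 0/1 vector of length n as a list of lists'''
    return [list(combo) for combo in itertools.product([0, 1], repeat=n)]

# e.g. allBinaryCombinations(2) -> [[0, 0], [0, 1], [1, 0], [1, 1]]
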
def assignEnhancerRank(enhancerToGeneFile, enhancerFile1, enhancerFile2, name1, name2, rankOutput=''):
    '''
    for all genes in the enhancerToGene table, assigns the highest overlapping
    ranked enhancer in the other tables
    '''

    enhancerToGene = utils.parseTable(enhancerToGeneFile, '\t')

    enhancerCollection1 = makeSECollection(enhancerFile1, name1, False)
    enhancerCollection2 = makeSECollection(enhancerFile2, name2, False)

    enhancerDict1 = makeSEDict(enhancerFile1, name1, False)
    enhancerDict2 = makeSEDict(enhancerFile2, name2, False)

    #we're going to update the enhancerToGene table
    enhancerToGene[0] += ['%s_rank' % name1, '%s_rank' % name2]
    for i in range(1, len(enhancerToGene)):
        line = enhancerToGene[i]
        locusLine = utils.Locus(line[1], line[2], line[3], '.', line[0])

        #if the enhancer doesn't exist, its ranking is dead last on the enhancer list
        enhancer1Overlap = enhancerCollection1.getOverlap(locusLine, 'both')
        if len(enhancer1Overlap) == 0:
            enhancer1Rank = len(enhancerCollection1)
        else:
            rankList1 = [enhancerDict1[x.ID()]['rank'] for x in enhancer1Overlap]
            enhancer1Rank = min(rankList1)

        enhancer2Overlap = enhancerCollection2.getOverlap(locusLine, 'both')
        if len(enhancer2Overlap) == 0:
            enhancer2Rank = len(enhancerCollection2)
        else:
            rankList2 = [enhancerDict2[x.ID()]['rank'] for x in enhancer2Overlap]
            enhancer2Rank = min(rankList2)

        enhancerToGene[i] += [enhancer1Rank, enhancer2Rank]

    if len(rankOutput) == 0:
        return enhancerToGene
    else:
        utils.unParseTable(enhancerToGene, rankOutput, '\t')

def findMotifs(candidateGenes, projectFolder, projectName, motifConvertFile, motifDatabaseFile):
    '''
    takes the refseq to subpeak seq dict
    returns the networkx object with all connections
    '''

    # create a dictionary to call motif names keyed on gene names
    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = {}
    motifNames = [line[1] for line in motifDatabase]
    for line in motifDatabase:
        motifDatabaseDict[line[1]] = []
    for line in motifDatabase:
        motifDatabaseDict[line[1]].append(line[0])

    print 'GENERATING TF NETWORK'

    # select the TF candidates that have motifs
    candidateMotifs = []
    for gene in candidateGenes:
        if gene in motifNames:
            candidateMotifs.append(gene)

    print 'Number of annotated candidate TFs that have motifs: ' + str(len(candidateMotifs))
    candidateMotifs = sorted(candidateMotifs)
    #candidateMotifs = ['NANOG', 'POU5F1', 'SOX2']

    bgCmd = 'fasta-get-markov -m 1 < ' + projectFolder + projectName + '_SUBPEAKS.fa > ' + projectFolder + projectName + '_bg.meme'
    subprocess.call(bgCmd, shell=True)

    utils.formatFolder(projectFolder + 'FIMO/', True)

    fimoCmd = 'fimo'
    for TF in candidateMotifs:
        print TF
        for x in motifDatabaseDict[TF]:
            fimoCmd += ' --motif ' + "'%s'" % (str(x))

    #fimoCmd += ' --thresh 1e-5'
    fimoCmd += ' -verbosity 1'  # thanks for that ;)!
    fimoCmd += ' -text'
    fimoCmd += ' -oc ' + projectFolder + 'FIMO'
    fimoCmd += ' --bgfile ' + projectFolder + projectName + '_bg.meme'
    fimoCmd += ' ' + motifDatabaseFile + ' '
    fimoCmd += projectFolder + projectName + '_SUBPEAKS.fa'
    fimoCmd += ' > ' + projectFolder + 'FIMO/fimo.txt'
    ## print fimoCmd

    fimoOutput = subprocess.call(fimoCmd, shell=True)  #blocks until fimo finishes
    return fimoCmd

def filterPeaks(tabixFolder, mycTablePath, outputPath, repeatList=[]):
    '''
    auto filters the 3 repeat classes LINE, LTR, Simple_repeat
    outputs a bed in the format of
    [PEAK_ID, CHROM, START, STOP, LENGTH, LINE, LTR, Simple_repeat]
    '''

    if len(repeatList) == 0:
        repeatList = ['LINE', 'LTR', 'Simple_repeat']

    repeatTable = [['PEAK_ID', 'CHROM', 'START', 'STOP', 'LENGTH'] + repeatList]

    mycTable = utils.parseTable(mycTablePath, '\t')
    ticker = 0
    for line in mycTable[1:]:
        if line[0][0] == 'P':
            continue
        if ticker % 100 == 0:
            print ticker
        ticker += 1
        peak_ID = line[0]
        chrom = line[1]
        start = int(line[2])
        stop = int(line[3])
        length = line[4]
        locusString = '%s:%s-%s' % (chrom, start, stop)

        repeatFractions = []
        for repeatClass in repeatList:
            tabixGFF = '%shg19_%s_category_sorted.gff.gz' % (tabixFolder, repeatClass)

            tabixCmd = 'tabix %s %s' % (tabixGFF, locusString)
            tabix = subprocess.Popen(tabixCmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
            tabixLines = tabix.stdout.readlines()
            tabixLines = [x.rstrip().split('\t') for x in tabixLines]  #these come back as gff lines

            overlapFraction = 0.0
            for tabixLine in tabixLines:
                lineStart = int(tabixLine[3])
                lineStop = int(tabixLine[4])
                #clip the repeat feature to the peak boundaries
                lineStart = max(start, lineStart)
                lineStop = min(stop, lineStop)
                overlapLength = lineStop - lineStart
                overlapFraction += float(overlapLength) / float(length)
            repeatFractions.append(round(overlapFraction, 4))

        newLine = [peak_ID, chrom, start, stop, length] + repeatFractions
        repeatTable.append(newLine)

    utils.unParseTable(repeatTable, outputPath, '\t')

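# A worked example of the overlap fraction math above (hypothetical
# coordinates): for a peak chr1:1000-2000 (length 1000) and a LINE feature
# chr1:900-1500 returned by tabix, the feature is clipped to the peak
# boundaries, so overlapLength = 1500 - 1000 = 500 and the LINE fraction for
# that peak increases by 500/1000 = 0.5.
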
def generateSubpeakFASTA(TFtoEnhancerDict, subpeaks, genomeDirectory, projectName, projectFolder, constExtension):
    '''
    from a BED file of constituents
    generate a FASTA for the constituents contained within the candidate supers
    '''

    subpeakDict = {}
    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]
    subpeakTable = utils.parseTable(subpeaks, '\t')

    subpeakLoci = [utils.Locus(l[0], int(l[1]), int(l[2]), '.') for l in subpeakTable]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    for gene in TFtoEnhancerDict.keys():
        subpeakDict[gene] = []
        for region in TFtoEnhancerDict[gene]:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [utils.makeSearchLocus(x, constExtension, constExtension) for x in overlaps]

            overlapCollectionTemp = utils.LocusCollection(extendedOverlaps, 50)
            overlapCollection = overlapCollectionTemp.stitchCollection()
            for overlap in overlapCollection.getLoci():
                subpeakBED.append([overlap.chr(), overlap.start(), overlap.end()])
                subpeakDict[gene].append(overlap)

    bedfilename = projectFolder + projectName + '_subpeaks.bed'
    utils.unParseTable(subpeakBED, bedfilename, '\t')

    fasta = []
    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:

            fastaTitle = gene + '|' + subpeak.chr() + '|' + str(subpeak.start()) + '|' + str(subpeak.end())
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(), int(subpeak.start() + 1), int(subpeak.end() + 1))

            fasta.append('>' + fastaTitle)
            fasta.append(string.upper(fastaLine))

    outname = projectFolder + projectName + '_SUBPEAKS.fa'
    utils.unParseTable(fasta, outname, '')

def findMotifs(subpeakFasta, bg_path, candidate_tf_list, projectFolder, analysis_name, motifConvertFile, motifDatabaseFile):
    '''
    writes and runs a FIMO command over all motifs of the candidate TFs
    returns the path to the fimo output
    '''
    fimoFolder = utils.formatFolder(projectFolder + 'FIMO/', True)
    subpeak_name = subpeakFasta.split('/')[-1].split('.')[0]
    output = '%s%s_fimo.txt' % (fimoFolder, subpeak_name)

    # create a dictionary to call motif names keyed on gene names
    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = {}  #create a dict keyed by TF with multiple motifs
    for line in motifDatabase:
        motifDatabaseDict[line[1]] = []
    for line in motifDatabase:
        motifDatabaseDict[line[1]].append(line[0])

    candidate_tf_list.sort()
    print(candidate_tf_list)

    #now make a list of all motifs
    motif_list = []
    for tf in candidate_tf_list:
        motif_list += motifDatabaseDict[tf]
    motif_list = utils.uniquify(motif_list)

    fimo_bash_path = '%s%s_fimo.sh' % (fimoFolder, analysis_name)
    fimo_bash = open(fimo_bash_path, 'w')
    fimo_bash.write('#!/usr/bin/bash\n\n')

    fimoCmd = 'fimo'
    for motif in motif_list:
        fimoCmd += ' --motif ' + "'%s'" % (str(motif))

    #fimoCmd += ' --thresh 1e-5' #if you want to increase stringency
    fimoCmd += ' -verbosity 1'  # thanks for that ;)!
    fimoCmd += ' -text'
    fimoCmd += ' -oc ' + projectFolder + 'FIMO'
    fimoCmd += ' --bgfile %s' % (bg_path)
    fimoCmd += ' ' + motifDatabaseFile + ' '
    fimoCmd += subpeakFasta
    fimoCmd += ' > ' + output
    print fimoCmd

    fimo_bash.write(fimoCmd)
    fimo_bash.close()

    fimoOutput = subprocess.call(fimoCmd, shell=True)  #blocks until fimo finishes
    return output

def getMedianSignalEnhancer(enhancerFile, name, dataFile):
    '''
    returns the median enhancer signal of a file
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    #enhancer tables carry a 6 line header
    enhancerVector = [float(line[6]) for line in enhancerTable[6:]]

    median = numpy.median(enhancerVector)
    return median

def averagingMappedSignal(mapped_list, output_path, setName):
    '''
    averages signal across a set of mapped gffs and writes the new output
    '''

    #create a list containing all of the tables
    table_list = [utils.parseTable(mapped_list[i], '\t') for i in range(len(mapped_list))]

    #first set up the output header
    output_header = ['GENE_ID', 'locusLine']
    nCols = len(table_list[0][0]) - 2
    for n in range(nCols):
        output_header.append('bin_%s_%s' % (n + 1, setName))
    output_table = [output_header]

    #now iterate through each row to set up the gene ID and locus line
    for i in range(1, len(table_list[0])):
        line = table_list[0][i]
        if len(line) > 2:
            output_table.append(line[0:2])

    #now run through the whole matrix in i,j notation and put average signal into the final matrix
    row_ticker = 1
    #iterate through rows
    for i in range(1, len(table_list[0])):
        line = table_list[0][i]
        if len(line) == 2:
            continue
        signal_vector = []
        #iterate through columns
        for j in range(2, len(table_list[0][0])):
            try:
                signal_vector = [float(table[i][j]) for table in table_list]
            except IndexError:
                print(i, j)
                print(table_list[0][i])
                print(table_list[1][i])
            signal = max(round(numpy.average(signal_vector), 4), 0)
            output_table[row_ticker].append(signal)
        row_ticker += 1

    print(len(table_list[0]))
    print(len(output_table))
    utils.unParseTable(output_table, output_path, '\t')
    return output_path

def make_probe_to_gene_dict(annotFile, array_1_path, array_2_path):
    '''
    keyed by probe ID w/ gene as value
    '''

    #see if it already exists
    pickle_path = '%soberthuer_outcome/probe_dict.pkl' % (projectFolder)
    if utils.checkOutput(pickle_path, 0, 0):
        print('loading previously made probe dict at %s' % (pickle_path))
        probe_gene_dict = pickle.load(open(pickle_path, "rb"))
        return probe_gene_dict

    #we want to intersect refseq common names w/ the array
    startDict = utils.makeStartDict(annotFile)
    ref_name_list = utils.uniquify([startDict[refID]['name'] for refID in startDict.keys()])

    probe_gene_dict = {}

    array_1 = utils.parseTable(array_1_path, '\t')
    array_2 = utils.parseTable(array_2_path, '\t')
    ticker = 0
    for line in array_1 + array_2:
        if len(line) < 5:
            continue
        ticker += 1
        probe_id = line[4]
        name = line[-1]
        if ref_name_list.count(name) > 0:
            probe_gene_dict[probe_id] = name

    pickle.dump(probe_gene_dict, open(pickle_path, 'wb'))
    return probe_gene_dict

def makeSignalDict(mappedGFFFile, controlMappedGFFFile=''):
    '''
    makes a signal dict
    '''
    print('\t called makeSignalDict on %s (ctrl: %s)' % (mappedGFFFile, controlMappedGFFFile))
    signalDict = defaultdict(float)

    mappedGFF = utils.parseTable(mappedGFFFile, '\t')
    if len(controlMappedGFFFile) > 0:
        controlGFF = utils.parseTable(controlMappedGFFFile, '\t')
        for i in range(1, len(mappedGFF)):
            signal = float(mappedGFF[i][2]) - float(controlGFF[i][2])
            if signal < 0:
                signal = 0.0
            signalDict[mappedGFF[i][0]] = signal
    else:
        for i in range(1, len(mappedGFF)):
            signal = float(mappedGFF[i][2])
            signalDict[mappedGFF[i][0]] = signal

    return signalDict

def getSignalVector(regionFile, name, dataFile):
    '''
    returns the signal vector for a dataset from a region signal table
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    regionTable = utils.parseTable(regionFile, '\t')

    bamPath = dataDict[name]['bam']
    bamName = bamPath.split('/')[-1]

    colID = regionTable[0].index(bamName)
    signalVector = [float(line[colID]) for line in regionTable[1:]]

    return signalVector

def makePeakGFFs(peak_path_list):
    '''
    makes a stitched gff for all MYC bound TSS and distal regions across all datasets
    '''

    #setting the output
    tss_gff_path = '%sHG19_MYC_TSS_REGIONS_-0_+0.gff' % (gffFolder)
    distal_gff_path = '%sHG19_MYC_DISTAL_REGIONS_-0_+0.gff' % (gffFolder)

    #check to see if already done
    if utils.checkOutput(tss_gff_path, 0.1, 0.1) and utils.checkOutput(distal_gff_path, 0.1, 0.1):
        print('OUTPUT FOUND AT %s and %s' % (tss_gff_path, distal_gff_path))
        return tss_gff_path, distal_gff_path

    #empty loci lists to hold everything
    tss_loci = []
    distal_loci = []

    for peak_path in peak_path_list:
        print('processing %s' % (peak_path))
        peak_table = utils.parseTable(peak_path, '\t')
        for line in peak_table[1:]:
            peak_locus = utils.Locus(line[1], line[2], line[3], '.')
            if int(line[5]) == 0:
                distal_loci.append(peak_locus)
            else:
                tss_loci.append(peak_locus)

    #now combine the loci
    print('stitching loci')
    distal_collection = utils.LocusCollection(distal_loci, 50)
    tss_collection = utils.LocusCollection(tss_loci, 50)

    stitched_distal_collection = distal_collection.stitchCollection()
    stitched_tss_collection = tss_collection.stitchCollection()

    #now make the gffs from the stitched collections
    distal_gff = utils.locusCollectionToGFF(stitched_distal_collection)
    tss_gff = utils.locusCollectionToGFF(stitched_tss_collection)

    #now write to disk
    utils.unParseTable(distal_gff, distal_gff_path, '\t')
    utils.unParseTable(tss_gff, tss_gff_path, '\t')

    return tss_gff_path, distal_gff_path

def callMergeSupers(dataFile, superFile1, superFile2, name1, name2, mergeName, genome, parentFolder):
    '''
    this is the main run function for the script
    all of the work should occur here, but no functions should be defined here
    '''
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (parentFolder, string.upper(genome), mergeName)

    #check to make sure this hasn't been done yet
    roseOutput = "%s%s_ROSE/%s_%s_MERGED_REGIONS_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt" % (parentFolder, name1, string.upper(genome), mergeName)

    try:
        foo = utils.parseTable(roseOutput, '\t')
        print "ROSE OUTPUT ALREADY FOUND HERE %s" % (roseOutput)
        return roseOutput
    except IOError:
        print "MERGING ENHANCER REGIONS FROM %s and %s" % (superFile1, superFile2)
        mergedGFF = mergeCollections(superFile1, superFile2, name1, name2, mergedGFFFile)

        #call rose on the merged regions
        roseBashFile = callRoseMerged(dataFile, mergedGFF, name1, name2, parentFolder)
        print('i can has rose bash file %s' % (roseBashFile))

        #run the bash command
        os.system('bash %s' % (roseBashFile))

        #check for and return output
        if utils.checkOutput(roseOutput, 1, 10):
            return roseOutput
        else:
            #try finding it w/ a different name
            #this will bug out if nothing is there
            roseFolder = "%s%s_ROSE/" % (parentFolder, name1)
            roseFileList = [x for x in os.listdir(roseFolder) if x[0] != '.']  #no hidden files
            if len(roseFileList) == 0:
                print "No files found in %s" % (roseFolder)
                sys.exit()

            enhancerToGeneFile = getFile('_SuperEnhancers_ENHANCER_TO_GENE.txt', roseFileList, roseFolder)

def makeAnnotDict(annotFile):
    '''
    makes a dictionary keyed by guideID
    '''
    guideDict = defaultdict(str)
    geneDict = defaultdict(list)
    geckoAnnot = utils.parseTable(annotFile, '\t')
    for line in geckoAnnot[1:]:
        guideDict[line[1]] = line[0]  #guide ID -> gene
        geneDict[line[0]].append(line[1])  #gene -> list of guide IDs
    return guideDict, geneDict

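# Hypothetical usage of makeAnnotDict, assuming a GeCKO-style annotation file
# with gene in column 0 and guide ID in column 1 (names are illustrative):
#
#   guideDict, geneDict = makeAnnotDict('geckoAnnotation.txt')
#   guideDict['HGLibA_00001'] -> 'A1BG'
#   geneDict['A1BG']          -> ['HGLibA_00001', 'HGLibA_00002', ...]
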
def writeSplitBeds(bed, analysisName, outputFolder, window=50, centered=False):
    '''
    writes body, upstream, and downstream (or centered) bed files for a dmr bed
    '''

    dmrList = [x for x in utils.parseTable(bed, '\t')]

    dmrBed = []
    endsBed = []
    startsBed = []
    centeredBed = []

    if not centered:
        for line in dmrList:
            dmrID = line[3]
            dmrCoords = [line[0], int(line[1]), int(line[2]), dmrID]
            dmrBed.append(dmrCoords)

            startExtend = [line[0], int(line[1]) - window, int(line[1]), dmrID]
            endExtend = [line[0], int(line[2]), int(line[2]) + window, dmrID]
            endsBed.append(endExtend)
            startsBed.append(startExtend)

        print len(dmrBed)
        utils.unParseTable(dmrBed, outputFolder + analysisName + '_BODY_-0_+0.bed', '\t')
        print len(startsBed)
        utils.unParseTable(startsBed, outputFolder + analysisName + '_UPSTREAM_-' + str(window) + '_+' + str(window) + '.bed', '\t')
        print len(endsBed)
        utils.unParseTable(endsBed, outputFolder + analysisName + '_DOWNSTREAM_-' + str(window) + '_+' + str(window) + '.bed', '\t')

    else:
        for line in dmrList:
            dmrID = line[3]
            center = (int(line[1]) + int(line[2])) / 2
            centeredBed.append([line[0], center - window, center + window, dmrID])

        utils.unParseTable(centeredBed, outputFolder + analysisName + '_CENTERED_-' + str(window) + '_+' + str(window) + '.bed', '\t')

def make_mycn_regions(conserved_rank_path):
    '''
    takes conserved NB MYCN regions
    then creates a bed and gff of regions
    '''

    conserved_rank_table = utils.parseTable(conserved_rank_path, '\t')
    mycn_gff = []
    mycn_flank_gff = []
    mycn_bed = []
    mycn_flank_bed = []

    for line in conserved_rank_table[1:]:
        locus_line = utils.Locus(line[1], line[2], line[3], '.')

        #sanity check the coordinates
        if int(line[3]) < int(line[2]):
            print('uh oh')
            print(line)

        gff_line = [line[1], line[0], '', line[2], line[3], '', '.', '', line[0]]
        bed_line = [line[1], line[2], line[3], line[0]]
        mycn_gff.append(gff_line)
        mycn_bed.append(bed_line)

        gff_flank_line = [line[1], line[0], '', int(line[2]) - 500, int(line[3]) + 500, '', '.', '', line[0]]
        bed_flank_line = [line[1], int(line[2]) - 500, int(line[3]) + 500, line[0]]
        mycn_flank_gff.append(gff_flank_line)
        mycn_flank_bed.append(bed_flank_line)

    mycn_gff_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    mycn_flank_gff_path = '%sHG19_NB_MYCN_CONSERVED_-500_+500.gff' % (gffFolder)

    mycn_bed_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.bed' % (bedFolder)
    mycn_flank_bed_path = '%sHG19_NB_MYCN_CONSERVED_-500_+500.bed' % (bedFolder)

    #writing to disk
    utils.unParseTable(mycn_gff, mycn_gff_path, '\t')
    utils.unParseTable(mycn_flank_gff, mycn_flank_gff_path, '\t')
    utils.unParseTable(mycn_bed, mycn_bed_path, '\t')
    utils.unParseTable(mycn_flank_bed, mycn_flank_bed_path, '\t')

    print(mycn_gff_path)
    print(mycn_flank_gff_path)
    print(mycn_bed_path)
    print(mycn_flank_bed_path)

    return mycn_gff_path, mycn_flank_gff_path

def getMedianSignal(enhancerFile, name, dataFile):
    '''
    returns the median enhancer signal of a file
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    backgroundName = dataDict[name]['background']
    if dataDict.has_key(backgroundName):
        enhancerVector = [float(line[6]) - float(line[7]) for line in enhancerTable[6:]]
    else:
        enhancerVector = [float(line[6]) for line in enhancerTable[6:]]

    median = numpy.median(enhancerVector)
    return median

def generateSubpeakFASTA(TFandSuperDict, subpeaks, genomeDirectory, projectName, projectFolder, motifExtension):
    '''
    takes as input a BED file of constituents
    outputs a FASTA file of merged, extended super-enhancer constituents and associated formatted names
    '''

    print 'MAKE FASTA'

    subpeakDict = {}
    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]
    subpeakTable = utils.parseTable(subpeaks, '\t')

    subpeakLoci = [utils.Locus(l[0], int(l[1]), int(l[2]), '.') for l in subpeakTable]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    for gene in TFandSuperDict.keys():
        subpeakDict[gene] = []
        for region in TFandSuperDict[gene]:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [utils.makeSearchLocus(x, motifExtension, motifExtension) for x in overlaps]

            overlapCollectionTemp = utils.LocusCollection(extendedOverlaps, 50)
            overlapCollection = overlapCollectionTemp.stitchCollection()
            for overlap in overlapCollection.getLoci():
                subpeakBED.append([overlap.chr(), overlap.start(), overlap.end()])
                subpeakDict[gene].append(overlap)

    bedfilename = projectFolder + projectName + '_subpeaks.bed'
    utils.unParseTable(subpeakBED, bedfilename, '\t')

    fasta = []
    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:

            fastaTitle = gene + '|' + subpeak.chr() + '|' + str(subpeak.start()) + '|' + str(subpeak.end())
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(), int(subpeak.start() + 1), int(subpeak.end() + 1))

            fasta.append('>' + fastaTitle)
            fasta.append(string.upper(fastaLine))

    # output the fasta file of extended SE constituents
    outname = projectFolder + projectName + '_SUBPEAKS.fa'
    utils.unParseTable(fasta, outname, '')

def makeBedCollection(bedFileList):
    '''
    takes in a list of bedFiles and makes a single huge collection
    each locus has as its ID the name of the bed file
    '''

    bedLoci = []
    for bedFile in bedFileList:
        bedName = bedFile.split('/')[-1].split('.')[0]
        bed = utils.parseTable(bedFile, '\t')
        for line in bed:
            bedLocus = utils.Locus(line[0], int(line[1]), int(line[2]), '.', bedName)
            bedLoci.append(bedLocus)

    return utils.LocusCollection(bedLoci, 50)

def getExpanded(locusTable, expansion, status, output):

    loci = utils.parseTable(locusTable, '\t')
    expandedList = []

    for line in loci:
        wtLocus = line[0:4]
        mutLocus = line[4:8]

        if status == 'WT':
            newLine = expansionStat(wtLocus, mutLocus, expansion=expansion)
            if len(newLine) > 0:
                expandedList.append(newLine)
        elif status == 'MUT':
            newLine = expansionStat(mutLocus, wtLocus, expansion=expansion)
            if len(newLine) > 0:
                expandedList.append(newLine)

    print len(expandedList), ' expanded loci in ', status
    utils.unParseTable(expandedList, output, '\t')

def findMotifs(candidateGenes, projectFolder, projectName, motifConvertFile, motifDatabaseFile):
    '''
    Run the motif search on the extended SE constituents with FIMO
    '''

    print 'MOTIF SEARCH'

    # create a dictionary of motifs keyed on each TF
    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = {}
    motifNames = [line[1] for line in motifDatabase]
    for line in motifDatabase:
        motifDatabaseDict[line[1]] = []
    for line in motifDatabase:
        motifDatabaseDict[line[1]].append(line[0])

    candidateMotifs = []
    for gene in candidateGenes:
        if gene in motifNames:
            candidateMotifs.append(gene)

    print 'Number of annotated candidate TFs that have motifs: ' + str(len(candidateMotifs))
    candidateMotifs = sorted(candidateMotifs)

    # create a background sequence file to use with FIMO
    bgCmd = 'fasta-get-markov -m 1 < ' + projectFolder + projectName + '_SUBPEAKS.fa > ' + projectFolder + projectName + '_bg.meme'
    call(bgCmd, shell=True)

    # run the motif search with FIMO
    fimoCmd = 'fimo'
    for motif in candidateMotifs:
        for x in motifDatabaseDict[motif]:
            fimoCmd += ' --motif ' + "'%s'" % (str(x))
    fimoCmd += ' -verbosity 1'
    fimoCmd += ' -text'
    fimoCmd += ' -oc ' + projectFolder
    fimoCmd += ' --bgfile ' + projectFolder + projectName + '_bg.meme'
    fimoCmd += ' ' + motifDatabaseFile + ' '
    fimoCmd += projectFolder + projectName + '_SUBPEAKS.fa'
    fimoCmd += ' > ' + projectFolder + 'fimo.txt'
    print fimoCmd

    fimoOutput = call(fimoCmd, shell=True)

    return fimoCmd

def filterGFF(gffFile, chromList):
    '''
    takes in a gff and filters out all lines that don't belong to a chrom in the chromList
    '''
    gff = utils.parseTable(gffFile, '\t')
    filteredGFF = []
    excludeList = []
    for line in gff:
        if chromList.count(line[0]) == 1:
            filteredGFF.append(line)
        else:
            excludeList.append(line[0])

    excludeList = utils.uniquify(excludeList)
    if len(excludeList) > 0:
        print("EXCLUDED GFF REGIONS FROM THE FOLLOWING CHROMS: %s" % (','.join(excludeList)))

    return filteredGFF

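# Hypothetical usage of filterGFF, keeping only autosomes and chrX:
#
#   chromList = ['chr%s' % (x) for x in range(1, 23)] + ['chrX']
#   filteredGFF = filterGFF(gffFile, chromList)
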
def makeSECollection(enhancerFile,name,superOnly = True):
    '''
    returns a locus collection from a super table
    '''
    enhancerTable = utils.parseTable(enhancerFile,'\t')
    enhancerLoci = []

    for line in enhancerTable:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue
        else:
            if superOnly and int(line[-1]) == 0:
                #assumes the table lists supers first (ROSE output convention)
                break
            enhancerLoci.append(utils.Locus(line[1],line[2],line[3],'.',name+'_'+line[0]))

    return utils.LocusCollection(enhancerLoci,50)

def makeMedianDict(nameDict):
    '''
    for each dataset returns the median background subtracted enhancer signal
    '''
    medianDict = {}
    for name in nameDict:
        #open up the allEnhancer table
        enhancerTable = utils.parseTable(nameDict[name]['enhancerFile'],'\t')

        #assume header ends after line 5
        enhancerVector = [float(line[6]) - float(line[7]) for line in enhancerTable[6:]]

        medianDict[name] = numpy.median(enhancerVector)

    return medianDict

def makeSECollection(enhancerFile, name, top=0):
    """
    returns a locus collection from a super table
    top gives the number of rows; top=0 returns all rows
    """
    enhancerTable = utils.parseTable(enhancerFile, "\t")
    superLoci = []

    ticker = 0
    for line in enhancerTable:
        if line[0][0] == "#" or line[0][0] == "R":
            continue
        else:
            ticker += 1
            superLoci.append(utils.Locus(line[1], line[2], line[3], ".", name + "_" + line[0]))

            if ticker == top:
                break

    return utils.LocusCollection(superLoci, 50)

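# Illustrative calls for the two makeSECollection variants in this section
# (the file name is hypothetical): the superOnly flavor stops at the first
# non-super row, while the top flavor caps the number of loci returned.
#
#   superCollection = makeSECollection('HG19_AllEnhancers.table.txt', 'NB', True)
#   top500Collection = makeSECollection('HG19_AllEnhancers.table.txt', 'NB', top=500)
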
def generateSubpeakFASTA(gene_to_enhancer_dict, subpeaks, genome, projectName, projectFolder, constExtension):
    '''
    from a BED file of constituents
    generate a FASTA for the constituents contained within the candidate supers
    '''
    genomeDirectory = genome.directory()
    subpeakDict = {}
    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]
    subpeakTable = utils.parseTable(subpeaks, '\t')

    subpeakLoci = [utils.Locus(l[0], int(l[1]), int(l[2]), '.') for l in subpeakTable]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    for gene in gene_to_enhancer_dict.keys():
        subpeakDict[gene] = []
        for region in gene_to_enhancer_dict[gene]:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [utils.makeSearchLocus(x, constExtension, constExtension) for x in overlaps]

            overlapCollectionTemp = utils.LocusCollection(extendedOverlaps, 50)
            overlapCollection = overlapCollectionTemp.stitchCollection()
            for overlap in overlapCollection.getLoci():
                subpeakBED.append([overlap.chr(), overlap.start(), overlap.end()])
                subpeakDict[gene].append(overlap)

    fasta = []
    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:

            fastaTitle = gene + '|' + subpeak.chr() + '|' + str(subpeak.start()) + '|' + str(subpeak.end())
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(), int(subpeak.start() + 1), int(subpeak.end() + 1))

            fasta.append('>' + fastaTitle)
            fasta.append(string.upper(fastaLine))

    return subpeakBED, fasta

def makeRigerTable(foldTableFile,output=''):
    '''
    converts a fold table into a RIGER-formatted input table ranked by log2 fold change
    '''

    #need a table of this format
    rigerTable = [['Construct','GeneSymbol','NormalizedScore','Construct Rank','HairpinWeight']]

    #set weight to 1 for now
    foldTable = utils.parseTable(foldTableFile,'\t')
    constructOrder = utils.order([float(line[2]) for line in foldTable[1:]],decreasing=True)

    #make geneCountDict
    print("making gene count dictionary")
    geneCountDict = defaultdict(int)
    for line in foldTable[1:]:
        geneCountDict[line[1]] += 1

    print("iterating through constructs")
    constructRank = 1
    for i in constructOrder:
        rowIndex = i + 1  #accounts for the header
        geneName = foldTable[rowIndex][1]
        if geneCountDict[geneName] == 1:
            print("Gene %s only has one guide RNA. Excluding from FRIGER analysis" % (geneName))
            continue

        newLine = foldTable[rowIndex][0:3] + [constructRank,1]
        rigerTable.append(newLine)
        constructRank += 1

    if len(output) == 0:
        output = string.replace(foldTableFile,'_log2Ratio.txt','_friger.txt')
    utils.unParseTable(rigerTable,output,'\t')
    return output

# standalone helper script: enforces start <= stop on every line of a BED file
import utils
from sys import argv

filename = argv[1]
outname = filename[:-3] + 'sorted.bed'

bedfile = utils.parseTable(filename, '\t')
out = []

for line in bedfile:
    coords = [int(line[1]), int(line[2])]
    start = min(coords)
    end = max(coords)
    newline = [line[0], start, end] + line[3:]
    out.append(newline)

utils.unParseTable(out, outname, '\t')
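# Invocation (a sketch; the script filename is hypothetical):
#
#   python sortBedCoords.py peaks.bed
#
# writes peaks.sorted.bed next to the input, with start <= stop on every line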
def mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, output, refName):
    '''
    makes a table of factor density in a stitched locus
    and ranks table by number of loci stitched together
    '''
    print('FORMATTING TABLE')
    loci = stitchedCollection.getLoci()

    locusTable = [['REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE']]

    lociLenList = []

    # strip out any that are in chrY
    for locus in list(loci):
        if locus.chr() == 'chrY':
            loci.remove(locus)

    for locus in loci:
        # numLociList.append(int(stitchLocus.ID().split('_')[1]))
        lociLenList.append(locus.len())
        # numOrder = order(numLociList,decreasing=True)

    lenOrder = utils.order(lociLenList, decreasing=True)
    ticker = 0
    for i in lenOrder:
        ticker += 1
        if ticker % 1000 == 0:
            print(ticker)
        locus = loci[i]

        # first get the size of the enriched regions within the stitched locus
        refEnrichSize = 0
        refOverlappingLoci = referenceCollection.getOverlap(locus, 'both')
        for refLocus in refOverlappingLoci:
            refEnrichSize += refLocus.len()

        try:
            stitchCount = int(locus.ID().split('_')[0])
        except ValueError:
            stitchCount = 1

        coords = [int(x) for x in locus.coords()]

        locusTable.append([locus.ID(), locus.chr(), min(coords), max(coords), stitchCount, refEnrichSize])

    print('GETTING MAPPED DATA')
    print("USING A BAMFILE LIST:")
    print(bamFileList)
    for bamFile in bamFileList:

        bamFileName = bamFile.split('/')[-1]

        print('GETTING MAPPING DATA FOR %s' % bamFile)
        # assumes standard convention for naming enriched region gffs

        # opening up the mapped GFF
        print('OPENING %s%s_%s_MAPPED/matrix.txt' % (mappedFolder, refName, bamFileName))

        mappedGFF = utils.parseTable('%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, refName, bamFileName), '\t')

        signalDict = defaultdict(float)
        print('MAKING SIGNAL DICT FOR %s' % (bamFile))
        mappedLoci = []
        for line in mappedGFF[1:]:

            chrom = line[1].split('(')[0]
            start = int(line[1].split(':')[-1].split('-')[0])
            end = int(line[1].split(':')[-1].split('-')[1])
            mappedLoci.append(utils.Locus(chrom, start, end, '.', line[0]))
            try:
                signalDict[line[0]] = float(line[2]) * (abs(end - start))
            except ValueError:
                print('WARNING NO SIGNAL FOR LINE:')
                print(line)
                continue

        mappedCollection = utils.LocusCollection(mappedLoci, 500)
        locusTable[0].append(bamFileName)

        for i in range(1, len(locusTable)):
            signal = 0.0
            line = locusTable[i]
            lineLocus = utils.Locus(line[1], line[2], line[3], '.')
            overlappingRegions = mappedCollection.getOverlap(lineLocus, sense='both')
            for region in overlappingRegions:
                signal += signalDict[region.ID()]
            locusTable[i].append(signal)

    utils.unParseTable(locusTable, output, '\t')
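# Example usage (a sketch; the collections come from the stitching step and the
# mapped matrix files from bamliquidator; all paths here are hypothetical):
#
#   mapCollection(stitchedCollection, referenceCollection,
#                 ['./bams/H3K27AC.bam'], './mappedGFF/',
#                 './TEST_STITCHED_ENHANCER_REGION_MAP.txt',
#                 refName='TEST_12KB_STITCHED')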
def finishRankOutput(dataFile, rankOutput, genome, mergeFolder, mergeName, name1, name2, cutOff=1.5, window=100000, superOnly=True, plotBam=True):
    '''
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained/lost regions
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    # making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    genome = string.upper(genome)

    # make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder + 'output/', True)

    # bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput, '\t')

    # make a new formatted table
    header = rankEnhancerTable[0]
    header[-4] = 'DELTA RANK'
    header[-3] = 'IS_SUPER'
    formattedRankTable = [header]

    # the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    if superOnly:
        enhancerType = 'SUPERS'
    else:
        enhancerType = 'ENHANCERS'

    # the beds
    if superOnly:
        gainedTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome, name2, genome, name2, name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome, name1, name2, genome, name1, name2)
        conservedBed = [[conservedTrackHeader]]

        lostTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome, name1, genome, name1, name2)
        lostBed = [[lostTrackHeader]]
    else:
        gainedTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome, name2, genome, name2, name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s enhancers" description="%s enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome, name1, name2, genome, name1, name2)
        conservedBed = [[conservedTrackHeader]]

        lostTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome, name1, genome, name1, name2)
        lostBed = [[lostTrackHeader]]

    # the genes
    geneTable = [['GENE', 'ENHANCER_ID', 'ENHANCER_CHROM', 'ENHANCER_START', 'ENHANCER_STOP', header[6], header[7], header[8], 'STATUS']]

    for line in rankEnhancerTable[1:]:
        # fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched', '')
        formattedRankTable.append(line)

        # getting the genes
        geneList = []
        geneList += line[9].split(',')
        geneList += line[10].split(',')
        geneList += line[11].split(',')
        geneList = [x for x in geneList if len(x) > 0]
        geneList = utils.uniquify(geneList)
        geneString = string.join(geneList, ',')

        bedLine = [line[1], line[2], line[3], line[0], line[-4]]

        # for gained
        if float(line[6]) > cutOff:
            gffLine = [line[1], line[0], '', line[2], line[3], '', '.', '', geneString]
            gffWindowLine = [line[1], line[0], '', int(line[2]) - window, int(line[3]) + window, '', '.', '', geneString]
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)
        # for lost
        elif float(line[6]) < (-1 * cutOff):
            gffLine = [line[1], line[0], '', line[2], line[3], '', '.', '', geneString]
            gffWindowLine = [line[1], line[0], '', int(line[2]) - window, int(line[3]) + window, '', '.', '', geneString]
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)
        # for conserved
        else:
            geneStatus = 'CONSERVED'
            conservedBed.append(bedLine)

        # now fill in the gene table
        for gene in geneList:
            geneTableLine = [gene, line[0], line[1], line[2], line[3], line[6], line[7], line[8], geneStatus]
            geneTable.append(geneTableLine)

    # concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    # start writing the output
    # there's the two gffs, the bed, the formatted table, the gene table

    # formatted table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (outputFolder, genome, mergeName, enhancerType)
    utils.unParseTable(formattedRankTable, formattedFilename, '\t')

    # gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder + 'gff/', True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder, genome, mergeName, string.upper(name2), enhancerType)
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder, genome, mergeName, string.upper(name2), enhancerType, window / 1000, window / 1000)

    gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder, genome, mergeName, string.upper(name1), enhancerType)
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder, genome, mergeName, string.upper(name1), enhancerType, window / 1000, window / 1000)

    utils.unParseTable(gainedGFF, gffFilename_gained, '\t')
    utils.unParseTable(gainedWindowGFF, gffFilenameWindow_gained, '\t')
    utils.unParseTable(lostGFF, gffFilename_lost, '\t')
    utils.unParseTable(lostWindowGFF, gffFilenameWindow_lost, '\t')

    # bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder, genome, mergeName, enhancerType)
    utils.unParseTable(fullBed, bedFilename, '\t')

    # geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (outputFolder, genome, mergeName, enhancerType)
    utils.unParseTable(geneTable, geneFilename, '\t')

    # finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_%s_DELTA.pdf" % (mergeFolder, name1, outputFolder, genome, mergeName, enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_%s_RANK_PLOT.png" % (mergeFolder, name1, outputFolder, genome, mergeName, enhancerType)
    os.system(cmd)

    # now execute the bamPlot_turbo.py commands
    if plotBam:
        bam1 = dataDict[name1]['bam']
        bam2 = dataDict[name2]['bam']
        bamString = "%s,%s" % (bam1, bam2)
        nameString = "%s,%s" % (name1, name2)
        colorString = "0,0,0:100,100,100"

        # change dir
        os.chdir(pipelineDir)

        if len(gainedGFF) > 0:
            # gained command
            plotTitle = "%s_ONLY_SE" % (name2)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome, bamString, gffFilename_gained, outputFolder, nameString, colorString, plotTitle)
            os.system(cmd)

            # gained window command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name2, window / 1000)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome, bamString, gffFilenameWindow_gained, outputFolder, nameString, colorString, plotTitle)
            os.system(cmd)

        if len(lostGFF) > 0:
            # lost command
            plotTitle = "%s_ONLY_SE" % (name1)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome, bamString, gffFilename_lost, outputFolder, nameString, colorString, plotTitle)
            os.system(cmd)

            # lost window command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name1, window / 1000)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome, bamString, gffFilenameWindow_lost, outputFolder, nameString, colorString, plotTitle)
            os.system(cmd)

    return
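# Example usage (a sketch; the data table and ROSE merge folder follow
# pipeline_dfci conventions and the paths/names here are hypothetical):
#
#   finishRankOutput('./data_table.txt', './merge/TEST_RANK_TABLE.txt', 'hg19',
#                    './merge/', 'TEST_MERGE', 'DMSO', 'JQ1',
#                    cutOff=1.5, window=100000, superOnly=True, plotBam=True)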
def main():
    '''
    main run call
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"

    parser = OptionParser(usage=usage)

    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a comma separated list of .gff or .bed files of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM8,MM9,MM10,HG18,HG19,RN4)")

    # optional flags
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")
    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        print('ERROR: MISSING REQUIRED FLAGS')
        parser.print_help()
        exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)
    inputList = [inputFile for inputFile in options.input.split(',') if len(inputFile) > 1]

    # converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        if inputFile.split('.')[-1] == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][0:-4]  # strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        elif inputFile.split('.')[-1] == 'gff':
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1])
        else:
            print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)

    # GETTING THE LIST OF BAMFILES TO PROCESS
    # either same number of bams for rankby and control
    # or only 1 control
    # or none!
    # bamlist should be all rankby bams followed by control bams
    bamFileList = []
    if options.control:
        controlBamList = [bam for bam in options.control.split(',') if len(bam) > 0]
        rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) > 0]

        if len(controlBamList) == len(rankbyBamList):
            # case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            # case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList * len(rankbyBamList)
        else:
            print('ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE')
            sys.exit()
    else:
        bamFileList = [bam for bam in options.rankby.split(',') if len(bam) > 0]

    # Stitch parameter
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    if tssWindow != 0:
        removeTSS = True
    else:
        removeTSS = False

    # GETTING THE GENOME
    genome = string.upper(options.genome)
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOME TYPE %s' % (genome))
        sys.exit()

    # FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))

    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    # use a simpler unique region naming system
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0], '\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile, '\t')
            gffCollection = utils.gffToLocusCollection(gff, 50)
            inputLoci += gffCollection.getLoci()

        inputCollection = utils.LocusCollection(inputLoci, 50)
        inputCollection = inputCollection.stitchCollection()  # stitches to produce unique regions

        inputGFF = utils.locusCollectionToGFF(inputCollection)

    formattedGFF = []
    # now number things appropriately
    for i, line in enumerate(inputGFF):
        # use the coordinates to make a new id inputname_chr_sense_start_stop
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]

        lineID = '%s_%s' % (inputName, str(i + 1))  # 1 indexing

        newLine = [chrom, lineID, lineID, min(coords), max(coords), '', sense, '', lineID]
        formattedGFF.append(newLine)

    # name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder, string.upper(genome), inputName)
    utils.unParseTable(formattedGFF, masterGFFFile, '\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    # GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile, bamChromList)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        # if it's a bed file
        if maskFile.split('.')[-1].upper() == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskFile.split('.')[-1].upper() == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    # debug: report the stitch window actually used
    print(stitchWindow)
    print(type(stitchWindow))
    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, str(stitchWindow / 1000))
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (gffFolder, inputName, str(stitchWindow / 1000))

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1))

    # MAPPING TO THE STITCHED GFF
    # Try to use the bamliquidator_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamFileListUnique = list(bamFileList)
    bamFileListUnique = utils.uniquify(bamFileListUnique)  # prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)

    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1, inputName + '_MERGED_SIGNAL', controlBams=options.control)

    # now try the merging
    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    rankbyName = inputName + '_MERGED_SIGNAL'
    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (pipeline_dir, outFolder, metaOutputFile, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    print('CALLING GENE MAPPING')

    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)

    # for now don't use ranking bam to call top genes
    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir, genome, outFolder, superTableFile)
    print(cmd)
    os.system(cmd)

    stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName)
    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir, genome, outFolder, stretchTableFile)
    os.system(cmd)

    superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName)
    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir, genome, outFolder, superStretchTableFile)
    os.system(cmd)
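# standard entry point (assumed; ROSE2-style scripts are invoked directly from
# the command line, so a conventional guard is sketched here)
if __name__ == '__main__':
    main()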
def finishRankOutput(dataFile, rankOutput, genome, mergeFolder, mergeName, name1, name2, cutOff=1.5, window=100000):
    """
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained/lost regions
    """
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    # making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    genome = string.upper(genome)

    # make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder + "output/", True)

    # bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput, "\t")

    # make a new formatted table
    header = rankEnhancerTable[0]
    header[-4] = "DELTA RANK"
    header[-3] = "IS_SUPER"
    formattedRankTable = [header]

    # the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    # the beds
    gainedTrackHeader = (
        'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0'
        % (genome, name2, genome, name2, name1)
    )
    gainedBed = [[gainedTrackHeader]]
    conservedTrackHeader = (
        'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0'
        % (genome, name1, name2, genome, name1, name2)
    )
    conservedBed = [[conservedTrackHeader]]

    lostTrackHeader = (
        'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0'
        % (genome, name1, genome, name1, name2)
    )
    lostBed = [[lostTrackHeader]]

    # the genes
    geneTable = [
        [
            "GENE",
            "ENHANCER_ID",
            "ENHANCER_CHROM",
            "ENHANCER_START",
            "ENHANCER_STOP",
            header[6],
            header[7],
            header[8],
            "STATUS",
        ]
    ]

    for line in rankEnhancerTable[1:]:
        # fixing the enhancer ID
        line[0] = line[0].replace("_lociStitched", "")
        formattedRankTable.append(line)

        # getting the genes
        geneList = []
        geneList += line[9].split(",")
        geneList += line[10].split(",")
        geneList += line[11].split(",")
        geneList = [x for x in geneList if len(x) > 0]
        geneList = utils.uniquify(geneList)
        geneString = string.join(geneList, ",")

        bedLine = [line[1], line[2], line[3], line[0], line[-4]]

        # for gained
        if float(line[6]) > cutOff:
            gffLine = [line[1], line[0], "", line[2], line[3], "", ".", "", geneString]
            gffWindowLine = [
                line[1],
                line[0],
                "",
                int(line[2]) - window,
                int(line[3]) + window,
                "",
                ".",
                "",
                geneString,
            ]
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)
        # for lost
        elif float(line[6]) < (-1 * cutOff):
            gffLine = [line[1], line[0], "", line[2], line[3], "", ".", "", geneString]
            gffWindowLine = [
                line[1],
                line[0],
                "",
                int(line[2]) - window,
                int(line[3]) + window,
                "",
                ".",
                "",
                geneString,
            ]
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)
        # for conserved
        else:
            geneStatus = "CONSERVED"
            conservedBed.append(bedLine)

        # now fill in the gene table
        for gene in geneList:
            geneTableLine = [gene, line[0], line[1], line[2], line[3], line[6], line[7], line[8], geneStatus]
            geneTable.append(geneTableLine)

    # concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    # start writing the output
    # there's the two gffs, the bed, the formatted table, the gene table

    # formatted table
    formattedFilename = "%s%s_%s_MERGED_SUPERS_RANK_TABLE.txt" % (outputFolder, genome, mergeName)
    utils.unParseTable(formattedRankTable, formattedFilename, "\t")

    # gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder + "gff/", True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff" % (gffFolder, genome, mergeName, string.upper(name2))
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff" % (
        gffFolder,
        genome,
        mergeName,
        string.upper(name2),
        window / 1000,
        window / 1000,
    )

    gffFilename_lost = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff" % (gffFolder, genome, mergeName, string.upper(name1))
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff" % (
        gffFolder,
        genome,
        mergeName,
        string.upper(name1),
        window / 1000,
        window / 1000,
    )

    utils.unParseTable(gainedGFF, gffFilename_gained, "\t")
    utils.unParseTable(gainedWindowGFF, gffFilenameWindow_gained, "\t")
    utils.unParseTable(lostGFF, gffFilename_lost, "\t")
    utils.unParseTable(lostWindowGFF, gffFilenameWindow_lost, "\t")

    # bed
    bedFilename = "%s%s_%s_MERGED_SUPERS.bed" % (outputFolder, genome, mergeName)
    utils.unParseTable(fullBed, bedFilename, "\t")

    # geneTable
    geneFilename = "%s%s_%s_MERGED_SUPERS_GENE_TABLE.txt" % (outputFolder, genome, mergeName)
    utils.unParseTable(geneTable, geneFilename, "\t")

    # finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_SUPERS_DELTA.pdf" % (mergeFolder, name1, outputFolder, genome, mergeName)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_SUPERS_RANK_PLOT.png" % (
        mergeFolder,
        name1,
        outputFolder,
        genome,
        mergeName,
    )
    os.system(cmd)

    # now execute the bamPlot_turbo.py commands
    bam1 = dataDict[name1]["bam"]
    bam2 = dataDict[name2]["bam"]
    bamString = "%s,%s" % (bam1, bam2)
    nameString = "%s,%s" % (name1, name2)
    colorString = "0,0,0:100,100,100"

    # change dir
    os.chdir("/ark/home/cl512/pipeline/")

    if len(gainedGFF) > 0:
        # gained command
        plotTitle = "%s_ONLY_SE" % (name2)
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome,
            bamString,
            gffFilename_gained,
            outputFolder,
            nameString,
            colorString,
            plotTitle,
        )
        os.system(cmd)

        # gained window command
        plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name2, window / 1000)
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome,
            bamString,
            gffFilenameWindow_gained,
            outputFolder,
            nameString,
            colorString,
            plotTitle,
        )
        os.system(cmd)

    if len(lostGFF) > 0:
        # lost command
        plotTitle = "%s_ONLY_SE" % (name1)
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome,
            bamString,
            gffFilename_lost,
            outputFolder,
            nameString,
            colorString,
            plotTitle,
        )
        os.system(cmd)

        # lost window command
        plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name1, window / 1000)
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome,
            bamString,
            gffFilenameWindow_lost,
            outputFolder,
            nameString,
            colorString,
            plotTitle,
        )
        os.system(cmd)

    return