def makeFoldTable(annotFile, analysisName, testName, controlName, testMMR,
                  controlMMR, testIdxFile, controlIdxFile, outputFolder,
                  epsilon=1):
    '''
    makes the guide level log2 fold change table (test vs. control) and
    writes it to disk

    for every row of the test idx table the count (3rd column) is divided by
    testMMR and offset by epsilon; the row at the same position of the
    control idx table gets the same treatment with controlMMR, and
    log2(test/control) is taken
    rows are written in the order of the test idx table (no sorting is done here)

    output columns: guideID, gene name, log2 ratio, normalized test count,
    normalized control count (both counts rounded to 4 decimals)
    returns the output path: outputFolder + analysisName + '_log2Ratio.txt'
    '''
    # only the guide -> gene half of the annotation is needed here
    guideToGene, _ = makeAnnotDict(annotFile)

    testRows = utils.parseTable(testIdxFile, '\t')
    controlRows = utils.parseTable(controlIdxFile, '\t')

    foldTable = [['GUIDE_ID', 'GENE', 'LOG2_RATIO', testName, controlName]]

    # control rows are paired with test rows purely by position
    for rowIndex, testRow in enumerate(testRows):
        guideID = testRow[0]
        gene = guideToGene[guideID]

        normTest = float(testRow[2]) / testMMR + epsilon
        normControl = float(controlRows[rowIndex][2]) / controlMMR + epsilon

        foldTable.append([
            guideID,
            gene,
            numpy.log2(normTest / normControl),
            round(normTest, 4),
            round(normControl, 4),
        ])

    outputFile = '%s%s_log2Ratio.txt' % (outputFolder, analysisName)
    utils.unParseTable(foldTable, outputFile, '\t')
    return outputFile
def mapBams(bamFileList, splitGFFPath, analysisName, mappedFolder):
    '''
    maps every bam in bamFileList to the regions in splitGFFPath using
    bamliquidator and collects the signal into a single table

    for each bam the mapping is skipped when utils.checkOutput already finds
    <mappedFolder><analysisName>_<bamName>_MAPPED/matrix.txt; otherwise the
    bamliquidator command is run through os.system and the process exits if
    the matrix still cannot be found afterwards

    returns the signal table:
      - more than one bam: a header row ['REGION_ID', 'locusLine', bam names...]
        followed by the rows of the first bam's matrix, with the 3rd column of
        every other bam's matrix appended to the row at the same position
      - a single bam: that bam's parsed matrix.txt as is
    assumes all matrix.txt files list the same regions in the same order -- TODO confirm
    '''

    def matrixPath(bamFile):
        # location of the bamliquidator matrix for one bam
        bamFileName = bamFile.split('/')[-1]
        return '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, analysisName, bamFileName)

    print("MAPPING TO THE FOLLOWING BAMS:")

    for bamFile in bamFileList:
        print(bamFile)
        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, analysisName, bamFileName)
        mappedOut1File = matrixPath(bamFile)

        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (splitGFFPath, mappedOut1File))
            continue

        cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
            splitGFFPath, mappedOut1Folder, bamFile)
        print(cmd1)
        os.system(cmd1)

        if utils.checkOutput(mappedOut1File, 0.2, 5):
            print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (splitGFFPath, bamFileName))
        else:
            print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (splitGFFPath, bamFileName))
            sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')

    # a single bam needs no merging: its matrix is the signal table
    if len(bamFileList) <= 1:
        return utils.parseTable(matrixPath(bamFileList[0]), '\t')

    # set up the table using the first bam
    signalTable = [['REGION_ID', 'locusLine'] + [name.split('/')[-1] for name in bamFileList]]
    firstTable = utils.parseTable(matrixPath(bamFileList[0]), '\t')
    for i in range(1, len(firstTable)):
        signalTable.append(firstTable[i])

    # append the signal column of every remaining bam, row by row
    # (the original looped over range(1, len(mappedTable[i])) with a stale i,
    # i.e. over the width of one row instead of the number of rows)
    for bamFile in bamFileList[1:]:
        mappedTable = utils.parseTable(matrixPath(bamFile), '\t')
        for i in range(1, len(mappedTable)):
            signalTable[i].append(mappedTable[i][2])

    return signalTable
def addLengths(gene_table_path, peak_table_path):
    '''
    adds TSS_LENGTH and DISTAL_LENGTH columns to a gene table

    for every peak row (header skipped) the genes are read from the last
    column (14 column rows) or the last two columns (15 column rows); rows of
    any other width are ignored
    column 5 (0-based 4) of the peak is summed per gene, into the tss total
    when column 6 equals 1 and into the distal total otherwise

    the output path is the gene table path with 'GENE_TABLE' replaced by
    'GENE_TABLE_LENGTH'; the path is printed and returned
    '''
    output_path = gene_table_path.replace('GENE_TABLE', 'GENE_TABLE_LENGTH')
    print(output_path)

    tss_lengths = defaultdict(int)
    distal_lengths = defaultdict(int)

    for peak in utils.parseTable(peak_table_path, '\t')[1:]:
        # collect the comma separated gene names for this peak
        width = len(peak)
        if width == 15:
            raw_genes = peak[-1].split(',') + peak[-2].split(',')
        elif width == 14:
            raw_genes = peak[-1].split(',')
        else:
            continue

        genes = utils.uniquify([name for name in raw_genes if len(name) > 0])

        for gene in genes:
            if int(peak[5]) == 1:
                tss_lengths[gene] += int(peak[4])
            else:
                distal_lengths[gene] += int(peak[4])

    # now fill out the gene table
    gene_rows = utils.parseTable(gene_table_path, '\t')
    output_table = [gene_rows[0] + ['TSS_LENGTH', 'DISTAL_LENGTH']]
    for row in gene_rows[1:]:
        gene = row[0]
        output_table.append(row + [tss_lengths[gene], distal_lengths[gene]])

    utils.unParseTable(output_table, output_path, '\t')
    return output_path
def callMergeSupers(dataFile,superFile1,superFile2,name1,name2,mergeName,genome,parentFolder):
    '''
    merges the enhancer regions of two datasets and runs rose on the merged
    regions, unless the rose output is already there

    returns the path of the merged ENHANCER_TO_GENE table
    exits the process if the table cannot be found after rose was run
    '''
    genomeTag = genome.upper()
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (parentFolder,genomeTag,mergeName)

    roseOutput = "%s%s_ROSE/%s_%s_MERGED_REGIONS_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt" % (parentFolder,name1,genomeTag,mergeName)

    # being able to parse the output means a previous run already produced it
    try:
        utils.parseTable(roseOutput,'\t')
    except IOError:
        pass  # nothing there yet, fall through and build it
    else:
        print("ROSE OUTPUT ALREADY FOUND HERE %s" % (roseOutput))
        return roseOutput

    print("MERGING ENHANCER REGIONS FROM %s and %s" % (superFile1,superFile2))
    mergedGFF = mergeCollections(superFile1,superFile2,name1,name2,mergedGFFFile)

    # write the rose bash script for the merged regions and run it
    roseBashFile = callRoseMerged(dataFile,mergedGFF,name1,name2,parentFolder)
    print('i can has rose bash file %s' % (roseBashFile))
    os.system('bash %s' % (roseBashFile))

    # check for and return output
    if not utils.checkOutput(roseOutput,1,30):
        print("ERROR: ROSE CALL ON MERGED REGIONS FAILED")
        sys.exit()
    return roseOutput
def makeSEDict(enhancerFile, name, superOnly=True):
    '''
    makes an attribute dict for enhancers keyed by uniqueID

    lines whose first field starts with '#' are skipped; a line whose first
    field starts with 'R' is taken as the header and must contain an
    'isSuper' column
    NOTE(review): a data row whose ID starts with 'R' would also be taken as
    a header -- confirm region IDs never do

    enhancerFile: path of the tab delimited enhancer table
    name: prefix for the IDs, keys are name + '_' + first column
    superOnly: when True only rows with isSuper == 1 are kept

    returns {enhancerID: {'rank': int}} where rank is read from the second
    to last column
    raises ValueError if superOnly is set and a data row shows up before any
    header line (the isSuper column is then unknown)
    '''
    seDict = {}
    supColumn = None

    for line in utils.parseTable(enhancerFile, '\t'):
        # blank lines carry no information
        if not line or not line[0]:
            continue

        firstChar = line[0][0]
        if firstChar == '#':
            continue
        if firstChar == 'R':
            supColumn = line.index('isSuper')
            continue

        if superOnly:
            if supColumn is None:
                raise ValueError(
                    'no header with an isSuper column found before data in %s'
                    % (enhancerFile))
            if int(line[supColumn]) != 1:
                continue

        seDict[name + '_' + line[0]] = {'rank': int(line[-2])}

    return seDict
def makeEnhancerSignalTable(mergedRegionMap,medianDict,analysisName,genome,outputFolder):
    '''
    makes a table where each row is an enhancer and each column is the
    background subtracted signal of one dataset divided by that dataset's
    median (negative differences are set to 0 before dividing)

    the first 6 columns of the region map are copied over; dataset k (in the
    key order of medianDict) is read from columns 6 + 2k (signal) and
    7 + 2k (control)
    NOTE(review): the column pairing relies on the iteration order of
    medianDict matching the column order of the region map -- confirm

    writes <outputFolder><genome>_<analysisName>_signalTable.txt and returns its path
    '''
    # load in the region map
    regionRows = utils.parseTable(mergedRegionMap,'\t')
    namesList = list(medianDict.keys())

    signalTable = [['REGION_ID','CHROM','START','STOP','NUM_LOCI','CONSTITUENT_SIZE'] + namesList]

    for row in regionRows[1:]:
        outRow = row[0:6]
        for position, datasetName in enumerate(namesList):
            signalColumn = 6 + 2 * position
            backgroundColumn = signalColumn + 1

            corrected = float(row[signalColumn]) - float(row[backgroundColumn])
            if corrected < 0:
                corrected = 0

            outRow.append(corrected / medianDict[datasetName])
        signalTable.append(outRow)

    outputFile = "%s%s_%s_signalTable.txt" % (outputFolder,genome,analysisName)
    print("WRITING MEDIAN NORMALIZED SIGNAL TABLE TO %s" % (outputFile))
    utils.unParseTable(signalTable,outputFile,'\t')
    return outputFile
def collapseRegionMap(regionMapFile, name='', controlBams=False):
    '''
    collapses the signal columns of a regionMap file (column 7 onwards) into
    a single column

    the header keeps its first 6 fields plus the new column name
    ('MERGED_SIGNAL' when name is empty)
    without controls the new value is the mean of all signal columns
    with controlBams the first half of the signal columns is paired with the
    second half, each pair contributes max(0, signal - control) and the mean
    of those values is used

    the output path is the input path with 'REGION' replaced by 'META';
    the table is written there and the path is returned
    '''
    regionMap = utils.parseTable(regionMapFile, '\t')

    for rowIndex, row in enumerate(regionMap):
        if rowIndex == 0:
            # new header
            if len(name) == 0:
                name = 'MERGED_SIGNAL'
            regionMap[rowIndex] = row[0:6] + [name]
            continue

        keptFields = list(row[0:6])
        values = [float(x) for x in row[6:]]

        if controlBams:
            # first half of the columns are the rankby bams, second half the controls
            half = len(values) // 2
            # min signal is 0
            corrected = [
                max(0, signal - control)
                for signal, control in zip(values[:half], values[half:])
            ]
            metaSignal = numpy.mean(corrected)
        else:
            metaSignal = numpy.mean(values)

        regionMap[rowIndex] = keptFields + [metaSignal]

    outputFile = regionMapFile.replace('REGION', 'META')
    utils.unParseTable(regionMap, outputFile, '\t')
    return (outputFile)
def makeBedCollection(bedFileList):
    '''
    loads every bed file of bedFileList into one big locus collection

    only lines with at least 3 fields whose first field starts with 'chr'
    are used; lines that raise a ValueError while being converted (e.g.
    non integer coordinates) are silently skipped
    each locus gets the bed file's name (basename up to the first '.') as
    its ID and its coordinates are ordered so that start <= end

    returns utils.LocusCollection(loci, 50)
    '''
    collected = []
    print("MAKING BED COLLECTION FOR:")

    for bedPath in bedFileList:
        bedName = bedPath.split('/')[-1].split('.')[0]
        print(bedName)

        for fields in utils.parseTable(bedPath, '\t'):
            if len(fields) < 3:
                continue
            # check that the first field looks like a chromosome
            if fields[0][0:3] != 'chr':
                continue
            try:
                coords = [int(fields[1]), int(fields[2])]
                locus = utils.Locus(fields[0], min(coords), max(coords), '.', bedName)
                collected.append(locus)
            except ValueError:
                # best effort: lines that cannot be converted are dropped
                continue

    print("IDENTIFIED %s BED REGIONS" % (len(collected)))
    return utils.LocusCollection(collected, 50)
def loadGenome(genome_build,config_file = ''):
    '''
    loads annotation for a genome into a genome object

    genome_build: build name, case insensitive (HG19, RN6 and MM10 have
                  built in defaults)
    config_file: optional tab delimited file (first line skipped) whose
                 first column holds 'build:field:path' entries that overwrite
                 or add fields; a build that has no defaults can be defined
                 this way as long as it provides 'genome_directory' and
                 'annot_file'

    returns a Genome built from the build's genome_directory and annot_file
    with the features tf_file, mask, motif_convert and motif_database added
    for every corresponding field the build defines
    prints an error and exits the process for an unsupported build
    '''
    #this nested dictionary has all of the useful information and likely will have to be
    #edited so it can be configured any time
    genome_build = genome_build.upper()

    genomeDict = {
        'HG19':{'annot_file':'%sannotation/hg19_refseq.ucsc' % (pipeline_dir),
                'genome_directory':'/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/',
                'tf_file':'%s/annotation/TFlist_NMid_hg19.txt' % (whereAmI),
                'mask_file':'/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Annotation/Masks/hg19_encode_blacklist.bed',
                'motif_convert':'%s/annotation/MotifDictionary.txt' % (whereAmI),
                'motif_database':'%s/annotation/VertebratePWMs.txt' % (whereAmI),
                },
        'RN6':{'annot_file':'%sannotation/rn6_refseq.ucsc' % (pipeline_dir),
               'genome_directory':'/storage/cylin/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Chromosomes/',
               'tf_file':'%s/annotation/TFlist_NMid_rn6.txt' % (whereAmI),
               'motif_convert':'%s/annotation/MotifDictionary.txt' % (whereAmI),
               'motif_database':'%s/annotation/VertebratePWMs.txt' % (whereAmI),
               },
        'MM10':{'annot_file':'%sannotation/mm10_refseq.ucsc' % (pipeline_dir),
                'genome_directory':'/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm10/Sequence/Chromosomes/',
                'tf_file':'%s/annotation/TFlist_NMid_mm10.txt' % (whereAmI),
                'motif_convert':'%s/annotation/MotifDictionary.txt' % (whereAmI),
                'motif_database':'%s/annotation/VertebratePWMs.txt' % (whereAmI),
                },
        }

    #allow an optional config file to overwrite default paths
    if len(config_file) > 0:
        config_table = utils.parseTable(config_file,'\t')
        for line in config_table[1:]:
            # split on the first two ':' only so the path itself may contain ':'
            (build,field,feature_path) = line[0].split(':', 2)
            # setdefault: an entry for a build without defaults must not raise
            # a KeyError before the supported build check below
            genomeDict.setdefault(build.upper(), {})[field.lower()] = feature_path

    if genome_build not in genomeDict:
        print('ERROR: UNSUPPORTED GENOME BUILD %s. EXITING NOW' % (genome_build))
        sys.exit()

    build_fields = genomeDict[genome_build]
    print('USING BUILD %s WITH FOLLOWING FIELDS:' % (genome_build))
    print(build_fields)

    #now attempt to load the genome
    genome = Genome(genome_build,build_fields['genome_directory'],build_fields['annot_file'])

    #adding additional optional features: (feature name on the genome, field name)
    #the defaults only define a mask for HG19, other builds get one when the
    #config file provides a mask_file
    optional_features = [
        ('tf_file','tf_file'),
        ('mask','mask_file'),
        ('motif_convert','motif_convert'),
        ('motif_database','motif_database'),
        ]
    for feature_name,field in optional_features:
        if field in build_fields:
            genome.addFeature(feature_name,build_fields[field])

    return genome
def createExpressionDict(annotationFile, projectFolder, projectName, refseqToNameDict, expCutoff,expressionFile=''):
    '''
    input: an activity table with refseq in first column and the activity
    value in the third column (second column when there is no third)
    when expressionFile is empty, projectFolder + 'bamliquidator/matrix.txt' is used

    the highest value per refseq ID and per gene name is kept; the cutoff is
    the expCutoff percentile of the per gene values and every refseq ID
    strictly above it counts as expressed
    the expressed gene names and refseq IDs are written to
    <projectFolder><projectName>_EXPRESSED_GENES.txt and _EXPRESSED_NM.txt

    output: (list of expressed refseq IDs, dictionary keyed by refseq that
    points to activity)
    '''
    print('CREATING EXPRESSION DICTIONARY')

    if expressionFile:
        expressionFilename = expressionFile
    else:
        expressionFilename = projectFolder + 'bamliquidator/matrix.txt'

    expressionTable = utils.parseTable(expressionFilename, '\t')

    bestByNM = {}
    bestByGene = {}

    for row in expressionTable[1:]:
        nmID = row[0]
        geneName = refseqToNameDict[nmID]

        try:
            value = float(row[2])
        except IndexError:
            value = float(row[1])

        # keep the higher value when an NMid or gene shows up more than once
        if nmID not in bestByNM or value > bestByNM[nmID]:
            bestByNM[nmID] = value
        if geneName not in bestByGene or value > bestByGene[geneName]:
            bestByGene[geneName] = value

    cutoff = numpy.percentile(list(bestByGene.values()), expCutoff)
    print('Expression cutoff: ' + str(cutoff))

    expressedNM = [nmID for nmID in bestByNM if float(bestByNM[nmID]) > cutoff]
    expressedGenes = [refseqToNameDict[nmID] for nmID in expressedNM]

    expressedGenes = utils.uniquify(expressedGenes)
    Genefilename = projectFolder + projectName + '_EXPRESSED_GENES.txt'
    utils.unParseTable(expressedGenes, Genefilename, '')

    expressedNM = utils.uniquify(expressedNM)
    NMfilename = projectFolder + projectName + '_EXPRESSED_NM.txt'
    utils.unParseTable(expressedNM, NMfilename, '')

    return expressedNM, bestByNM
def scoreValley(locus, bamFile, projectName, projectFolder):
    '''
    calculates a 0/1 valley score for every bin of a locus

    the locus is split into locus.len() // 10 bins, bamliquidator is called
    on the region (output goes to
    <projectFolder>tempBamliquidator_<projectName>.txt) and the densities are
    smoothed with gaussianSmooth(density, 5)

    a bin scores 1 when (highest shoulder - bin signal) / region max > 0.3
    the left shoulder is the max of bins [i-25, i-10), the right shoulder the
    max of bins [i+10, i+25); a shoulder that is not available counts as an
    edge, and with both shoulders missing the shoulder height is 0

    returns the list of scores, one per smoothed bin (empty if there are none)
    '''
    nbins = locus.len() // 10

    #call bamliquidator on the region and store in a temp file
    tempPath = projectFolder + 'tempBamliquidator_' + projectName + '.txt'
    os.system('bamliquidator ' + bamFile + ' ' + locus.chr() + ' ' +
              str(locus.start()) + ' ' + str(locus.end()) + ' . ' +
              str(nbins) + ' 0 > ' + tempPath)

    x = utils.parseTable(tempPath, '\t')
    density = [int(y[0]) for y in x]
    smoothDensity = gaussianSmooth(density, 5)

    nSmoothed = len(smoothDensity)
    if nSmoothed == 0:
        return []

    regionMax = max(smoothDensity)

    #Now take the smooth reads and calculate a valley score
    scoreArray = []
    for i in range(nSmoothed):
        # the left window is only used when it lies fully inside the region:
        # a negative slice bound would wrap around and read the shoulder
        # from the right end of the array
        # NOTE(review): the right window is still used when it is cut short
        # by the end of the region, as before -- confirm this asymmetry is wanted
        if i >= 25:
            leftWindow = smoothDensity[i - 25:i - 10]
        else:
            leftWindow = []
        rightWindow = smoothDensity[i + 10:i + 25]

        shoulders = [max(window) for window in (leftWindow, rightWindow)
                     if len(window) > 0]
        if shoulders:
            shoulderHeightMax = max(shoulders)
        else:
            shoulderHeightMax = 0

        # a region without any signal has no valleys
        if regionMax > 0:
            ratio = (shoulderHeightMax - float(smoothDensity[i])) / regionMax
        else:
            ratio = 0

        if ratio > 0.3:
            scoreArray.append(1)
        else:
            scoreArray.append(0)

    return scoreArray
def buildGraph(projectFolder, projectName, motifConvertFile, refseqToNameDict, canidateGenes):
    '''
    reads the FIMO output at <projectFolder>FIMO/fimo.txt and builds the
    networkX directed graph

    nodes: the canidate genes
    edges: for every FIMO line (header skipped) gene of the motif (1st
    column, translated with the motif convert file) -> gene of the region
    (first '|' separated part of the 3rd column, translated with refseqToNameDict)

    also writes one bed per source gene to <projectFolder>motifBED/ holding
    the motif positions (region start + the 4th / 5th FIMO column)

    returns the graph
    '''
    # motif name -> gene name (the convert file lists motif name, gene name)
    motifToGene = {}
    for motifName, geneName in ((row[0], row[1]) for row in utils.parseTable(motifConvertFile, '\t')):
        motifToGene[motifName] = geneName

    fimoTable = utils.parseTable(projectFolder + 'FIMO/fimo.txt', '\t')

    graph = nx.DiGraph(name=projectName)
    graph.add_nodes_from(canidateGenes)

    hitsBySource = defaultdict(list)
    for hit in fimoTable[1:]:
        source = motifToGene[hit[0]]  # gene the motif belongs to

        regionFields = hit[2].split('|')
        target = refseqToNameDict[regionFields[0]]  # gene name corresponding to the NMid

        graph.add_edge(source, target)

        regionStart = regionFields[2]
        hitsBySource[source].append((
            regionFields[1],
            int(regionStart) + int(hit[3]),
            int(regionStart) + int(hit[4]),
        ))

    utils.formatFolder(projectFolder + 'motifBED/', True)

    for gene, hits in hitsBySource.items():
        if not hits:
            continue
        bed = [[chrom, start, stop] for chrom, start, stop in hits]
        filename = projectFolder + 'motifBED/' + gene + '_' + projectName + '_motifs.bed'
        utils.unParseTable(bed, filename, '\t')

    return graph
def makeEnhancerSignalTable(nameDict, mergedRegionMap, medianDict, analysisName, genome, outputFolder):
    '''
    makes a table where each row is an enhancer and each column is the
    signal of one dataset divided by that dataset's median

    datasets are taken in sorted name order and read sequentially from
    column 7 of the region map onwards: a dataset whose
    nameDict[name]['background'] equals True uses two columns (signal, then
    control, which is subtracted), any other dataset uses one
    negative signals are set to 0 before dividing by medianDict[name]

    writes <outputFolder><genome>_<analysisName>_signalTable.txt and returns its path
    a row that is too short for a dataset with background is printed and the
    process exits
    '''
    # load in the region map
    regionRows = utils.parseTable(mergedRegionMap, '\t')

    namesList = sorted(nameDict.keys())

    signalTable = [['REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE'] + namesList]
    print("len of %s for namesList" % (len(namesList)))
    print(namesList)

    for row in regionRows[1:]:
        outRow = row[0:6]

        # cursor over the data columns, advanced by 1 or 2 per dataset
        column = 6
        for datasetName in namesList:
            if nameDict[datasetName]['background'] == True:
                enhancerIndex = column
                controlIndex = column + 1
                column += 2
                try:
                    enhancerSignal = float(row[enhancerIndex]) - float(row[controlIndex])
                except IndexError:
                    print(row)
                    print(len(row))
                    print(enhancerIndex)
                    print(controlIndex)
                    sys.exit()
            else:
                enhancerIndex = column
                column += 1
                enhancerSignal = float(row[enhancerIndex])

            if enhancerSignal < 0:
                enhancerSignal = 0

            outRow.append(enhancerSignal / medianDict[datasetName])

        signalTable.append(outRow)

    outputFile = "%s%s_%s_signalTable.txt" % (outputFolder, genome, analysisName)
    print("WRITING MEDIAN NORMALIZED SIGNAL TABLE TO %s" % (outputFile))
    utils.unParseTable(signalTable, outputFile, '\t')
    return outputFile
def summarizeVenn(mapped_path, group_list=['CG', 'THMYCN'], output=''):
    '''
    summarizes occupancy across groups to make a venn diagram

    a column of the mapped table belongs to a group when its header contains
    the group name; per row and group the max of the group's columns (as int)
    is taken
    every 0/1 combination over the groups is then counted as the number of
    rows whose group values equal that combination exactly

    the count table (header: group names + 'count') is written to output
    when output is given (nothing is returned then), otherwise it is returned
    '''
    mapped_table = utils.parseTable(mapped_path, '\t')
    header = mapped_table[0]

    group_table = [['GFF_LINE', 'ID'] + group_list]

    # column indexes of every group, in group_list order
    group_cols = []
    for group in group_list:
        matching_names = [col_name for col_name in header if group in col_name]
        group_cols.append([header.index(col_name) for col_name in matching_names])
    print(group_cols)

    for row in mapped_table[1:]:
        # one value per group
        occupancy = [max([int(row[col]) for col in cols]) for cols in group_cols]
        group_table.append(row[0:2] + occupancy)

    print(group_table[0:5])

    # now add up the stats
    # build every 0/1 combination by extending each partial combination
    # with a 1 and then a 0, once per additional group
    binary_combinations = [[0], [1]]
    for _ in range(len(group_list) - 1):
        extended = []
        for partial in binary_combinations:
            print(partial)
            extended.append(list(partial) + [1])
            extended.append(list(partial) + [0])
        binary_combinations = list(extended)

    print(binary_combinations)

    count_table = [group_list + ['count']]
    for combo in binary_combinations:
        n_rows = sum(1 for row in group_table[1:] if row[2:] == combo)
        count_table.append(combo + [n_rows])

    print(count_table)

    if len(output) > 0:
        utils.unParseTable(count_table, output, '\t')
    else:
        return count_table
def assignEnhancerRank(enhancerToGeneFile, enhancerFile1, enhancerFile2, name1, name2, rankOutput=''):
    '''
    adds two columns (<name1>_rank, <name2>_rank) to the enhancerToGene table

    for every row a locus is built from columns 2-4 (ID from column 1) and
    the best (lowest) rank among the overlapping enhancers of each enhancer
    file is looked up; without any overlap the rank is the size of that
    file's enhancer collection, i.e. dead last

    returns the updated table when rankOutput is empty, otherwise writes it
    to rankOutput (nothing is returned then)
    '''
    enhancerToGene = utils.parseTable(enhancerToGeneFile, '\t')

    enhancerCollection1 = makeSECollection(enhancerFile1, name1, False)
    enhancerCollection2 = makeSECollection(enhancerFile2, name2, False)

    enhancerDict1 = makeSEDict(enhancerFile1, name1, False)
    enhancerDict2 = makeSEDict(enhancerFile2, name2, False)

    def bestRank(collection, rankDict, locus):
        # lowest rank among the overlapping enhancers, dead last without overlap
        hits = collection.getOverlap(locus, 'both')
        if len(hits) == 0:
            return len(collection)
        return min([rankDict[hit.ID()]['rank'] for hit in hits])

    # we're going to update the enhancerToGeneTable in place
    enhancerToGene[0] += ['%s_rank' % name1, '%s_rank' % name2]

    for row in enhancerToGene[1:]:
        locusLine = utils.Locus(row[1], row[2], row[3], '.', row[0])
        rank1 = bestRank(enhancerCollection1, enhancerDict1, locusLine)
        rank2 = bestRank(enhancerCollection2, enhancerDict2, locusLine)
        row += [rank1, rank2]

    if len(rankOutput) == 0:
        return enhancerToGene
    utils.unParseTable(enhancerToGene, rankOutput, '\t')
def findMotifs(canidateGenes, projectFolder, projectName, motifConvertFile, motifDatabaseFile):
    '''
    runs FIMO for the motifs of all canidate genes on the subpeak fasta

    the canidate genes are restricted to those that have a motif in the
    motif convert file (1st column motif name, 2nd column gene name)
    a background model is made with fasta-get-markov from
    <projectFolder><projectName>_SUBPEAKS.fa, then fimo is run on the same
    fasta with the output redirected to <projectFolder>FIMO/fimo.txt
    both commands are run through the shell and waited for

    returns the fimo command string that was run
    '''
    # local import so the function does not depend on a file level import
    import subprocess

    # Create a dictionary to call motif names keyed on gene names
    motifDatabaseDict = {}
    for line in utils.parseTable(motifConvertFile, '\t'):
        motifDatabaseDict.setdefault(line[1], []).append(line[0])

    print('GENERATING TF NETWORK')

    # select the TF candidates that have motifs
    canidateMotifs = sorted(
        gene for gene in canidateGenes if gene in motifDatabaseDict)
    print('Number of annotated canidate TFs that have motifs: ' + str(
        len(canidateMotifs)))

    subpeakFasta = projectFolder + projectName + '_SUBPEAKS.fa'
    # NOTE(review): the suffix of the background file was missing from the
    # source this was restored from; '_bg.meme' is a reconstruction. it is
    # written and read only within this function -- confirm nothing else
    # expects a different name
    bgFile = projectFolder + projectName + '_bg.meme'

    bgCmd = 'fasta-get-markov -m 1 < ' + subpeakFasta + ' > ' + bgFile
    subprocess.call(bgCmd, shell=True)

    utils.formatFolder(projectFolder + 'FIMO/', True)

    fimoParts = ['fimo']
    for TF in canidateMotifs:
        print(TF)
        for motif in motifDatabaseDict[TF]:
            fimoParts.append(' --motif ' + "'%s'" % (str(motif)))

    fimoParts.append(' -verbosity 1')
    fimoParts.append(' -text')
    fimoParts.append(' -oc ' + projectFolder + 'FIMO')
    fimoParts.append(' --bgfile ' + bgFile)
    fimoParts.append(' ' + motifDatabaseFile + ' ')
    fimoParts.append(subpeakFasta)
    fimoParts.append(' > ' + projectFolder + 'FIMO/fimo.txt')
    fimoCmd = ''.join(fimoParts)

    # subprocess.call blocks until fimo is done
    subprocess.call(fimoCmd, shell=True)

    return fimoCmd
def filterPeaks(tabixFolder,mycTablePath,outputPath,repeatList = []):
    '''
    measures how much of every peak is covered by each repeat class and
    writes the result to outputPath

    tabixFolder: folder holding hg19_<repeatClass>_category_sorted.gff.gz
    mycTablePath: tab delimited peak table (first line skipped, lines whose
                  ID starts with 'P' skipped); columns used: ID, chrom,
                  start, stop, length
    repeatList: repeat classes to query, defaults to LINE, LTR, Simple_repeat

    output columns: PEAK_ID, CHROM, START, STOP, LENGTH followed by one
    column per repeat class holding the summed overlap of the tabix hits
    (clipped to the peak) divided by the peak length, rounded to 4 decimals
    '''
    if len(repeatList) == 0:
        repeatList = ['LINE','LTR','Simple_repeat']
    repeatList = list(repeatList)

    repeatTable = [['PEAK_ID','CHROM','START','STOP','LENGTH'] + repeatList]

    mycTable = utils.parseTable(mycTablePath,'\t')
    ticker = 0
    for line in mycTable[1:]:
        if line[0][0] == 'P':
            continue
        # progress report every 100 peaks
        if ticker % 100 == 0:
            print(ticker)
        ticker += 1

        peak_ID = line[0]
        chrom = line[1]
        start = int(line[2])
        stop = int(line[3])
        length = line[4]
        locusString = '%s:%s-%s' % (chrom,start,stop)

        repeatFractions = []
        for repeatClass in repeatList:
            tabixGFF = '%shg19_%s_category_sorted.gff.gz' % (tabixFolder,repeatClass)
            tabixCmd = 'tabix %s %s' % (tabixGFF,locusString)

            tabix = subprocess.Popen(tabixCmd,stdin = subprocess.PIPE,stderr = subprocess.PIPE,stdout = subprocess.PIPE,shell = True,universal_newlines = True)
            # communicate() drains stdout and stderr together and waits for
            # the process; reading only stdout can block once the unread
            # stderr pipe fills up and never reaps the child
            tabixOut,_ = tabix.communicate()

            # tabix hands back the overlapping gff lines
            overlapFraction = 0.0
            for hit in tabixOut.splitlines():
                gffFields = hit.rstrip().split('\t')
                # clip the repeat to the peak before measuring it
                hitStart = max(start,int(gffFields[3]))
                hitStop = min(stop,int(gffFields[4]))
                overlapFraction += float(hitStop - hitStart)/float(length)

            repeatFractions.append(round(overlapFraction,4))

        repeatTable.append([peak_ID,chrom,start,stop,length] + repeatFractions)

    utils.unParseTable(repeatTable,outputPath,'\t')
def generateSubpeakFASTA(TFtoEnhancerDict, subpeaks, genomeDirectory, projectName, projectFolder, constExtension):
    '''
    from a BED file of constituents generates a FASTA of the constituents
    that overlap the regions listed for each gene in TFtoEnhancerDict

    per region the overlapping constituents are extended by constExtension
    on both sides and stitched; every stitched locus is added to
    <projectFolder><projectName>_subpeaks.bed (after a track line) and its
    sequence is fetched with utils.fetchSeq for
    <projectFolder><projectName>_SUBPEAKS.fa, titled gene|chr|start|end
    '''
    bedRows = [['track name=' + projectName + ' color=204,0,204']]

    constituentLoci = [
        utils.Locus(fields[0], int(fields[1]), int(fields[2]), '.')
        for fields in utils.parseTable(subpeaks, '\t')
    ]
    constituentCollection = utils.LocusCollection(constituentLoci, 50)

    lociByGene = {}
    for gene in TFtoEnhancerDict.keys():
        geneLoci = []
        lociByGene[gene] = geneLoci
        for region in TFtoEnhancerDict[gene]:
            hits = constituentCollection.getOverlap(region)
            extendedHits = [
                utils.makeSearchLocus(hit, constExtension, constExtension)
                for hit in hits
            ]
            stitched = utils.LocusCollection(extendedHits, 50).stitchCollection()
            for locus in stitched.getLoci():
                bedRows.append([locus.chr(), locus.start(), locus.end()])
                geneLoci.append(locus)

    bedfilename = projectFolder + projectName + '_subpeaks.bed'
    utils.unParseTable(bedRows, bedfilename, '\t')

    fasta = []
    for gene in lociByGene:
        for subpeak in lociByGene[gene]:
            fastaTitle = '|'.join(
                [gene, subpeak.chr(), str(subpeak.start()), str(subpeak.end())])
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(),
                                       int(subpeak.start() + 1),
                                       int(subpeak.end() + 1))
            fasta.append('>' + fastaTitle)
            # NOTE(review): upper is not defined in this block, presumably
            # string.upper imported at file level -- confirm
            fasta.append(upper(fastaLine))

    outname = projectFolder + projectName + '_SUBPEAKS.fa'
    utils.unParseTable(fasta, outname, '')
def findMotifs(subpeakFasta, bg_path, candidate_tf_list, projectFolder, analysis_name, motifConvertFile, motifDatabaseFile):
    '''
    runs FIMO for all motifs of the candidate TFs on the subpeak fasta

    subpeakFasta: fasta to scan
    bg_path: background model handed to fimo via --bgfile
    candidate_tf_list: TF names; sorted in place and printed. every name
                       must be present in the 2nd column of the motif
                       convert file, otherwise a KeyError is raised
    motifConvertFile: tab delimited, 1st column motif name, 2nd column gene name
    motifDatabaseFile: motif database handed to fimo

    the fimo command is written to a bash script in <projectFolder>FIMO/ and
    run through the shell (the call waits for fimo to finish); fimo's stdout
    is redirected to <FIMO folder><fasta basename>_fimo.txt

    returns the path of that output file
    '''
    # local import so the function does not depend on a file level import
    import subprocess

    fimoFolder = utils.formatFolder(projectFolder + 'FIMO/', True)
    subpeak_name = subpeakFasta.split('/')[-1].split('.')[0]
    output = '%s%s_fimo.txt' % (fimoFolder, subpeak_name)

    # create a dict keyed by TF with multiple motifs
    motifDatabaseDict = {}
    for line in utils.parseTable(motifConvertFile, '\t'):
        motifDatabaseDict.setdefault(line[1], []).append(line[0])

    candidate_tf_list.sort()
    print(candidate_tf_list)

    # now make a list of all motifs
    motif_list = []
    for tf in candidate_tf_list:
        motif_list += motifDatabaseDict[tf]
    motif_list = utils.uniquify(motif_list)

    # NOTE(review): the format string was empty in the source this was
    # restored from (which raises a TypeError with two arguments);
    # '%s%s_fimo.sh' is a reconstruction -- confirm the script name
    fimo_bash_path = '%s%s_fimo.sh' % (fimoFolder, analysis_name)

    fimoParts = ['fimo']
    for motif in motif_list:
        fimoParts.append(' --motif ' + "'%s'" % (str(motif)))

    # add ' --thresh 1e-5' here to increase stringency
    fimoParts.append(' -verbosity 1')
    fimoParts.append(' -text')
    fimoParts.append(' -oc ' + projectFolder + 'FIMO')
    fimoParts.append(' --bgfile %s' % (bg_path))
    fimoParts.append(' ' + motifDatabaseFile + ' ')
    fimoParts.append(subpeakFasta)
    fimoParts.append(' > ' + output)
    fimoCmd = ''.join(fimoParts)
    print(fimoCmd)

    # keep a copy of the command as a bash script
    with open(fimo_bash_path, 'w') as fimo_bash:
        fimo_bash.write('#!/usr/bin/bash\n\n')
        fimo_bash.write(fimoCmd)

    # subprocess.call blocks until fimo is done
    subprocess.call(fimoCmd, shell=True)

    return output
def getMedianSignalEnhancer(enhancerFile, name, dataFile):
    '''
    returns the median of the 7th column of an enhancer file, taken over
    all lines after the first 6

    the data table is loaded with pipeline_dfci.loadDataTable but not used
    for the calculation; name is not used either
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    enhancerRows = utils.parseTable(enhancerFile, '\t')

    signals = []
    for row in enhancerRows[6:]:
        signals.append(float(row[6]))

    return numpy.median(signals)
def averagingMappedSignal(mapped_list, output_path, setName):
    '''
    averages signal across a set of mapped gffs and writes the new output

    mapped_list: paths of the mapped tables; the first one defines the rows
                 and columns of the output
    output_path: where the averaged table is written
    setName: suffix for the bin column names (bin_<n>_<setName>)

    rows of the first table that hold no signal (2 fields or fewer) are
    left out; every other cell is the average over all tables at the same
    row/column, rounded to 4 decimals and floored at 0
    a cell that is missing in any of the tables is reported and written as nan

    returns output_path
    '''
    #create a list containing all of the tables
    table_list = [utils.parseTable(mapped_path, '\t') for mapped_path in mapped_list]
    reference = table_list[0]
    n_total_cols = len(reference[0])

    #first set up the output header
    output_header = ['GENE_ID', 'locusLine']
    for n in range(n_total_cols - 2):
        output_header.append('bin_%s_%s' % (n + 1, setName))
    output_table = [output_header]

    #rows are built and filtered in a single pass so that the gene ID/locus
    #line and the signal of a row can never get out of step
    for i in range(1, len(reference)):
        line = reference[i]
        if len(line) <= 2:
            continue

        new_row = list(line[0:2])
        for j in range(2, n_total_cols):
            try:
                signal_vector = [float(table[i][j]) for table in table_list]
            except IndexError:
                #do not fall back on the values of the previous column
                print('WARNING: no signal at row %s column %s in at least one table' % (i, j))
                print(line)
                new_row.append(float('nan'))
                continue
            new_row.append(max(round(numpy.average(signal_vector), 4), 0))

        output_table.append(new_row)

    print(len(reference))
    print(len(output_table))
    utils.unParseTable(output_table, output_path, '\t')
    return output_path
def make_probe_to_gene_dict(annotFile, array_1_path, array_2_path):
    '''
    makes a dict keyed by probe ID w/ gene as value

    only probes whose gene name (last column of the array tables) is among
    the common names of the annotation are kept; the probe ID is read from
    the 5th column and lines with fewer than 5 fields are skipped

    the dict is cached as a pickle at
    <projectFolder>oberthuer_outcome/probe_dict.pkl and loaded from there
    when utils.checkOutput finds it
    NOTE(review): the pickle is loaded without any validation -- it must
    only ever come from this function
    '''
    #see if it already exists
    pickle_path = '%soberthuer_outcome/probe_dict.pkl' % (projectFolder)
    if utils.checkOutput(pickle_path, 0, 0):
        print('loading previously made probe dict at %s' % (pickle_path))
        with open(pickle_path, 'rb') as pickle_file:
            return pickle.load(pickle_file)

    #we want to intersect refseq common names w/ the array
    startDict = utils.makeStartDict(annotFile)
    #a set keeps the per probe lookup constant time
    ref_names = set(startDict[refID]['name'] for refID in startDict)

    probe_gene_dict = {}

    array_1 = utils.parseTable(array_1_path, '\t')
    array_2 = utils.parseTable(array_2_path, '\t')

    for line in array_1 + array_2:
        if len(line) < 5:
            continue
        probe_id = line[4]
        name = line[-1]
        if name in ref_names:
            probe_gene_dict[probe_id] = name

    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(probe_gene_dict, pickle_file)

    return probe_gene_dict
def makeSignalDict(mappedGFFFile, controlMappedGFFFile=''):
    '''
    makes a dict (defaultdict(float)) of signal keyed by the first column
    of the mapped gff

    the signal is the 3rd column of every line after the header; when a
    control file is given, the 3rd column of the control line at the same
    position is subtracted and negative results are set to 0.0
    '''
    print('\t called makeSignalDict on %s (ctrl: %s)' % (mappedGFFFile, controlMappedGFFFile))

    signalDict = defaultdict(float)

    mappedRows = utils.parseTable(mappedGFFFile, '\t')

    useControl = len(controlMappedGFFFile) > 0
    if useControl:
        controlRows = utils.parseTable(controlMappedGFFFile, '\t')

    for rowIndex, row in enumerate(mappedRows):
        if rowIndex == 0:
            continue  # header
        if useControl:
            value = float(row[2]) - float(controlRows[rowIndex][2])
            if value < 0:
                value = 0.0
        else:
            value = float(row[2])
        signalDict[row[0]] = value

    return signalDict
def getSignalVector(regionFile, name, dataFile):
    '''
    returns the signal column of a dataset from a region file as a list of floats

    the column is found by looking up the file name of the dataset's bam
    (dataDict[name]['bam']) in the header of the region file; all lines
    after the header are used
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    regionRows = utils.parseTable(regionFile, '\t')

    # the header names the columns after the bam files
    bamName = dataDict[name]['bam'].split('/')[-1]
    signalColumn = regionRows[0].index(bamName)

    signalVector = []
    for row in regionRows[1:]:
        signalVector.append(float(row[signalColumn]))
    return signalVector
def makePeakGFFs(peak_path_list):
    '''
    makes a stitched gff for all MYC bound TSS and Distal regions across
    all datasets

    every peak table (header skipped) contributes one locus per line
    (columns 2-4); a line goes to the distal set when column 6 equals 0 and
    to the tss set otherwise
    both sets are stitched and written to
    <gffFolder>HG19_MYC_TSS_REGIONS_-0_+0.gff and
    <gffFolder>HG19_MYC_DISTAL_REGIONS_-0_+0.gff

    nothing is recomputed when utils.checkOutput finds both gffs
    returns (tss gff path, distal gff path)
    '''
    #setting the output
    tss_gff_path = '%sHG19_MYC_TSS_REGIONS_-0_+0.gff' % (gffFolder)
    distal_gff_path = '%sHG19_MYC_DISTAL_REGIONS_-0_+0.gff' % (gffFolder)

    #check to see if already done
    if utils.checkOutput(tss_gff_path,0.1,0.1) and utils.checkOutput(distal_gff_path,0.1,0.1):
        print('OUTPUT FOUND AT %s and %s' % (tss_gff_path,distal_gff_path))
        return tss_gff_path,distal_gff_path

    #empty loci lists to hold everything
    tss_loci = []
    distal_loci = []

    for peak_path in peak_path_list:
        print('processing %s' % (peak_path))
        peak_table = utils.parseTable(peak_path,'\t')
        for line in peak_table[1:]:
            peak_locus = utils.Locus(line[1],line[2],line[3],'.')
            if int(line[5]) == 0:
                distal_loci.append(peak_locus)
            else:
                tss_loci.append(peak_locus)

    #now combine the loci
    print('stitching loci')
    stitched_distal_collection = utils.LocusCollection(distal_loci,50).stitchCollection()
    stitched_tss_collection = utils.LocusCollection(tss_loci,50).stitchCollection()

    #now make the gffs from the stitched collections
    #(the unstitched collections were written before, leaving the stitching unused)
    distal_gff = utils.locusCollectionToGFF(stitched_distal_collection)
    tss_gff = utils.locusCollectionToGFF(stitched_tss_collection)

    #now write to disk
    utils.unParseTable(distal_gff,distal_gff_path,'\t')
    utils.unParseTable(tss_gff,tss_gff_path,'\t')

    return tss_gff_path,distal_gff_path
def callMergeSupers(dataFile, superFile1, superFile2, name1, name2, mergeName, genome, parentFolder):
    '''
    merges the enhancer regions of two datasets and runs rose on the merged
    regions, unless the rose output is already there

    returns the path of the merged ENHANCER_TO_GENE table; when rose did not
    write it under the expected name, the rose folder
    (<parentFolder><name1>_ROSE/) is searched with getFile for a file
    matching '_SuperEnhancers_ENHANCER_TO_GENE.txt' and that result is returned
    exits the process if the rose folder holds no (non hidden) files
    '''
    genomeTag = genome.upper()
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (
        parentFolder, genomeTag, mergeName)

    roseOutput = "%s%s_ROSE/%s_%s_MERGED_REGIONS_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt" % (
        parentFolder, name1, genomeTag, mergeName)

    # being able to parse the output means a previous run already produced it
    try:
        utils.parseTable(roseOutput, '\t')
    except IOError:
        pass  # nothing there yet, fall through and build it
    else:
        print("ROSE OUTPUT ALREADY FOUND HERE %s" % (roseOutput))
        return roseOutput

    print("MERGING ENHANCER REGIONS FROM %s and %s" % (superFile1, superFile2))
    mergedGFF = mergeCollections(superFile1, superFile2, name1, name2, mergedGFFFile)

    # write the rose bash script for the merged regions and run it
    roseBashFile = callRoseMerged(dataFile, mergedGFF, name1, name2, parentFolder)
    print('i can has rose bash file %s' % (roseBashFile))
    os.system('bash %s' % (roseBashFile))

    # check for and return output
    if utils.checkOutput(roseOutput, 1, 10):
        return roseOutput

    # try finding it w/ a different name
    roseFolder = "%s%s_ROSE/" % (parentFolder, name1)
    roseFileList = [x for x in os.listdir(roseFolder) if x[0] != '.']  # no hidden files
    if len(roseFileList) == 0:
        print("No files found in %s" % (roseFolder))
        sys.exit()

    enhancerToGeneFile = getFile(
        '_SuperEnhancers_ENHANCER_TO_GENE.txt', roseFileList, roseFolder)
    # hand the located file back; it used to be looked up and then dropped,
    # so this branch returned None
    return enhancerToGeneFile
def makeAnnotDict(annotFile):
    '''
    builds two lookups from a tab-delimited annotation table

    the first row is skipped as a header; in every other row column 0 is
    stored as the value for column 1 (the guide ID key) and column 1 is
    appended to the list kept under column 0.

    returns (guideDict, geneDict): a defaultdict(str) keyed by guide ID and
    a defaultdict(list) of guide IDs keyed by the column 0 value
    '''
    guide_to_gene = defaultdict(str)
    gene_to_guides = defaultdict(list)

    annot_rows = utils.parseTable(annotFile, '\t')
    for row in annot_rows[1:]:
        gene, guide = row[0], row[1]
        guide_to_gene[guide] = gene
        gene_to_guides[gene].append(guide)

    return guide_to_gene, gene_to_guides
def writeSplitBeds(bed, analysisName, outputFolder, window=50, centered=False):
    '''
    splits the regions of a tab-delimited bed (chrom, start, stop, id) into
    derived bed files written to outputFolder with analysisName as prefix

    centered falsy: writes the region bodies (_BODY_-0_+0.bed), the `window`
    bases before each start (_UPSTREAM_...) and the `window` bases after each
    stop (_DOWNSTREAM_...), printing the row count of each table.
    centered truthy: writes one bed of +/- `window` around the integer
    midpoint of each region (_CENTERED_...).
    '''
    dmrList = utils.parseTable(bed, '\t')
    prefix = outputFolder + analysisName
    windowTag = '_-' + str(window) + '_+' + str(window) + '.bed'

    if centered:
        centeredBed = []
        for line in dmrList:
            dmrID = line[3]
            #floor division keeps integer coordinates on python 2 and 3
            center = (int(line[1]) + int(line[2])) // 2
            centeredBed.append([line[0], center - window, center + window, dmrID])
        utils.unParseTable(centeredBed, prefix + '_CENTERED' + windowTag, '\t')
        return

    dmrBed = []
    startsBed = []
    endsBed = []
    for line in dmrList:
        chrom = line[0]
        start = int(line[1])
        stop = int(line[2])
        dmrID = line[3]
        dmrBed.append([chrom, start, stop, dmrID])
        startsBed.append([chrom, start - window, start, dmrID])
        endsBed.append([chrom, stop, stop + window, dmrID])

    print(len(dmrBed))
    utils.unParseTable(dmrBed, prefix + '_BODY_-0_+0.bed', '\t')
    print(len(startsBed))
    utils.unParseTable(startsBed, prefix + '_UPSTREAM' + windowTag, '\t')
    print(len(endsBed))
    utils.unParseTable(endsBed, prefix + '_DOWNSTREAM' + windowTag, '\t')
def make_mycn_regions(conserved_rank_path):
    '''
    takes the conserved NB MYCN region table and writes a gff and a bed of
    the regions plus a gff and a bed of the same regions extended by 500
    on each side

    the first table row is skipped as a header; columns 0-3 are used as
    id, chrom, start, stop. Files go to the module-level gffFolder and
    bedFolder, all four paths are printed.

    returns (gff path, flanked gff path)
    '''
    flank = 500
    conserved_rank_table = utils.parseTable(conserved_rank_path,'\t')

    regions_gff = []
    regions_bed = []
    flanked_gff = []
    flanked_bed = []

    for row in conserved_rank_table[1:]:
        region_id, chrom, start, stop = row[0], row[1], row[2], row[3]

        #result is not used; kept so rows still pass through the Locus constructor
        locus_line = utils.Locus(chrom,start,stop,'.')

        stop_i = int(stop)
        start_i = int(start)
        if stop_i < start_i:
            print('uh oh')
            print(row)

        regions_gff.append([chrom,region_id,'',start,stop,'','.','',region_id])
        regions_bed.append([chrom,start,stop,region_id])

        flanked_gff.append([chrom,region_id,'',start_i-flank,stop_i+flank,'','.','',region_id])
        flanked_bed.append([chrom,start_i-flank,stop_i+flank,region_id])

    mycn_gff_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    mycn_flank_gff_path = '%sHG19_NB_MYCN_CONSERVED_-500_+500.gff' % (gffFolder)
    mycn_bed_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.bed' % (bedFolder)
    mycn_flank_bed_path = '%sHG19_NB_MYCN_CONSERVED_-500_+500.bed' % (bedFolder)

    outputs = [
        (regions_gff,mycn_gff_path),
        (flanked_gff,mycn_flank_gff_path),
        (regions_bed,mycn_bed_path),
        (flanked_bed,mycn_flank_bed_path),
    ]

    #writing to disk
    for table,path in outputs:
        utils.unParseTable(table,path,'\t')

    for _,path in outputs:
        print(path)

    return mycn_gff_path,mycn_flank_gff_path
def getMedianSignal(enhancerFile, name, dataFile):
    """
    returns the median enhancer signal of a file

    enhancerFile: tab-delimited enhancer table; rows before index 6 are
    skipped as header. name: dataset key in the data table loaded from
    dataFile.

    When the dataset's "background" entry is itself a key of the data table,
    the value used per row is column 6 minus column 7; otherwise column 6
    alone.
    """
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    enhancerTable = utils.parseTable(enhancerFile, "\t")

    backgroundName = dataDict[name]["background"]
    # `in` replaces dict.has_key, which no longer exists on python 3
    if backgroundName in dataDict:
        enhancerVector = [float(line[6]) - float(line[7]) for line in enhancerTable[6:]]
    else:
        enhancerVector = [float(line[6]) for line in enhancerTable[6:]]

    median = numpy.median(enhancerVector)
    return median
def generateSubpeakFASTA(TFandSuperDict, subpeaks, genomeDirectory, projectName, projectFolder, motifExtension):
    '''
    takes as input a BED file of constituents
    outputs a FASTA file of merged extended super-enhancer consituents and associated formated name

    TFandSuperDict: maps a gene name to the regions (loci) assigned to it.
    subpeaks: path of a tab-delimited bed (chrom, start, stop).
    For every region the overlapping subpeaks are extended by motifExtension
    on both sides and stitched. The stitched loci are written to
    <projectFolder><projectName>_subpeaks.bed and their upper-cased
    sequences to <projectFolder><projectName>_SUBPEAKS.fa with
    'gene|chrom|start|end' titles.
    '''
    print('MAKE FASTA')

    subpeakDict = {}
    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]
    subpeakTable = utils.parseTable(subpeaks, '\t')

    subpeakLoci = [utils.Locus(row[0], int(row[1]), int(row[2]), '.') for row in subpeakTable]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    for gene in TFandSuperDict:
        subpeakDict[gene] = []
        for region in TFandSuperDict[gene]:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [utils.makeSearchLocus(x, motifExtension, motifExtension) for x in overlaps]

            overlapCollectionTemp = utils.LocusCollection(extendedOverlaps, 50)
            overlapCollection = overlapCollectionTemp.stitchCollection()
            for overlap in overlapCollection.getLoci():
                subpeakBED.append([overlap.chr(), overlap.start(), overlap.end()])
                subpeakDict[gene].append(overlap)

    bedfilename = projectFolder + projectName + '_subpeaks.bed'
    utils.unParseTable(subpeakBED, bedfilename, '\t')

    fasta = []
    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:
            fastaTitle = gene + '|' + subpeak.chr() + '|' + str(subpeak.start()) + '|' + str(subpeak.end())
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(), int(subpeak.start()+1), int(subpeak.end()+1))

            fasta.append('>' + fastaTitle)
            # str.upper() replaces the bare upper() call, which depended on a
            # name this function never defines
            fasta.append(fastaLine.upper())

    # Output the fasta file of extended SE constituents
    outname = projectFolder + projectName + '_SUBPEAKS.fa'
    utils.unParseTable(fasta, outname, '')
def getExpanded(locusTable, expansion, status, output):
    '''
    collects the loci that expansionStat reports for the requested status
    and writes them to `output` as a tab-delimited table

    locusTable: path of a tab-delimited table whose columns 0-3 hold the WT
    locus and columns 4-7 the MUT locus.
    expansion: forwarded to expansionStat as its `expansion` keyword.
    status: 'WT' compares WT against MUT, 'MUT' compares MUT against WT;
    any other value collects nothing.
    '''
    loci = utils.parseTable(locusTable, '\t')

    expandedList = []
    for line in loci:
        wtLocus = line[0:4]
        mutLocus = line[4:8]

        # honor the caller's expansion; it used to be hard-coded to 0.1 and
        # the parameter was silently ignored
        if status == 'WT':
            newLine = expansionStat(wtLocus, mutLocus, expansion=expansion)
        elif status == 'MUT':
            newLine = expansionStat(mutLocus, wtLocus, expansion=expansion)
        else:
            continue

        if len(newLine) > 0:
            expandedList.append(newLine)

    # formatted so the text matches the old print statement on python 2 and 3
    print('%s %s %s' % (len(expandedList), ' expanded loci in ', status))
    utils.unParseTable(expandedList, output, '\t')
def findMotifs(candidateGenes, projectFolder, projectName, motifConvertFile, motifDatabaseFile):
    '''Run the motif search on the extended SE constituents with FIMO

    motifConvertFile: tab-delimited table with the motif ID in column 0 and
    the TF name in column 1. Candidate genes that appear as a TF name have
    all their motif IDs passed to fimo as --motif arguments.

    Runs fasta-get-markov on <projectFolder><projectName>_SUBPEAKS.fa to
    build the background file, then runs fimo with its text output
    redirected to <projectFolder>fimo.txt. Both commands go through the
    shell.

    returns the fimo command string that was executed
    '''
    print('MOTIF SEARCH')

    # Create a dictionary of motif keyed on each TF
    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = {}
    for line in motifDatabase:
        motifDatabaseDict.setdefault(line[1], []).append(line[0])

    # dict membership replaces the linear scan of a name list
    canidateMotifs = [gene for gene in candidateGenes if gene in motifDatabaseDict]

    print('Number of annotated candidate TFs that have motifs: ' + str(len(canidateMotifs)))
    canidateMotifs = sorted(canidateMotifs)

    fastaFile = projectFolder + projectName + '_SUBPEAKS.fa'
    # NOTE(review): the background file path carries no suffix; the original
    # concatenated an empty string here - confirm nothing was lost
    bgFile = projectFolder + projectName + ''

    # Create a backgroud sequence file to use with FIMO
    bgCmd = 'fasta-get-markov -m 1 < ' + fastaFile + ' > ' + bgFile
    call(bgCmd, shell=True)

    # Run the motif search with FIMO
    cmdParts = ['fimo']
    for motif in canidateMotifs:
        for x in motifDatabaseDict[motif]:
            cmdParts.append(' --motif ' + "'%s'" % (str(x)))

    cmdParts.append(' -verbosity 1')
    cmdParts.append(' -text')
    cmdParts.append(' -oc ' + projectFolder)
    cmdParts.append(' --bgfile ' + bgFile)
    cmdParts.append(' ' + motifDatabaseFile + ' ')
    cmdParts.append(fastaFile)
    cmdParts.append(' > ' + projectFolder + 'fimo.txt')
    fimoCmd = ''.join(cmdParts)
    print(fimoCmd)

    call(fimoCmd, shell=True)

    return fimoCmd
def filterGFF(gffFile,chromList):
    '''
    takes in a gff and filters out all lines that don't belong to a chrom in the chromList

    gffFile: path of a tab-delimited gff; column 0 is the chrom.
    chromList: chrom names to keep.

    returns the kept gff lines in file order. The distinct chroms that were
    dropped are printed when there are any.
    '''
    gff = utils.parseTable(gffFile,'\t')

    # membership test: a chrom listed more than once in chromList is still
    # kept, where the old count(...) == 1 test dropped it
    allowedChroms = set(chromList)

    filteredGFF = []
    excludeList = []
    for line in gff:
        if line[0] in allowedChroms:
            filteredGFF.append(line)
        else:
            excludeList.append(line[0])

    excludeList = utils.uniquify(excludeList)
    if len(excludeList) > 0:
        print("EXCLUDED GFF REGIONS FROM THE FALLING CHROMS: %s" % (','.join(excludeList)))

    return filteredGFF
def makeMedianDict(nameDict):
    '''
    returns a dict mapping each dataset name to the median of
    (column 6 - column 7) over its enhancer table

    nameDict[name]['enhancerFile'] is the path of the tab-delimited table;
    rows before index 6 are treated as header and skipped
    '''
    header_rows = 6
    medians = {}

    for dataset in nameDict:
        table = utils.parseTable(nameDict[dataset]['enhancerFile'],'\t')

        signal = []
        for row in table[header_rows:]:
            signal.append(float(row[6]) - float(row[7]))

        medians[dataset] = numpy.median(signal)

    return medians
def generateSubpeakFASTA(gene_to_enhancer_dict, subpeaks, genome, projectName, projectFolder, constExtension):
    '''
    from a BED file of constituents
    generate a FASTA for the consituients contained within the canidate supers

    gene_to_enhancer_dict: maps a gene name to its enhancer regions (loci).
    subpeaks: path of a tab-delimited bed (chrom, start, stop).
    For every region the overlapping subpeaks are extended by constExtension
    on both sides and stitched.

    returns (subpeakBED, fasta): a bed table starting with a track line, and
    a flat list alternating '>gene|chrom|start|end' titles with upper-cased
    sequences
    '''
    # the original statement read `genomeDirectory = subpeakDict = {}`, which
    # bound the genome directory to the same dict as subpeakDict and left the
    # `genome` argument unused
    # NOTE(review): assumes `genome` exposes a directory() accessor - confirm
    genomeDirectory = genome.directory()

    subpeakDict = {}
    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]
    subpeakTable = utils.parseTable(subpeaks, '\t')

    subpeakLoci = [utils.Locus(row[0], int(row[1]), int(row[2]), '.') for row in subpeakTable]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    for gene in gene_to_enhancer_dict:
        subpeakDict[gene] = []
        for region in gene_to_enhancer_dict[gene]:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [utils.makeSearchLocus(x, constExtension, constExtension) for x in overlaps]

            overlapCollectionTemp = utils.LocusCollection(extendedOverlaps, 50)
            overlapCollection = overlapCollectionTemp.stitchCollection()
            for overlap in overlapCollection.getLoci():
                subpeakBED.append([overlap.chr(), overlap.start(), overlap.end()])
                subpeakDict[gene].append(overlap)

    fasta = []
    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:
            fastaTitle = gene + '|' + subpeak.chr() + '|' + str(subpeak.start()) + '|' + str(subpeak.end())
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(), int(subpeak.start()+1), int(subpeak.end()+1))

            fasta.append('>' + fastaTitle)
            # str.upper() works on python 2 and 3; string.upper is python 2 only
            fasta.append(fastaLine.upper())

    return subpeakBED,fasta
def makeRigerTable(foldTableFile,output=''):
    '''
    converts a fold table into a ranked construct table and writes it to disk

    foldTableFile: tab-delimited table with a header row, then construct ID,
    gene and score in columns 0-2.
    output: destination path; when empty it is derived from foldTableFile by
    replacing '_log2Ratio.txt' with '_friger.txt'.

    Constructs are ordered by decreasing column 2 and ranked from 1. Genes
    with exactly one construct are reported and left out without consuming a
    rank. HairpinWeight is always 1.

    returns the output path
    '''
    #need a table of this format
    rigerTable = [['Construct','GeneSymbol','NormalizedScore','Construct Rank','HairpinWeight']]
    #set weight to 1 for now

    foldTable = utils.parseTable(foldTableFile,'\t')

    constructOrder = utils.order([float(line[2]) for line in foldTable[1:]],decreasing=True)

    #make geneCountDict
    print("making gene count dictionary")
    geneCountDict = defaultdict(int)
    for line in foldTable[1:]:
        geneCountDict[line[1]] += 1

    print("iterating through constructs")
    constructRank = 1
    for i in constructOrder:
        rowIndex = i+1 # accounts for the header
        geneName = foldTable[rowIndex][1]
        if geneCountDict[geneName] == 1:
            print("Gene %s only has one guide RNA. Excluding from FRIGER analysis" % (geneName))
            continue

        newLine = foldTable[rowIndex][0:3] + [constructRank,1]
        rigerTable.append(newLine)
        constructRank += 1

    if len(output) == 0:
        #str.replace works on python 2 and 3; string.replace is python 2 only
        output = foldTableFile.replace('_log2Ratio.txt','_friger.txt')

    utils.unParseTable(rigerTable,output,'\t')

    return output
import utils
from sys import argv

# Rewrites the bed given as the first command line argument so that every
# row has start <= end. The result goes next to the input, with the last
# three characters of the input name replaced by 'sorted.bed'.
filename = argv[1]
outname = filename[:-3] + 'sorted.bed'

bedfile = utils.parseTable(filename, '\t')

out = []
for row in bedfile:
    first = int(row[1])
    second = int(row[2])
    # swap the coordinates only when they arrive in descending order
    if second < first:
        first, second = second, first
    out.append([row[0], first, second] + row[3:])

utils.unParseTable(out, outname, '\t')
def mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, output, refName):
    '''
    makes a table of factor density in a stitched locus and ranks table by number of loci stitched together

    stitchedCollection / referenceCollection: locus collections of the
    stitched regions and of the original bound regions.
    bamFileList: bam paths; one signal column is appended per entry, named
    after the bam file name.
    mappedFolder, refName: the per-bam signal is read from
    <mappedFolder><refName>_<bamFileName>_MAPPED/matrix.txt.
    output: path the finished tab-delimited table is written to.

    chrY loci are dropped. Rows are ordered by decreasing locus length. The
    signal of a region is the sum over all overlapping mapped loci of
    (matrix column 2 * locus length).
    '''
    print('FORMATTING TABLE')

    # strip out any that are in chrY
    # a filtered copy avoids both the quadratic list.remove() loop and
    # mutating the list handed back by getLoci()
    loci = [locus for locus in stitchedCollection.getLoci() if locus.chr() != 'chrY']

    locusTable = [['REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE']]

    lociLenList = [locus.len() for locus in loci]
    lenOrder = utils.order(lociLenList, decreasing=True)

    for ticker, i in enumerate(lenOrder, 1):
        if ticker % 1000 == 0:
            print(ticker)

        locus = loci[i]

        # First get the size of the enriched regions within the stitched locus
        refOverlappingLoci = referenceCollection.getOverlap(locus, 'both')
        refEnrichSize = sum(refLocus.len() for refLocus in refOverlappingLoci)

        # the ID prefix before the first '_' is used as the stitch count when
        # it parses as an int; otherwise the count defaults to 1
        try:
            stitchCount = int(locus.ID().split('_')[0])
        except ValueError:
            stitchCount = 1

        coords = [int(x) for x in locus.coords()]
        locusTable.append([locus.ID(), locus.chr(), min(coords), max(coords), stitchCount, refEnrichSize])

    print('GETTING MAPPED DATA')
    print("USING A BAMFILE LIST:")
    print(bamFileList)
    for bamFile in bamFileList:

        bamFileName = bamFile.split('/')[-1]

        print('GETTING MAPPING DATA FOR  %s' % bamFile)
        # assumes standard convention for naming enriched region gffs

        # opening up the mapped GFF
        matrixFile = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, refName, bamFileName)
        print('OPENING %s' % (matrixFile))
        mappedGFF = utils.parseTable(matrixFile, '\t')

        signalDict = defaultdict(float)
        print('MAKING SIGNAL DICT FOR %s' % (bamFile))
        mappedLoci = []
        for line in mappedGFF[1:]:
            # column 1 is split on '(' for the chrom and on ':' then '-' for
            # the start and end coordinates
            chrom = line[1].split('(')[0]
            span = line[1].split(':')[-1].split('-')
            start = int(span[0])
            end = int(span[1])
            mappedLoci.append(utils.Locus(chrom, start, end, '.', line[0]))
            try:
                signalDict[line[0]] = float(line[2]) * (abs(end - start))
            except ValueError:
                # the locus is kept; its signal falls back to the default 0.0
                print('WARNING NO SIGNAL FOR LINE:')
                print(line)

        mappedCollection = utils.LocusCollection(mappedLoci, 500)
        locusTable[0].append(bamFileName)

        for i in range(1, len(locusTable)):
            signal = 0.0
            line = locusTable[i]
            lineLocus = utils.Locus(line[1], line[2], line[3], '.')
            overlappingRegions = mappedCollection.getOverlap(lineLocus, sense='both')
            for region in overlappingRegions:
                signal += signalDict[region.ID()]
            locusTable[i].append(signal)

    utils.unParseTable(locusTable, output, '\t')
def finishRankOutput(dataFile,rankOutput,genome,mergeFolder,mergeName,name1,name2,cutOff=1.5,window = 100000,superOnly=True,plotBam=True):
    '''
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond
    a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained loss

    dataFile: data table path, used to look up the bams of name1 and name2.
    rankOutput: tab-delimited rank table; column 6 is compared with cutOff.
    cutOff: rows with column 6 > cutOff are gained (name2 only), rows below
    -cutOff are lost (name1 only), the rest are conserved.
    window: flank in bases added on both sides for the window gffs.
    superOnly: selects the SUPERS or ENHANCERS wording in track headers and
    output file names.
    plotBam: when true, one plotting command per non-empty gained/lost gff
    is run from pipelineDir.

    All output is written under <mergeFolder>output/. Returns None.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    #floor division keeps file names at e.g. 100KB instead of 100.0KB on python 3
    windowKB = window // 1000
    genome = genome.upper()

    #make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder+'output/',True)

    #bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput,'\t')

    #make a new formatted table
    header = rankEnhancerTable[0]
    header[-4] = 'DELTA RANK'
    header[-3] = 'IS_SUPER'
    formattedRankTable = [header]

    #the gffs
    gainedGFF = []
    lostGFF = []
    gainedWindowGFF = []
    lostWindowGFF = []

    #wording that differs between the super-only and all-enhancer runs
    if superOnly:
        enhancerType = 'SUPERS'
        shortLabel = 'SEs'
        longLabel = 'super enhancers'
    else:
        enhancerType = 'ENHANCERS'
        shortLabel = 'enhancers'
        longLabel = 'enhancers'

    #the beds
    onlyTrackHeader = 'track name="%s %s only %s" description="%s %s that are found only in %s vs %s" itemRGB=On color=%s'
    gainedTrackHeader = onlyTrackHeader % (genome,name2,shortLabel,genome,longLabel,name2,name1,'255,0,0')
    gainedBed = [[gainedTrackHeader]]
    conservedTrackHeader = 'track name="%s %s and %s %s" description="%s %s that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome,name1,name2,shortLabel,genome,longLabel,name1,name2)
    conservedBed = [[conservedTrackHeader]]
    lostTrackHeader = onlyTrackHeader % (genome,name1,shortLabel,genome,longLabel,name1,name2,'0,255,0')
    lostBed = [[lostTrackHeader]]

    #the genes
    geneTable = [['GENE','ENHANCER_ID','ENHANCER_CHROM','ENHANCER_START','ENHANCER_STOP',header[6],header[7],header[8],'STATUS']]

    for line in rankEnhancerTable[1:]:
        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankTable.append(line)

        #getting the genes from columns 9-11, dropping empties and duplicates
        geneList = []
        geneList += line[9].split(',')
        geneList += line[10].split(',')
        geneList += line[11].split(',')
        geneList = [x for x in geneList if len(x) > 0]
        geneList = utils.uniquify(geneList)
        geneString = ','.join(geneList)

        bedLine = [line[1],line[2],line[3],line[0],line[-4]]
        gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]

        delta = float(line[6])
        if delta > cutOff:
            #for gained
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)
        elif delta < (-1 * cutOff):
            #for lost
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)
        else:
            #for conserved
            geneStatus = 'CONSERVED'
            conservedBed.append(bedLine)

        #now fill in the gene Table
        for gene in geneList:
            geneTableLine = [gene,line[0],line[1],line[2],line[3],line[6],line[7],line[8],geneStatus]
            geneTable.append(geneTableLine)

    #concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    #start writing the output
    #there's the two gffs, the bed,the formatted table, the gene table

    #formatted table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankTable,formattedFilename,'\t')

    #gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder+'gff/',True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,name2.upper(),enhancerType)
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,name2.upper(),enhancerType,windowKB,windowKB)

    gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,name1.upper(),enhancerType)
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,name1.upper(),enhancerType,windowKB,windowKB)

    utils.unParseTable(gainedGFF,gffFilename_gained,'\t')
    utils.unParseTable(gainedWindowGFF,gffFilenameWindow_gained,'\t')

    utils.unParseTable(lostGFF,gffFilename_lost,'\t')
    utils.unParseTable(lostWindowGFF,gffFilenameWindow_lost,'\t')

    #bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(fullBed,bedFilename,'\t')

    #geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(geneTable,geneFilename,'\t')

    #finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_%s_DELTA.pdf" % (mergeFolder,name1,outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_%s_RANK_PLOT.png" % (mergeFolder,name1,outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    #now execute the commands
    if plotBam:
        bam1 = dataDict[name1]['bam']
        bam2 = dataDict[name2]['bam']
        bamString = "%s,%s" % (bam1,bam2)
        nameString = "%s,%s" % (name1,name2)
        colorString = "0,0,0:100,100,100"

        #change dir
        os.chdir(pipelineDir)

        #NOTE(review): the command names no script after 'python' - confirm
        #which plotting script is meant to be invoked here
        plotCmd = 'python -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE'

        #gained first, then lost; each with the plain gff then the window gff
        plotJobs = [
            (gainedGFF,name2,gffFilename_gained,gffFilenameWindow_gained),
            (lostGFF,name1,gffFilename_lost,gffFilenameWindow_lost),
        ]
        for regions,sampleName,gffFile,gffWindowFile in plotJobs:
            if len(regions) == 0:
                continue

            plotTitle = "%s_ONLY_SE" % (sampleName)
            cmd = plotCmd % (genome,bamString,gffFile,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (sampleName,windowKB)
            cmd = plotCmd % (genome,bamString,gffWindowFile,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

    return
def main():
    '''
    main run call

    parses the command line, converts/copies the input region files into the
    gff folder, builds the master gff, filters it to the chroms found in the
    bams, optionally masks regions, stitches regions, maps every bam onto the
    stitched gff with bamliquidator, builds the region signal table, then
    calls the super-enhancer R script and the gene mapper on the super,
    stretch and super-stretch tables.

    exits the interpreter when required flags are missing, the control bam
    count is inconsistent, the genome is unsupported, the mask file type is
    unknown or a bam fails to map
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")
    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        print('hi there')
        parser.print_help()
        # sys.exit rather than the site-provided exit() helper
        sys.exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)
    inputList = [inputFile for inputFile in options.input.split(',') if len(inputFile) > 1]

    #converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        # the extension is taken per file; the old check inspected the whole
        # comma separated --i value instead
        extension = inputFile.split('.')[-1]
        if extension == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][0:-4] #strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        else:
            if extension != 'gff':
                print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder,inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)

    # GETTING THE LIST OF BAMFILES TO PROCESS
    #either same number of bams for rankby and control
    #or only 1 control
    #or none!
    #bamlist should be all rankby bams followed by control bams
    rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) > 0]
    if options.control:
        controlBamList = [bam for bam in options.control.split(',') if len(bam) > 0]
        if len(controlBamList) == len(rankbyBamList):
            #case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            #case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList*len(rankbyBamList)
        else:
            print('ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE')
            sys.exit()
    else:
        bamFileList = rankbyBamList

    # Stitch parameter
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE GENOME
    genome = options.genome.upper()
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome.upper()]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit()

    #FINDING THE ANALYSIS NAME
    # the original condition was an incomplete statement (a syntax error);
    # --name is the only declared source for an explicit analysis name
    if options.name:
        inputName = options.name
    else:
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))

    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    #use a simpler unique region naming system
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0],'\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile,'\t')
            gffCollection = utils.gffToLocusCollection(gff,50)
            inputLoci += gffCollection.getLoci()

        inputCollection = utils.LocusCollection(inputLoci,50)
        inputCollection = inputCollection.stitchCollection() # stitches to produce unique regions

        inputGFF = utils.locusCollectionToGFF(inputCollection)

    formattedGFF = []
    #now number things appropriately
    for i,line in enumerate(inputGFF):
        #use the coordinates to make a new id inputname_chr_sense_start_stop
        chrom = line[0]
        coords = [int(line[3]) ,int(line[4])]
        sense = line[6]

        lineID = '%s_%s' % (inputName,str(i+1)) #1 indexing

        newLine = [chrom,lineID,lineID,min(coords),max(coords),'',sense,'',lineID]
        formattedGFF.append(newLine)

    #name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder,genome.upper(),inputName)
    utils.unParseTable(formattedGFF,masterGFFFile,'\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile,bamChromList)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskType = maskFile.split('.')[-1].upper()
        # if it's a bed file
        if maskType == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskType == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    print(stitchWindow)
    print(type(stitchWindow))

    # NOTE(review): '/' is left as is because the type of the stitch window
    # returned by regionStitching is not visible here; under true division
    # the KB tag becomes a float string - confirm the intended naming
    stitchKB = str(stitchWindow / 1000)
    if not removeTSS:
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchKB)
    else:
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, stitchKB)
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))

    # MAPPING TO THE STITCHED GFF
    bamFileListUnique = utils.uniquify(list(bamFileList))
    #prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File,0.2,5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)

    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1,inputName + '_MERGED_SIGNAL',controlBams=options.control)

    #now try the merging
    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (pipeline_dir,outFolder, metaOutputFile, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    print('CALLING GENE MAPPING')

    #for now don't use ranking bam to call top genes
    # the old format string had three placeholders for four values and
    # raised TypeError
    # NOTE(review): ROSE2_geneMapper.py is assumed to be the intended script,
    # following the '%s<script>' pattern of the Rscript call - confirm
    geneMapperCmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f"
    tableSuffixList = [
        '_SuperEnhancers.table.txt',
        '_StretchEnhancers.table.txt',
        '_SuperStretchEnhancers.table.txt',
    ]
    for tableSuffix in tableSuffixList:
        tableFile = "%s%s" % (inputName, tableSuffix)
        cmd = geneMapperCmd % (pipeline_dir,genome, outFolder, tableFile)
        print(cmd)
        os.system(cmd)
def finishRankOutput(dataFile, rankOutput, genome, mergeFolder, mergeName, name1, name2, cutOff=1.5, window=100000):
    """
    Clean up the rank output table and write the derived output files.

    Reads the rank table at ``rankOutput`` and classifies every row by the
    value in column 6: above ``cutOff`` the region is "gained" (labelled
    ``name2``), below ``-cutOff`` it is "lost" (labelled ``name1``),
    otherwise it is "CONSERVED". From that it writes, under
    ``mergeFolder + "output/"``:

    * the reformatted rank table,
    * GFFs of the gained and lost regions, both exact and padded by
      ``window`` bases on each side (in a ``gff/`` subfolder),
    * a bed file concatenating the gained, conserved and lost tracks,
    * a gene table with one row per (gene, region) pair.

    It then copies the plots from ``<mergeFolder><name1>_ROSE/`` into the
    output folder and launches the plotting commands for the gained and
    lost GFFs.

    Parameters:
        dataFile: path handed to ``pipeline_dfci.loadDataTable``; the
            resulting dict is indexed as ``dataDict[name]["bam"]``.
        rankOutput: path of the tab-delimited rank table to clean up.
        genome: genome build name; upper-cased before use.
        mergeFolder: folder prefix (used by plain string concatenation)
            that holds the ROSE output and receives ``output/``.
        mergeName: analysis name embedded in every output file name.
        name1, name2: dataset names; must be keys of the data table.
        cutOff: threshold applied to column 6 (converted with ``float``).
        window: padding in bases for the windowed GFFs (converted with
            ``int``).

    Returns:
        None. All results are written to disk.
    """
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    # making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    # str.upper() instead of string.upper(): the module-level function was
    # removed in Python 3, the method works on both major versions.
    genome = genome.upper()
    # Floor division keeps the KB label an integer ("100", not "100.0") under
    # Python 3 while matching the Python 2 result for an int window.
    windowKB = window // 1000

    # make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder + "output/", True)

    # bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput, "\t")

    # make a new formatted table; the header row is relabelled in place
    header = rankEnhancerTable[0]
    header[-4] = "DELTA RANK"
    header[-3] = "IS_SUPER"
    formattedRankTable = [header]

    # the gffs
    gainedGFF = []
    lostGFF = []
    gainedWindowGFF = []
    lostWindowGFF = []

    # the beds, each starting with its own track header line
    gainedTrackHeader = (
        'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0'
        % (genome, name2, genome, name2, name1)
    )
    gainedBed = [[gainedTrackHeader]]
    conservedTrackHeader = (
        'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0'
        % (genome, name1, name2, genome, name1, name2)
    )
    conservedBed = [[conservedTrackHeader]]
    lostTrackHeader = (
        'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0'
        % (genome, name1, genome, name1, name2)
    )
    lostBed = [[lostTrackHeader]]

    # the genes
    geneTable = [
        [
            "GENE",
            "ENHANCER_ID",
            "ENHANCER_CHROM",
            "ENHANCER_START",
            "ENHANCER_STOP",
            header[6],
            header[7],
            header[8],
            "STATUS",
        ]
    ]

    for line in rankEnhancerTable[1:]:
        # fixing the enhancer ID
        line[0] = line[0].replace("_lociStitched", "")
        formattedRankTable.append(line)

        # getting the genes: columns 9-11 each hold a comma-separated list
        geneList = []
        for geneColumn in (line[9], line[10], line[11]):
            geneList += geneColumn.split(",")
        geneList = [x for x in geneList if len(x) > 0]
        geneList = utils.uniquify(geneList)
        # ",".join() instead of string.join(), which Python 3 removed
        geneString = ",".join(geneList)

        bedLine = [line[1], line[2], line[3], line[0], line[-4]]

        delta = float(line[6])
        if delta > cutOff:
            # for gained
            gffLine, gffWindowLine = _rankLineToGFFLines(line, window, geneString)
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)
        elif delta < (-1 * cutOff):
            # for lost
            gffLine, gffWindowLine = _rankLineToGFFLines(line, window, geneString)
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)
        else:
            # for conserved
            geneStatus = "CONSERVED"
            conservedBed.append(bedLine)

        # now fill in the gene Table
        for gene in geneList:
            geneTable.append([gene, line[0], line[1], line[2], line[3], line[6], line[7], line[8], geneStatus])

    # concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    # start writing the output
    # there's the two gffs, the bed, the formatted table, the gene table

    # formatted table
    formattedFilename = "%s%s_%s_MERGED_SUPERS_RANK_TABLE.txt" % (outputFolder, genome, mergeName)
    utils.unParseTable(formattedRankTable, formattedFilename, "\t")

    # gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder + "gff/", True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff" % (gffFolder, genome, mergeName, name2.upper())
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff" % (
        gffFolder,
        genome,
        mergeName,
        name2.upper(),
        windowKB,
        windowKB,
    )

    gffFilename_lost = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff" % (gffFolder, genome, mergeName, name1.upper())
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff" % (
        gffFolder,
        genome,
        mergeName,
        name1.upper(),
        windowKB,
        windowKB,
    )

    utils.unParseTable(gainedGFF, gffFilename_gained, "\t")
    utils.unParseTable(gainedWindowGFF, gffFilenameWindow_gained, "\t")

    utils.unParseTable(lostGFF, gffFilename_lost, "\t")
    utils.unParseTable(lostWindowGFF, gffFilenameWindow_lost, "\t")

    # bed
    bedFilename = "%s%s_%s_MERGED_SUPERS.bed" % (outputFolder, genome, mergeName)
    utils.unParseTable(fullBed, bedFilename, "\t")

    # geneTable
    geneFilename = "%s%s_%s_MERGED_SUPERS_GENE_TABLE.txt" % (outputFolder, genome, mergeName)
    utils.unParseTable(geneTable, geneFilename, "\t")

    # finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_SUPERS_DELTA.pdf" % (mergeFolder, name1, outputFolder, genome, mergeName)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_SUPERS_RANK_PLOT.png" % (
        mergeFolder,
        name1,
        outputFolder,
        genome,
        mergeName,
    )
    os.system(cmd)

    # now execute the plotting commands
    bam1 = dataDict[name1]["bam"]
    bam2 = dataDict[name2]["bam"]
    bamString = "%s,%s" % (bam1, bam2)
    nameString = "%s,%s" % (name1, name2)
    colorString = "0,0,0:100,100,100"

    # change dir
    # NOTE(review): hard-coded, machine-specific path; os.chdir raises if it
    # does not exist. Left unchanged because the intended portable location
    # is not visible from this function.
    os.chdir("/ark/home/cl512/pipeline/")

    # NOTE(review): no script path follows "python" in this command as it
    # appears in the file -- confirm which plotting script is meant to run.
    plotCmd = "python -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE"

    # gained regions are plotted first, then lost; each set gets an exact
    # plot followed by a windowed plot, and is skipped when it has no regions
    plotSets = (
        (gainedGFF, name2, gffFilename_gained, gffFilenameWindow_gained),
        (lostGFF, name1, gffFilename_lost, gffFilenameWindow_lost),
    )
    for regionGFF, onlyName, gffFilename, gffFilenameWindow in plotSets:
        if len(regionGFF) == 0:
            continue
        plotJobs = (
            ("%s_ONLY_SE" % (onlyName,), gffFilename),
            ("%s_ONLY_SE_%sKB_WINDOW" % (onlyName, windowKB), gffFilenameWindow),
        )
        for plotTitle, plotGFF in plotJobs:
            cmd = plotCmd % (
                genome,
                bamString,
                plotGFF,
                outputFolder,
                nameString,
                colorString,
                plotTitle,
            )
            os.system(cmd)

    return


def _rankLineToGFFLines(line, window, geneString):
    """Build the exact and window-padded GFF rows for one rank-table line."""
    gffLine = [line[1], line[0], "", line[2], line[3], "", ".", "", geneString]
    gffWindowLine = [
        line[1],
        line[0],
        "",
        int(line[2]) - window,
        int(line[3]) + window,
        "",
        ".",
        "",
        geneString,
    ]
    return gffLine, gffWindowLine