def findOverrepresentedTFsFromGeneSet(genome, tfSource, ensembleGeneIdList,
                                      upFlankSize, downFlankSize, geneSource,
                                      galaxyFn):
    """Run a target-vs-control analysis of TF categories for a gene set.

    The regions of the genes in ``ensembleGeneIdList`` are written to a BED
    file (targets), combined with the standard gene-region file of the genome
    (controls) into a target/control bedgraph, pre-processed as external
    tracks, and finally analysed against the TF track selected by
    ``tfSource`` via ``GalaxyInterface.runManual``. Progress lines and the
    analysis results are printed; nothing is returned.

    Parameters:
        genome: genome build; asserted to be ``'hg18'``.
        tfSource: key into ``TfInfo.getTfTrackNameMappings(genome)``.
        ensembleGeneIdList: gene ids handed to
            ``GalaxyInterface.getGeneTrackFromGeneList``.
        upFlankSize, downFlankSize: both asserted to be 0 (flanks are not
            supported here).
        geneSource: asserted to be ``'Ensembl'``.
        galaxyFn: Galaxy dataset file name, used to derive the galaxy id and
            the directory for temporary files.

    Raises:
        AssertionError: if one of the restrictions above is violated. Note
            that the flank assertion is only checked after the target BED
            file has been produced.
    """
    galaxyId = extractIdFromGalaxyFn(galaxyFn)
    webDir = getUniqueWebPath(extractIdFromGalaxyFn(galaxyFn))
    assert genome == 'hg18'
    tfTrackName = TfInfo.getTfTrackNameMappings(genome)[tfSource]

    # Target regions: the genes of interest, extracted to a temporary BED.
    assert geneSource == 'Ensembl'
    targetBedFn = os.sep.join([webDir, 'geneRegs.bed'])
    stdGeneTn = GenomeInfo.getStdGeneRegsTn(genome)
    controlBedFn = getOrigFn(genome, stdGeneTn, '.category.bed')
    GalaxyInterface.getGeneTrackFromGeneList(genome, stdGeneTn,
                                             ensembleGeneIdList, targetBedFn)

    # Flanks are unsupported; regions would have to be extended instead.
    assert upFlankSize == downFlankSize == 0

    # Control regions are the full standard gene set. Original author's
    # note: subtracting the targets should not be necessary, as targets are
    # put first when combining.
    combinedFn = os.sep.join([webDir, 'tcGeneRegs.targetcontrol.bedgraph'])
    GalaxyInterface.combineToTargetControl(targetBedFn, controlBedFn,
                                           combinedFn)

    combinedTn = ExternalTrackManager.createStdTrackName(galaxyId, 'tempTc')
    targetTn = ExternalTrackManager.createStdTrackName(galaxyId, 'tempTc',
                                                       '1')
    controlTn = ExternalTrackManager.createStdTrackName(galaxyId, 'tempTc',
                                                        '0')

    # Each print receives one pre-built string, so the line parses both as a
    # print statement and as a print() call.
    print('Pre-processing file: %s, with trackname: %s '
          % (combinedFn, combinedTn))
    ExternalTrackManager.preProcess(combinedFn, combinedTn,
                                    'targetcontrol.bedgraph', genome)
    for bedFn, bedTn in ((targetBedFn, targetTn), (controlBedFn, controlTn)):
        print('%s %s' % ('Pre-processing TN: ', bedTn))
        ExternalTrackManager.preProcess(bedFn, bedTn, 'bed', genome)

    analysisDef = ''.join([
        'Categories differentially located in targets?: Which categories of '
        'track1-points fall more inside case than control track2-segments? '
        '[rawStatistic:=PointCountInsideSegsStat:]',
        '[tf1:=SegmentToStartPointFormatConverter:] '
        '[tf2:=TrivialFormatConverter:]',
        '-> DivergentRowsInCategoryMatrixStat',
    ])
    GalaxyInterface.runManual([tfTrackName, combinedTn], analysisDef,
                              '*', '*', genome,
                              printResults=True, printHtmlWarningMsgs=False)
def findTFsTargetingGenes(cls, genome, tfSource, ensembleGeneIdList,
                          upFlankSize, downFlankSize, geneSource, galaxyFn):
    """Report which TFs have occurrences inside the given gene regions.

    Counts TF categories of the track selected by ``tfSource`` inside the
    gene regions (optionally expanded by flanks) using
    ``cls._runCategoryPointCount``, prints an HTML summary, and writes three
    run-specific files: ``allTfIds.html``, ``allTfIds.txt`` and
    ``tfbsInGeneRegions.bed``. Nothing is returned.

    Parameters:
        genome: genome build; asserted to be one of 'mm9', 'hg18', 'hg19'.
        tfSource: key into ``TfInfo.getTfTrackNameMappings(genome)``.
        ensembleGeneIdList: list of gene id strings.
        upFlankSize, downFlankSize: flank sizes; when both equal 0 the genes
            are addressed directly through the '__genes__' region spec,
            otherwise a flanked BED file is generated and used as bins.
        geneSource: not used by this function body.
        galaxyFn: Galaxy dataset file name that the run-specific output
            files are tied to.

    Raises:
        AssertionError: if ``genome`` is not one of the supported builds.
    """
    runDir = GalaxyRunSpecificFile([], galaxyFn).getDiskPath()
    # Original author's note: other genomes not supported. TF id links do
    # not specify genome for pre-selection of analysis.
    assert genome in ['mm9', 'hg18', 'hg19']
    tfTrackName = TfInfo.getTfTrackNameMappings(genome)[tfSource]

    noFlanks = (upFlankSize == downFlankSize == 0)
    if noFlanks:
        regSpec, binSpec = '__genes__', ','.join(ensembleGeneIdList)
    else:
        # Extract the plain gene regions, then expand them by the flanks.
        plainBedFn = os.sep.join([runDir, '_geneRegs.bed'])
        flankedBedFile = GalaxyRunSpecificFile(['flankedGeneRegs.bed'],
                                               galaxyFn)
        flankedBedFn = flankedBedFile.getDiskPath()
        stdGeneTn = GenomeInfo.getStdGeneRegsTn(genome)
        GalaxyInterface.getGeneTrackFromGeneList(genome, stdGeneTn,
                                                 ensembleGeneIdList,
                                                 plainBedFn)
        GalaxyInterface.expandBedSegments(plainBedFn, flankedBedFn, genome,
                                          upFlankSize, downFlankSize,
                                          suffix='category.bed')
        regSpec, binSpec = 'category.bed', flankedBedFn

    res = cls._runCategoryPointCount(genome, regSpec, binSpec, tfTrackName)
    tfs = res.getResDictKeys()

    genesPlural = 's' if len(ensembleGeneIdList) > 1 else ''
    tfsPlural = 's' if len(tfs) != 1 else ''
    # Each print receives one pre-built string, so the line parses both as a
    # print statement and as a print() call; '%s %s' reproduces the single
    # space that comma-separated print items are joined with.
    print('<p>There are %i TF%s targeting your gene%s of interest (%s), '
          'using "%s" as source of TF occurrences.</p>'
          % (len(tfs), tfsPlural, genesPlural,
             ','.join(ensembleGeneIdList), tfSource))
    if not noFlanks:
        print('%s %s %s' % ('(using ',
                            flankedBedFile.getLink('these genomic regions'),
                            ' for genes)'))

    expansionStr = '' if noFlanks else ' flanked'
    regionArgs = (expansionStr, genesPlural)

    # HTML file: one link per TF id, opening the analysis with that TF
    # pre-selected as track1.
    linkTemplate = '<a href=%s/hyper?dbkey=%s&track1=%s&track2=>%s</a>'
    tfLinks = [linkTemplate % (URL_PREFIX, genome,
                               quote(':'.join(tfTrackName + [tf])), tf)
               for tf in tfs]
    idHtmlFileNamer = GalaxyRunSpecificFile(['allTfIds.html'], galaxyFn)
    idHtmlFileNamer.writeTextToFile('<br>'.join(tfLinks))
    print('%s %s %s' % (
        '<p>',
        idHtmlFileNamer.getLink('Inspect html file'),
        ' of all TF IDs occurring 1 or more times within your%s gene '
        'region%s of interest, with each TF ID linking to analysis with '
        'this TF pre-selected.</p>' % regionArgs))

    # Plain-text file: one TF id per line.
    idFileNamer = GalaxyRunSpecificFile(['allTfIds.txt'], galaxyFn)
    idFileNamer.writeTextToFile(os.linesep.join(tfs) + os.linesep)
    print('%s %s %s' % (
        '<p>',
        idFileNamer.getLink('Inspect text file'),
        ' listing all TF IDs occurring 1 or more times within your%s gene '
        'region%s of interest.</p>' % regionArgs))

    # BED file with the TF track extracted over the same regions/bins.
    extractedTfbsFileNamer = GalaxyRunSpecificFile(['tfbsInGeneRegions.bed'],
                                                   galaxyFn)
    GalaxyInterface.extractTrackManyBins(
        genome, tfTrackName, regSpec, binSpec, True, 'bed', False, False,
        extractedTfbsFileNamer.getDiskPath())
    print('%s %s %s' % (
        '<p>',
        extractedTfbsFileNamer.getLink('Inspect bed-file'),
        'of all TF binding sites occurring within your%s gene region%s of '
        'interest.</p>' % regionArgs))
def findOverrepresentedTFsFromGeneSet(genome, tfSource, ensembleGeneIdList,
                                      upFlankSize, downFlankSize, geneSource,
                                      galaxyFn):
    """Run a target-vs-control analysis of TF categories for a gene set.

    The regions of the genes in ``ensembleGeneIdList`` are written to a BED
    file in the run-specific directory (targets), combined with the standard
    gene-region file of the genome (controls) into a target/control
    bedgraph, pre-processed as external tracks, and finally analysed against
    the TF track selected by ``tfSource`` via ``GalaxyInterface.runManual``.
    Progress lines and the analysis results are printed; nothing is
    returned.

    Parameters:
        genome: genome build; asserted to be ``'hg18'``.
        tfSource: key into ``TfInfo.getTfTrackNameMappings(genome)``.
        ensembleGeneIdList: gene ids handed to
            ``GalaxyInterface.getGeneTrackFromGeneList``.
        upFlankSize, downFlankSize: both asserted to be 0 (flanks are not
            supported here).
        geneSource: asserted to be ``'Ensembl'``.
        galaxyFn: Galaxy dataset file name, used to derive the galaxy id and
            the run-specific directory for temporary files.

    Raises:
        AssertionError: if one of the restrictions above is violated. Note
            that the flank assertion is only checked after the target BED
            file has been produced.
    """
    # galaxyId is concatenated with lists below, so it is presumably a list
    # of track-name parts -- confirm against extractIdFromGalaxyFn.
    galaxyId = extractIdFromGalaxyFn(galaxyFn)
    runDir = GalaxyRunSpecificFile([], galaxyFn).getDiskPath()
    assert genome == 'hg18'
    tfTrackName = TfInfo.getTfTrackNameMappings(genome)[tfSource]

    # Target regions: the genes of interest, extracted to a temporary BED.
    assert geneSource == 'Ensembl'
    targetBedFn = os.sep.join([runDir, 'geneRegs.bed'])
    stdGeneTn = GenomeInfo.getStdGeneRegsTn(genome)
    controlBedFn = getOrigFn(genome, stdGeneTn, '.category.bed')
    GalaxyInterface.getGeneTrackFromGeneList(genome, stdGeneTn,
                                             ensembleGeneIdList, targetBedFn)

    # Flanks are unsupported; regions would have to be extended instead.
    assert upFlankSize == downFlankSize == 0

    # Control regions are the full standard gene set. Original author's
    # note: subtracting the targets should not be necessary, as targets are
    # put first when combining.
    combinedFn = os.sep.join([runDir, 'tcGeneRegs.targetcontrol.bedgraph'])
    GalaxyInterface.combineToTargetControl(targetBedFn, controlBedFn,
                                           combinedFn)

    combinedTn = ExternalTrackManager.createStdTrackName(
        galaxyId + ['tempTc'])
    targetTn = ExternalTrackManager.createStdTrackName(
        galaxyId + ['tempTc', '1'])
    controlTn = ExternalTrackManager.createStdTrackName(
        galaxyId + ['tempTc', '0'])

    # Each print receives one pre-built string, so the line parses both as a
    # print statement and as a print() call.
    print('Pre-processing file: %s, with trackname: %s '
          % (combinedFn, combinedTn))
    ExternalTrackManager.preProcess(combinedFn, combinedTn,
                                    'targetcontrol.bedgraph', genome)
    for bedFn, bedTn in ((targetBedFn, targetTn), (controlBedFn, controlTn)):
        print('%s %s' % ('Pre-processing TN: ', bedTn))
        ExternalTrackManager.preProcess(bedFn, bedTn, 'bed', genome)

    analysisDef = ''.join([
        'Categories differentially located in targets?: Which categories of '
        'track1-points fall more inside case than control track2-segments? '
        '[rawStatistic:=PointCountInsideSegsStat:]',
        '[tf1:=SegmentToStartPointFormatConverter:] '
        '[tf2:=TrivialFormatConverter:]',
        '-> DivergentRowsInCategoryMatrixStat',
    ])
    GalaxyInterface.runManual([tfTrackName, combinedTn], analysisDef,
                              '*', '*', genome,
                              printResults=True, printHtmlWarningMsgs=False)
def findTFsTargetingGenes(cls, genome, tfSource, ensembleGeneIdList,
                          upFlankSize, downFlankSize, geneSource, galaxyFn):
    """Report which TFs have occurrences inside the given gene regions.

    Counts TF categories of the track selected by ``tfSource`` inside the
    gene regions (optionally expanded by flanks) using
    ``cls._runCategoryPointCount``, prints an HTML summary, and writes three
    run-specific files: ``allTfIds.html``, ``allTfIds.txt`` and
    ``tfbsInGeneRegions.bed``. Nothing is returned.

    Parameters:
        genome: genome build; asserted to be 'mm9' or 'hg18'.
        tfSource: key into ``TfInfo.getTfTrackNameMappings(genome)``.
        ensembleGeneIdList: list of gene id strings.
        upFlankSize, downFlankSize: flank sizes; when both equal 0 the genes
            are addressed directly through the '__genes__' region spec,
            otherwise a flanked BED file is generated and passed as a 'file'
            bin specification.
        geneSource: not used by this function body.
        galaxyFn: Galaxy dataset file name that the output files and the
            temporary directory are derived from.

    Raises:
        AssertionError: if ``genome`` is not one of the supported builds.
    """
    webDir = getUniqueWebPath(extractIdFromGalaxyFn(galaxyFn))
    # Original author's note: other genomes not supported. TF id links do
    # not specify genome for pre-selection of analysis.
    assert genome in ['mm9', 'hg18']
    tfTrackName = TfInfo.getTfTrackNameMappings(genome)[tfSource]

    noFlanks = (upFlankSize == downFlankSize == 0)
    if noFlanks:
        regSpec, binSpec = '__genes__', ','.join(ensembleGeneIdList)
    else:
        # Extract the plain gene regions, then expand them by the flanks.
        plainBedFn = os.sep.join([webDir, '_geneRegs.bed'])
        flankedBedFn = os.sep.join([webDir, 'flankedGeneRegs.bed'])
        stdGeneTn = GenomeInfo.getStdGeneRegsTn(genome)
        GalaxyInterface.getGeneTrackFromGeneList(genome, stdGeneTn,
                                                 ensembleGeneIdList,
                                                 plainBedFn)
        GalaxyInterface.expandBedSegments(plainBedFn, flankedBedFn, genome,
                                          upFlankSize, downFlankSize)
        regSpec, binSpec = 'file', flankedBedFn

    res = cls._runCategoryPointCount(genome, regSpec, binSpec, tfTrackName)
    tfs = res.getResDictKeys()

    genesPlural = 's' if len(ensembleGeneIdList) > 1 else ''
    tfsPlural = 's' if len(tfs) != 1 else ''
    # Each print receives one pre-built string, so the line parses both as a
    # print statement and as a print() call; '%s %s %s' reproduces the
    # single space that comma-separated print items are joined with.
    print('<p>There are %i TF%s targeting your gene%s of interest (%s), '
          'using "%s" as source of TF occurrences.</p>'
          % (len(tfs), tfsPlural, genesPlural,
             ','.join(ensembleGeneIdList), tfSource))

    expansionStr = '' if noFlanks else ' flanked'
    regionArgs = (expansionStr, genesPlural)

    # HTML file: one link per TF id, opening the analysis with that TF
    # pre-selected as track1.
    linkTemplate = '<a href=%s/hyper?dbkey=%s&track1=%s&track2=>%s</a>'
    tfLinks = [linkTemplate % (URL_PREFIX, genome,
                               quote(':'.join(tfTrackName + [tf])), tf)
               for tf in tfs]
    idHtmlFileNamer = GalaxyRunSpecificFile(['allTfIds.html'], galaxyFn)
    idHtmlFileNamer.writeTextToFile('<br>'.join(tfLinks))
    print('%s %s %s' % (
        '<p>',
        idHtmlFileNamer.getLink('Inspect html file'),
        ' of all TF IDs occurring 1 or more times within your%s gene '
        'region%s of interest, with each TF ID linking to analysis with '
        'this TF pre-selected.</p>' % regionArgs))

    # Plain-text file: one TF id per line.
    idFileNamer = GalaxyRunSpecificFile(['allTfIds.txt'], galaxyFn)
    idFileNamer.writeTextToFile(os.linesep.join(tfs) + os.linesep)
    print('%s %s %s' % (
        '<p>',
        idFileNamer.getLink('Inspect text file'),
        ' listing all TF IDs occurring 1 or more times within your%s gene '
        'region%s of interest.</p>' % regionArgs))

    # BED file with the TF track extracted over the same regions/bins.
    extractedTfbsFileNamer = GalaxyRunSpecificFile(['tfbsInGeneRegions.bed'],
                                                   galaxyFn)
    GalaxyInterface.extractTrackManyBins(
        genome, tfTrackName, regSpec, binSpec, True, 'bed', False, False,
        extractedTfbsFileNamer.getDiskPath())
    print('%s %s %s' % (
        '<p>',
        extractedTfbsFileNamer.getLink('Inspect bed-file'),
        'of all TF binding sites occurring within your%s gene region%s of '
        'interest.</p>' % regionArgs))