Exemplo n.º 1
0
    def findOverrepresentedTFsFromGeneSet(genome, tfSource, ensembleGeneIdList,upFlankSize, downFlankSize, geneSource, galaxyFn):
        #galaxyFn = '/usit/insilico/web/lookalike/galaxy_dist-20090924-dev/database/files/003/dataset_3347.dat'
        #print 'overriding galaxyFN!: ', galaxyFn
        galaxyId = extractIdFromGalaxyFn(galaxyFn)
        uniqueWebPath = getUniqueWebPath(extractIdFromGalaxyFn(galaxyFn))

        assert genome == 'hg18'
        
        tfTrackNameMappings = TfInfo.getTfTrackNameMappings(genome)
        tfTrackName = tfTrackNameMappings[tfSource]
        
        
        #Get gene track
        assert geneSource == 'Ensembl'
        targetGeneRegsTempFn = uniqueWebPath + os.sep + 'geneRegs.bed'
        geneRegsTrackName = GenomeInfo.getStdGeneRegsTn(genome)
        geneRegsFn = getOrigFn(genome, geneRegsTrackName, '.category.bed')
        GalaxyInterface.getGeneTrackFromGeneList(genome, geneRegsTrackName, ensembleGeneIdList, targetGeneRegsTempFn )
        
        assert upFlankSize == downFlankSize == 0 #Should instead extend regions to include flanks
        
        tcGeneRegsTempFn = uniqueWebPath + os.sep + 'tcGeneRegs.targetcontrol.bedgraph'
        #Think this will be okay, subtraction not necessary as targets are put first:
        controlGeneRegsTempFn = geneRegsFn
        #print targetGeneRegsTempFn, controlGeneRegsTempFn, tcGeneRegsTempFn
        GalaxyInterface.combineToTargetControl(targetGeneRegsTempFn, controlGeneRegsTempFn, tcGeneRegsTempFn)
        
        #tcGeneRegsExternalTN = ['external'] +galaxyId +  [tcGeneRegsTempFn]
        tcGeneRegsExternalTN = ExternalTrackManager.createStdTrackName(galaxyId, 'tempTc')
        
        #tcGeneRegsExternalTN = ['external'] +targetGalaxyId +  [tcGeneRegsTempFn]
        #tcGeneRegsExternalTN = ['galaxy', externalId, tcGeneRegsTempFn]
        
        targetGeneRegsExternalTN = ExternalTrackManager.createStdTrackName(galaxyId, 'tempTc', '1')
        controlGeneRegsExternalTN = ExternalTrackManager.createStdTrackName(galaxyId, 'tempTc', '0')
        
        #pre-process
        print 'Pre-processing file: %s, with trackname: %s ' % (tcGeneRegsTempFn, tcGeneRegsExternalTN)
        ExternalTrackManager.preProcess(tcGeneRegsTempFn, tcGeneRegsExternalTN, 'targetcontrol.bedgraph',genome)
        print 'Pre-processing TN: ', targetGeneRegsExternalTN
        ExternalTrackManager.preProcess(targetGeneRegsTempFn, targetGeneRegsExternalTN, 'bed',genome)
        print 'Pre-processing TN: ', controlGeneRegsExternalTN
        ExternalTrackManager.preProcess(controlGeneRegsTempFn, controlGeneRegsExternalTN, 'bed',genome)
        
        #print tcGeneRegsExternalTN
        trackName1, trackName2 = tfTrackName, tcGeneRegsExternalTN
        
        analysisDef = 'Categories differentially located in targets?: Which categories of track1-points fall more inside case than control track2-segments? [rawStatistic:=PointCountInsideSegsStat:]' +\
                  '[tf1:=SegmentToStartPointFormatConverter:] [tf2:=TrivialFormatConverter:]' +\
                  '-> DivergentRowsInCategoryMatrixStat'
        regSpec, binSpec = '*','*'
        
        #print 'skipping preproc!!'
        #ExternalTrackManager.preProcess(tcGeneRegsExternalTN[-1], tcGeneRegsExternalTN, 'targetcontrol.bedgraph', genome)
        #ExternalTrackManager.preProcess(targetGeneRegsTempFn, targetGeneRegsExternalTN, 'bed', genome)
        
        GalaxyInterface.runManual([trackName1, trackName2], analysisDef, regSpec, binSpec, genome, printResults=True, printHtmlWarningMsgs=False)
Exemplo n.º 2
0
    def findTFsTargetingGenes(cls, genome, tfSource, ensembleGeneIdList,
                              upFlankSize, downFlankSize, geneSource,
                              galaxyFn):
        #galaxyFn = '/usit/insilico/web/lookalike/galaxy_dist-20090924-dev/database/files/003/dataset_3347.dat'
        #print 'overriding galaxyFN!: ', galaxyFn
        uniqueWebPath = GalaxyRunSpecificFile([], galaxyFn).getDiskPath()

        assert genome in [
            'mm9', 'hg18', 'hg19'
        ]  #other genomes not supported. TF id links do not specify genome for pre-selection of analysis

        #if tfSource == 'UCSC tfbs conserved':
        #    tfTrackName = ['Gene regulation','TFBS','UCSC prediction track']
        #else:
        #    raise
        tfTrackNameMappings = TfInfo.getTfTrackNameMappings(genome)
        tfTrackName = tfTrackNameMappings[tfSource]

        #Get gene track
        #targetGeneRegsTempFn = uniqueWebPath + os.sep + 'geneRegs.bed'
        #geneRegsTrackName = GenomeInfo.getStdGeneRegsTn(genome)
        #geneRegsFn = getOrigFn(genome, geneRegsTrackName, '.category.bed')
        #GalaxyInterface.getGeneTrackFromGeneList(genome, geneRegsTrackName, ensembleGeneIdList, targetGeneRegsTempFn )

        if not (upFlankSize == downFlankSize == 0):
            unflankedGeneRegsTempFn = uniqueWebPath + os.sep + '_geneRegs.bed'
            #flankedGeneRegsTempFn  = uniqueWebPath + os.sep + 'flankedGeneRegs.bed'
            flankedGeneRegsTempStaticFile = GalaxyRunSpecificFile(
                ['flankedGeneRegs.bed'], galaxyFn)
            flankedGeneRegsTempFn = flankedGeneRegsTempStaticFile.getDiskPath()
            geneRegsTrackName = GenomeInfo.getStdGeneRegsTn(genome)
            #geneRegsFn = getOrigFn(genome, geneRegsTrackName, '.category.bed')
            GalaxyInterface.getGeneTrackFromGeneList(genome, geneRegsTrackName,
                                                     ensembleGeneIdList,
                                                     unflankedGeneRegsTempFn)
            GalaxyInterface.expandBedSegments(unflankedGeneRegsTempFn,
                                              flankedGeneRegsTempFn,
                                              genome,
                                              upFlankSize,
                                              downFlankSize,
                                              suffix='category.bed')
            #flankedGeneRegsExternalTN = ['external'] +galaxyId +  [flankedGeneRegsTempFn]
            regSpec, binSpec = 'category.bed', flankedGeneRegsTempFn
        else:
            regSpec, binSpec = '__genes__', ','.join(ensembleGeneIdList)

        res = cls._runCategoryPointCount(genome, regSpec, binSpec, tfTrackName)

        #trackName1 = tfTrackName
        #
        #analysisDef = 'Category point count: Number of elements each category of track1 (with overlaps)'+\
        #          '[tf1:=SegmentToStartPointFormatConverter:]'+\
        #          '-> FreqByCatStat'
        ##assert len(ensembleGeneIdList)==1
        ##geneId = ensembleGeneIdList[0]
        #
        #print '<div class="debug">'
        #userBinSource, fullRunArgs = GalaxyInterface._prepareRun(trackName1, None, analysisDef, regSpec, binSpec, genome)
        #res = AnalysisDefJob(analysisDef, trackName1, None, userBinSource, **fullRunArgs).run()
        #
        #print res
        ##GalaxyInterface._viewResults([res], galaxyFn)
        #print '</div>'
        tfs = res.getResDictKeys()

        genesPlural = 's' if len(ensembleGeneIdList) > 1 else ''
        tfsPlural = 's' if len(tfs) != 1 else ''
        print '<p>There are %i TF%s targeting your gene%s of interest (%s), using "%s" as source of TF occurrences.</p>' % (
            len(tfs), tfsPlural, genesPlural, ','.join(ensembleGeneIdList),
            tfSource)
        if not (upFlankSize == downFlankSize == 0):
            print '(using ', flankedGeneRegsTempStaticFile.getLink(
                'these genomic regions'), ' for genes)'
        expansionStr = ' flanked' if not (
            upFlankSize == downFlankSize == 0) else ''

        idHtmlFileNamer = GalaxyRunSpecificFile(['allTfIds.html'], galaxyFn)
        idHtmlFileNamer.writeTextToFile('<br>'.join([
            '<a href=%s/hyper?dbkey=%s&track1=%s&track2=>%s</a>' %
            (URL_PREFIX, genome, quote(':'.join(tfTrackName + [tf])), tf)
            for tf in tfs
        ]))
        #idHtmlFileNamer.writeTextToFile('<br>'.join(['<a href=/hbdev/hyper?track1=%s&track2=>%s</a>'%( ':'.join(tfTrackName+[tf]), tf) for tf in tfs]))
        print '<p>', idHtmlFileNamer.getLink(
            'Inspect html file'
        ), ' of all TF IDs occurring 1 or more times within your%s gene region%s of interest, with each TF ID linking to analysis with this TF pre-selected.</p>' % (
            expansionStr, genesPlural)

        idFileNamer = GalaxyRunSpecificFile(['allTfIds.txt'], galaxyFn)
        idFileNamer.writeTextToFile(os.linesep.join(tfs) + os.linesep)
        print '<p>', idFileNamer.getLink(
            'Inspect text file'
        ), ' listing all TF IDs occurring 1 or more times within your%s gene region%s of interest.</p>' % (
            expansionStr, genesPlural)

        extractedTfbsFileNamer = GalaxyRunSpecificFile(
            ['tfbsInGeneRegions.bed'], galaxyFn)
        GalaxyInterface.extractTrackManyBins(
            genome, tfTrackName, regSpec, binSpec, True, 'bed', False, False,
            extractedTfbsFileNamer.getDiskPath())
        print '<p>', extractedTfbsFileNamer.getLink(
            'Inspect bed-file'
        ), 'of all TF binding sites occurring within your%s gene region%s of interest.</p>' % (
            expansionStr, genesPlural)
Exemplo n.º 3
0
    def findOverrepresentedTFsFromGeneSet(genome, tfSource, ensembleGeneIdList,
                                          upFlankSize, downFlankSize,
                                          geneSource, galaxyFn):
        #galaxyFn = '/usit/insilico/web/lookalike/galaxy_dist-20090924-dev/database/files/003/dataset_3347.dat'
        #print 'overriding galaxyFN!: ', galaxyFn
        galaxyId = extractIdFromGalaxyFn(galaxyFn)
        uniqueWebPath = GalaxyRunSpecificFile([], galaxyFn).getDiskPath()

        assert genome == 'hg18'

        tfTrackNameMappings = TfInfo.getTfTrackNameMappings(genome)
        tfTrackName = tfTrackNameMappings[tfSource]

        #Get gene track
        assert geneSource == 'Ensembl'
        targetGeneRegsTempFn = uniqueWebPath + os.sep + 'geneRegs.bed'
        geneRegsTrackName = GenomeInfo.getStdGeneRegsTn(genome)
        geneRegsFn = getOrigFn(genome, geneRegsTrackName, '.category.bed')
        GalaxyInterface.getGeneTrackFromGeneList(genome, geneRegsTrackName,
                                                 ensembleGeneIdList,
                                                 targetGeneRegsTempFn)

        assert upFlankSize == downFlankSize == 0  #Should instead extend regions to include flanks

        tcGeneRegsTempFn = uniqueWebPath + os.sep + 'tcGeneRegs.targetcontrol.bedgraph'
        #Think this will be okay, subtraction not necessary as targets are put first:
        controlGeneRegsTempFn = geneRegsFn
        #print targetGeneRegsTempFn, controlGeneRegsTempFn, tcGeneRegsTempFn
        GalaxyInterface.combineToTargetControl(targetGeneRegsTempFn,
                                               controlGeneRegsTempFn,
                                               tcGeneRegsTempFn)

        #tcGeneRegsExternalTN = ['external'] +galaxyId +  [tcGeneRegsTempFn]
        tcGeneRegsExternalTN = ExternalTrackManager.createStdTrackName(
            galaxyId + ['tempTc'])

        #tcGeneRegsExternalTN = ['external'] +targetGalaxyId +  [tcGeneRegsTempFn]
        #tcGeneRegsExternalTN = ['galaxy', externalId, tcGeneRegsTempFn]

        targetGeneRegsExternalTN = ExternalTrackManager.createStdTrackName(
            galaxyId + ['tempTc', '1'])
        controlGeneRegsExternalTN = ExternalTrackManager.createStdTrackName(
            galaxyId + ['tempTc', '0'])

        #pre-process
        print 'Pre-processing file: %s, with trackname: %s ' % (
            tcGeneRegsTempFn, tcGeneRegsExternalTN)
        ExternalTrackManager.preProcess(tcGeneRegsTempFn, tcGeneRegsExternalTN,
                                        'targetcontrol.bedgraph', genome)
        print 'Pre-processing TN: ', targetGeneRegsExternalTN
        ExternalTrackManager.preProcess(targetGeneRegsTempFn,
                                        targetGeneRegsExternalTN, 'bed',
                                        genome)
        print 'Pre-processing TN: ', controlGeneRegsExternalTN
        ExternalTrackManager.preProcess(controlGeneRegsTempFn,
                                        controlGeneRegsExternalTN, 'bed',
                                        genome)

        #print tcGeneRegsExternalTN
        trackName1, trackName2 = tfTrackName, tcGeneRegsExternalTN

        analysisDef = 'Categories differentially located in targets?: Which categories of track1-points fall more inside case than control track2-segments? [rawStatistic:=PointCountInsideSegsStat:]' +\
                  '[tf1:=SegmentToStartPointFormatConverter:] [tf2:=TrivialFormatConverter:]' +\
                  '-> DivergentRowsInCategoryMatrixStat'
        regSpec, binSpec = '*', '*'

        #print 'skipping preproc!!'
        #ExternalTrackManager.preProcess(tcGeneRegsExternalTN[-1], tcGeneRegsExternalTN, 'targetcontrol.bedgraph', genome)
        #ExternalTrackManager.preProcess(targetGeneRegsTempFn, targetGeneRegsExternalTN, 'bed', genome)

        GalaxyInterface.runManual([trackName1, trackName2],
                                  analysisDef,
                                  regSpec,
                                  binSpec,
                                  genome,
                                  printResults=True,
                                  printHtmlWarningMsgs=False)
Exemplo n.º 4
0
    def findTFsTargetingGenes(cls, genome, tfSource, ensembleGeneIdList,upFlankSize, downFlankSize, geneSource, galaxyFn):
        #galaxyFn = '/usit/insilico/web/lookalike/galaxy_dist-20090924-dev/database/files/003/dataset_3347.dat'
        #print 'overriding galaxyFN!: ', galaxyFn
        uniqueWebPath = getUniqueWebPath(extractIdFromGalaxyFn(galaxyFn))

        assert genome in ['mm9','hg18'] #other genomes not supported. TF id links do not specify genome for pre-selection of analysis
        
        #if tfSource == 'UCSC tfbs conserved':
        #    tfTrackName = ['Gene regulation','TFBS','UCSC prediction track']
        #else:
        #    raise
        tfTrackNameMappings = TfInfo.getTfTrackNameMappings(genome)
        tfTrackName = tfTrackNameMappings[tfSource]
                
        #Get gene track
        #targetGeneRegsTempFn = uniqueWebPath + os.sep + 'geneRegs.bed'
        #geneRegsTrackName = GenomeInfo.getStdGeneRegsTn(genome)
        #geneRegsFn = getOrigFn(genome, geneRegsTrackName, '.category.bed')
        #GalaxyInterface.getGeneTrackFromGeneList(genome, geneRegsTrackName, ensembleGeneIdList, targetGeneRegsTempFn )
        
        if not (upFlankSize == downFlankSize == 0):            
            unflankedGeneRegsTempFn = uniqueWebPath + os.sep + '_geneRegs.bed'
            flankedGeneRegsTempFn  = uniqueWebPath + os.sep + 'flankedGeneRegs.bed'
            geneRegsTrackName = GenomeInfo.getStdGeneRegsTn(genome)
            #geneRegsFn = getOrigFn(genome, geneRegsTrackName, '.category.bed')
            GalaxyInterface.getGeneTrackFromGeneList(genome, geneRegsTrackName, ensembleGeneIdList, unflankedGeneRegsTempFn )
            GalaxyInterface.expandBedSegments(unflankedGeneRegsTempFn, flankedGeneRegsTempFn, genome, upFlankSize, downFlankSize)
            #flankedGeneRegsExternalTN = ['external'] +galaxyId +  [flankedGeneRegsTempFn]
            regSpec, binSpec = 'file', flankedGeneRegsTempFn
        else:
            regSpec, binSpec = '__genes__', ','.join(ensembleGeneIdList)

        res = cls._runCategoryPointCount(genome, regSpec, binSpec, tfTrackName)

        #trackName1 = tfTrackName
        #
        #analysisDef = 'Category point count: Number of elements each category of track1 (with overlaps)'+\
        #          '[tf1:=SegmentToStartPointFormatConverter:]'+\
        #          '-> FreqByCatStat'
        ##assert len(ensembleGeneIdList)==1
        ##geneId = ensembleGeneIdList[0]
        #
        #print '<div class="debug">'        
        #userBinSource, fullRunArgs = GalaxyInterface._prepareRun(trackName1, None, analysisDef, regSpec, binSpec, genome)
        #res = AnalysisDefJob(analysisDef, trackName1, None, userBinSource, **fullRunArgs).run()
        #
        #print res        
        ##GalaxyInterface._viewResults([res], galaxyFn)
        #print '</div>'
        tfs = res.getResDictKeys()
        
        genesPlural = 's' if len(ensembleGeneIdList)>1 else ''
        tfsPlural = 's' if len(tfs)!=1 else ''
        print '<p>There are %i TF%s targeting your gene%s of interest (%s), using "%s" as source of TF occurrences.</p>' % (len(tfs), tfsPlural, genesPlural, ','.join(ensembleGeneIdList), tfSource)
        
        expansionStr = ' flanked' if not (upFlankSize == downFlankSize == 0) else ''                

        idHtmlFileNamer = GalaxyRunSpecificFile(['allTfIds.html'],galaxyFn)
        idHtmlFileNamer.writeTextToFile('<br>'.join(['<a href=%s/hyper?dbkey=%s&track1=%s&track2=>%s</a>'%(URL_PREFIX, genome, quote(':'.join(tfTrackName+[tf])), tf) for tf in tfs]))
        #idHtmlFileNamer.writeTextToFile('<br>'.join(['<a href=/hbdev/hyper?track1=%s&track2=>%s</a>'%( ':'.join(tfTrackName+[tf]), tf) for tf in tfs]))
        print '<p>', idHtmlFileNamer.getLink('Inspect html file'), ' of all TF IDs occurring 1 or more times within your%s gene region%s of interest, with each TF ID linking to analysis with this TF pre-selected.</p>' % (expansionStr, genesPlural)

        idFileNamer = GalaxyRunSpecificFile(['allTfIds.txt'],galaxyFn)
        idFileNamer.writeTextToFile(os.linesep.join(tfs) + os.linesep)
        print '<p>', idFileNamer.getLink('Inspect text file'), ' listing all TF IDs occurring 1 or more times within your%s gene region%s of interest.</p>' % (expansionStr, genesPlural)
    
        extractedTfbsFileNamer = GalaxyRunSpecificFile(['tfbsInGeneRegions.bed'],galaxyFn)
        GalaxyInterface.extractTrackManyBins(genome, tfTrackName, regSpec, binSpec, True, 'bed', False, False, extractedTfbsFileNamer.getDiskPath())
        print '<p>', extractedTfbsFileNamer.getLink('Inspect bed-file'), 'of all TF binding sites occurring within your%s gene region%s of interest.</p>' % (expansionStr, genesPlural)
        
        #idFile = idFileNamer.getFile()
        #idFile.write(', '.join([str(bin.val) for bin in targetBins if res[bin][resDictKey]>0]) + os.sep)
        #idFile.close()
        
        #print idFileNamer.getLink('Text file'), ' of TF IDs'
        
        #GalaxyInterface.run(tfTrackName, tcGeneRegsExternalTN, analysisDef, regSpec, binSpec, genome, galaxyFn)
        #GalaxyInterface.run(':'.join(tfTrackName), ':'.join(tcGeneRegsExternalTN), analysisDef, regSpec, binSpec, genome, galaxyFn)