def getGeneIdStaticFileWithContent(self):
    """Write the gene ids of all intersected reference bins to a text file.

    For every bin returned by ``self.getIntersectedReferenceBins()`` the
    string form of ``bin.val`` is cut at the first '|' and the leading part
    is written on its own line (lines end with ``os.linesep``).

    Returns the GalaxyRunSpecificFile object the ids were written to.
    """
    referenceBins = self.getIntersectedReferenceBins()
    geneIdFile = GalaxyRunSpecificFile(self._getFileId('allGeneIds.txt'),
                                       self._galaxyFn)

    geneIds = []
    for refBin in referenceBins:
        # Only the part before the first '|' of the bin value is kept.
        geneIds.append(str(refBin.val).split('|')[0])

    geneIdFile.writeTextToFile(os.linesep.join(geneIds) + os.linesep)
    return geneIdFile
Exemplo n.º 2
0
 def makeHtmlStr(self):
     """Write a small HTML summary page for this TF and return a link to it.

     The page lists the TF, ChIP-seq peaks, PWM, the intersection count and
     the max/avg binding differences, followed by links to the original
     fasta, mutated fasta, PWM score and GTrack files. Entries are separated
     by '<br/>' plus a newline.

     All summary attributes are concatenated directly onto string labels,
     so each of them must already be a string when this is called.
     """
     htmlPage = GalaxyRunSpecificFile(
         ['html', '_'.join(self.track), 'page.html'], self.galaxyFn)

     # One entry per output line; evaluated in the same order as they appear.
     summaryEntries = [
         'TF: ' + self.tf,
         'Chip-seq peaks: ' + self.chipSeqPeaks,
         'PWM: ' + self.pwm,
         'Number of SNV-intersected binding regions: ' +
         self.intersectingPoints,
         'Highest binding difference: ' + self.maxPwmDiff,
         'Avg binding difference: ' + self.avgPwmDiff,
         self.regularFasta.getLink('Original Fasta'),
         self.mutatedFasta.getLink('Mutated Fasta'),
         self.pwmDiffScore.getLink('PWM score for each region'),
         self.gtrackDiffScore.getLink('Gtrack of PWM score for each region'),
     ]
     htmlPage.writeTextToFile('<br/>\n'.join(summaryEntries))

     linkText = self.tf + ':   ' + self.track[-1]
     return htmlPage.getLink(linkText)
    def getLinkToSingleLocalHtmlResultsTable(self, linkText, disease,
                                             resDictKey, galaxyFn):
        """Store the local results table for one result key as an HTML page.

        The table HTML comes from ``self.getHtmlLocalResultsTable`` (called
        with ``fillInNoneValues=True``) and is wrapped in an HtmlCore page.
        The page is written to the run-specific file
        'LocalResultTables/<resDictKey>/<disease>.html'.

        Returns the link to that file, labelled with ``linkText``.
        """
        page = HtmlCore()
        page.begin()
        tableHtml = self.getHtmlLocalResultsTable(resDictKey,
                                                  fillInNoneValues=True)
        page.paragraph(tableHtml)
        page.end()

        tablePath = ['LocalResultTables', resDictKey, disease + '.html']
        tableFile = GalaxyRunSpecificFile(tablePath, galaxyFn)
        tableFile.writeTextToFile(str(page))
        return tableFile.getLink(linkText)
Exemplo n.º 4
0
    def singleSimulation(self, numH0, numH1, replicateIndex, verbose=False):
        """Run one simulation replicate until every test is determined.

        Samples are added to a MultipleTestCollection, first
        ``NUM_SAMPLES_INITIALLY`` and then ``NUM_SAMPLES_PER_CHUNK`` at a
        time, until ``allTestsAreDetermined()`` holds. If ``self._galaxyFn``
        is set, the raw p/q-values and two R figures are stored as
        run-specific files and links to them are appended to
        '<numH1>/cat.html'.

        ``verbose`` is accepted but not used by this method.

        Returns a tuple (total number of samples, total number rejected,
        classification summaries) as reported by the test collection.
        """
        collection = MultipleTestCollection(numH0, numH1,
                                            self._maxNumSamples, self._h,
                                            self._fdrThreshold, self._a,
                                            self._b)
        collection.addSamples(self.NUM_SAMPLES_INITIALLY)
        while not collection.allTestsAreDetermined():
            collection.addSamples(self.NUM_SAMPLES_PER_CHUNK)

        # Sampling is over, so switch to the threshold used after the
        # computations are finished: it affects the final rejection or
        # acceptance, but not when sampling stops.
        collection.setFdrThresholdAtAllCounters(self._postFdrThreshold)

        if self._galaxyFn is not None:
            if self._h is None:
                scheme = 'Basic'
            else:
                scheme = 'Sequential' if self._fdrThreshold is None else 'McFdr'

            # All per-replicate files share this directory prefix.
            replicateDir = [scheme, str(numH1), str(replicateIndex)]

            rawFile = GalaxyRunSpecificFile(replicateDir + ['PandQvals.txt'],
                                            self._galaxyFn)
            collection.writeAllPandQVals(rawFile.getFile())
            catalogEntry = rawFile.getLink('Raw p and q-vals') + ' under %s scheme with %i true H1, (replication %i)' % (scheme, numH1, replicateIndex)

            figureSpecs = (
                ('PandQvals.png', collection.makeAllPandQValsFigure,
                 ' (p/q-figure) '),
                ('NumSamples.png', collection.makeNumSamplesFigure,
                 ' (numSamples-figure) '),
            )
            for figName, drawFigure, figLinkText in figureSpecs:
                figFile = GalaxyRunSpecificFile(replicateDir + [figName],
                                                self._galaxyFn)
                figFile.openRFigure()
                drawFigure()
                figFile.closeRFigure()
                catalogEntry += figFile.getLink(figLinkText) + '<br>'

            # The catalog is shared across replicates, hence append mode.
            catalogFile = GalaxyRunSpecificFile([str(numH1), 'cat.html'],
                                                self._galaxyFn)
            catalogFile.writeTextToFile(catalogEntry, mode='a')

        return (collection.getTotalNumSamples(),
                collection.getTotalNumRejected(),
                collection.getClassificationSummaries())
    def getResultTableLink(self, refSubType, linkText):
        """Render the result for ``refSubType`` as an HTML page and link to it.

        Requires both ``self._galaxyFn`` and ``self._gwasId`` to be set.
        The page is stored as 'ResultTables/<gwasId>/<refSubType>.html';
        'ResultTableDetails/<gwasId>/<refSubType>' is passed to
        ResultsViewer as its base directory. If the result has a
        ``batchText`` attribute it is shown first in a <pre> block.

        Returns the link to the page, labelled with ``linkText``.
        """
        assert self._galaxyFn is not None
        assert self._gwasId is not None

        result = self.getResult(refSubType)

        detailsDirFile = GalaxyRunSpecificFile(
            ['ResultTableDetails', self._gwasId, refSubType], self._galaxyFn)
        detailsDir = detailsDirFile.getDiskPath(ensurePath=True)
        tableFile = GalaxyRunSpecificFile(
            ['ResultTables', self._gwasId, refSubType + '.html'],
            self._galaxyFn)

        page = HtmlCore()
        page.begin()
        if hasattr(result, 'batchText'):
            batchHtml = ('<pre> Corresponding batch command line:\n ' +
                         result.batchText + '</pre>')
            page.paragraph(batchHtml)
        page.paragraph(str(ResultsViewer(result, detailsDir)))
        page.end()

        tableFile.writeTextToFile(str(page))
        return tableFile.getLink(linkText)
    def execute(cls, choices, galaxyFn=None, username=''):
        '''Generate random FASTA sequences from a Markov model of an input FASTA.

        Is called when execute-button is pushed by web-user.
        Should print output as HTML to standard out, which will be directed to a results page in Galaxy history.
        If getOutputFormat is anything else than HTML, the output should be written to the file with path galaxyFn.
        If needed, StaticFile can be used to get a path where additional files can be put (e.g. generated image files).

        choices is a list of selections made by web-user in each options box:
            choices[0] - colon-separated Galaxy track name of the input FASTA
            choices[1] - Markov order (converted with int())
            choices[2] - number of random samples to generate (converted with int())

        All input sequences are concatenated and handed to the external
        java program MarkovModel, whose output line is cut back into
        pieces with the same headers and lengths as the input sequences.
        With one sample the result is written to galaxyFn; with several,
        each sample gets its own file, all are collected in
        'randomFastas.zip', and a link to the archive is printed.

        Note: the working directory of the process is changed to
        <HB_SOURCE_CODE_BASE_DIR>/third_party/nonpython and not restored.
        '''
        import os
        import subprocess
        import zipfile
        from proto.hyperbrowser.StaticFile import GalaxyRunSpecificFile
        from config.Config import HB_SOURCE_CODE_BASE_DIR
        from quick.application.ExternalTrackManager import ExternalTrackManager

        tempInStaticFile = GalaxyRunSpecificFile(['tempIn.txt'], galaxyFn)
        outStaticFile = GalaxyRunSpecificFile(['tempOut.fasta'], galaxyFn)
        inFn = ExternalTrackManager.extractFnFromGalaxyTN(
            choices[0].split(':'))
        tempOutFn = outStaticFile.getDiskPath(True)
        # java is started below without a class path, so it is run from here.
        os.chdir(HB_SOURCE_CODE_BASE_DIR + '/third_party/nonpython')
        markovOrder = int(choices[1])

        # Parse the FASTA into [header, sequence] pairs.
        seqs = []
        with open(inFn) as inF:
            for line in inF:
                if line.startswith('>'):
                    seqs.append([line[1:].strip(), []])
                else:
                    seqs[-1][1].append(line.strip())
        for seq in seqs:
            seq[1] = ''.join(seq[1])

        pureSequence = ''.join([seq[1] for seq in seqs])
        totalSeqLen = len(pureSequence)
        tempInStaticFile.writeTextToFile(pureSequence)
        tempInFn = tempInStaticFile.getDiskPath()
        numSamples = int(choices[2])

        zipOut = None
        if numSamples > 1:
            zipOutStatic = GalaxyRunSpecificFile(['randomFastas.zip'],
                                                 galaxyFn)
            zipOut = zipfile.ZipFile(zipOutStatic.getDiskPath(True), 'w')

        try:
            for iteration in range(numSamples):
                if numSamples > 1:
                    fastaOutStatic = GalaxyRunSpecificFile(
                        ['random', 's%s.fa' % iteration], galaxyFn)
                    fastaOutFn = fastaOutStatic.getDiskPath(True)
                else:
                    fastaOutFn = galaxyFn

                # Argument list + explicit stdout redirect instead of a
                # shell string, so paths need no quoting.
                with open(tempOutFn, 'w') as markovOutF:
                    subprocess.call(['java', 'MarkovModel', tempInFn,
                                     str(markovOrder), str(totalSeqLen)],
                                    stdout=markovOutF)

                with open(tempOutFn) as markovInF:
                    pureMarkovSequence = markovInF.readline().strip()

                # Cut the generated sequence into pieces matching the input.
                pmsIndex = 0
                with open(fastaOutFn, 'w') as fastaOutF:
                    for seq in seqs:
                        fastaOutF.write('>' + seq[0] + os.linesep)
                        nextPmsIndex = pmsIndex + len(seq[1])
                        fastaOutF.write(
                            pureMarkovSequence[pmsIndex:nextPmsIndex] +
                            os.linesep)
                        pmsIndex = nextPmsIndex

                # Sanity check: the generator must return exactly as many
                # symbols as the input contained.
                assert pmsIndex == totalSeqLen == len(pureMarkovSequence), (
                    pmsIndex, totalSeqLen, len(pureMarkovSequence))

                if zipOut is not None:
                    zipOut.write(fastaOutFn, fastaOutFn.split('/')[-1])
        finally:
            if zipOut is not None:
                zipOut.close()

        if numSamples > 1:
            print(zipOutStatic.getLink('Zipped random sequences'))
Exemplo n.º 7
0
    def findTFsTargetingGenes(cls, genome, tfSource, ensembleGeneIdList,
                              upFlankSize, downFlankSize, geneSource,
                              galaxyFn):
        #galaxyFn = '/usit/insilico/web/lookalike/galaxy_dist-20090924-dev/database/files/003/dataset_3347.dat'
        #print 'overriding galaxyFN!: ', galaxyFn
        uniqueWebPath = GalaxyRunSpecificFile([], galaxyFn).getDiskPath()

        assert genome in [
            'mm9', 'hg18', 'hg19'
        ]  #other genomes not supported. TF id links do not specify genome for pre-selection of analysis

        #if tfSource == 'UCSC tfbs conserved':
        #    tfTrackName = ['Gene regulation','TFBS','UCSC prediction track']
        #else:
        #    raise
        tfTrackNameMappings = TfInfo.getTfTrackNameMappings(genome)
        tfTrackName = tfTrackNameMappings[tfSource]

        #Get gene track
        #targetGeneRegsTempFn = uniqueWebPath + os.sep + 'geneRegs.bed'
        #geneRegsTrackName = GenomeInfo.getStdGeneRegsTn(genome)
        #geneRegsFn = getOrigFn(genome, geneRegsTrackName, '.category.bed')
        #GalaxyInterface.getGeneTrackFromGeneList(genome, geneRegsTrackName, ensembleGeneIdList, targetGeneRegsTempFn )

        if not (upFlankSize == downFlankSize == 0):
            unflankedGeneRegsTempFn = uniqueWebPath + os.sep + '_geneRegs.bed'
            #flankedGeneRegsTempFn  = uniqueWebPath + os.sep + 'flankedGeneRegs.bed'
            flankedGeneRegsTempStaticFile = GalaxyRunSpecificFile(
                ['flankedGeneRegs.bed'], galaxyFn)
            flankedGeneRegsTempFn = flankedGeneRegsTempStaticFile.getDiskPath()
            geneRegsTrackName = GenomeInfo.getStdGeneRegsTn(genome)
            #geneRegsFn = getOrigFn(genome, geneRegsTrackName, '.category.bed')
            GalaxyInterface.getGeneTrackFromGeneList(genome, geneRegsTrackName,
                                                     ensembleGeneIdList,
                                                     unflankedGeneRegsTempFn)
            GalaxyInterface.expandBedSegments(unflankedGeneRegsTempFn,
                                              flankedGeneRegsTempFn,
                                              genome,
                                              upFlankSize,
                                              downFlankSize,
                                              suffix='category.bed')
            #flankedGeneRegsExternalTN = ['external'] +galaxyId +  [flankedGeneRegsTempFn]
            regSpec, binSpec = 'category.bed', flankedGeneRegsTempFn
        else:
            regSpec, binSpec = '__genes__', ','.join(ensembleGeneIdList)

        res = cls._runCategoryPointCount(genome, regSpec, binSpec, tfTrackName)

        #trackName1 = tfTrackName
        #
        #analysisDef = 'Category point count: Number of elements each category of track1 (with overlaps)'+\
        #          '[tf1:=SegmentToStartPointFormatConverter:]'+\
        #          '-> FreqByCatStat'
        ##assert len(ensembleGeneIdList)==1
        ##geneId = ensembleGeneIdList[0]
        #
        #print '<div class="debug">'
        #userBinSource, fullRunArgs = GalaxyInterface._prepareRun(trackName1, None, analysisDef, regSpec, binSpec, genome)
        #res = AnalysisDefJob(analysisDef, trackName1, None, userBinSource, **fullRunArgs).run()
        #
        #print res
        ##GalaxyInterface._viewResults([res], galaxyFn)
        #print '</div>'
        tfs = res.getResDictKeys()

        genesPlural = 's' if len(ensembleGeneIdList) > 1 else ''
        tfsPlural = 's' if len(tfs) != 1 else ''
        print '<p>There are %i TF%s targeting your gene%s of interest (%s), using "%s" as source of TF occurrences.</p>' % (
            len(tfs), tfsPlural, genesPlural, ','.join(ensembleGeneIdList),
            tfSource)
        if not (upFlankSize == downFlankSize == 0):
            print '(using ', flankedGeneRegsTempStaticFile.getLink(
                'these genomic regions'), ' for genes)'
        expansionStr = ' flanked' if not (
            upFlankSize == downFlankSize == 0) else ''

        idHtmlFileNamer = GalaxyRunSpecificFile(['allTfIds.html'], galaxyFn)
        idHtmlFileNamer.writeTextToFile('<br>'.join([
            '<a href=%s/hyper?dbkey=%s&track1=%s&track2=>%s</a>' %
            (URL_PREFIX, genome, quote(':'.join(tfTrackName + [tf])), tf)
            for tf in tfs
        ]))
        #idHtmlFileNamer.writeTextToFile('<br>'.join(['<a href=/hbdev/hyper?track1=%s&track2=>%s</a>'%( ':'.join(tfTrackName+[tf]), tf) for tf in tfs]))
        print '<p>', idHtmlFileNamer.getLink(
            'Inspect html file'
        ), ' of all TF IDs occurring 1 or more times within your%s gene region%s of interest, with each TF ID linking to analysis with this TF pre-selected.</p>' % (
            expansionStr, genesPlural)

        idFileNamer = GalaxyRunSpecificFile(['allTfIds.txt'], galaxyFn)
        idFileNamer.writeTextToFile(os.linesep.join(tfs) + os.linesep)
        print '<p>', idFileNamer.getLink(
            'Inspect text file'
        ), ' listing all TF IDs occurring 1 or more times within your%s gene region%s of interest.</p>' % (
            expansionStr, genesPlural)

        extractedTfbsFileNamer = GalaxyRunSpecificFile(
            ['tfbsInGeneRegions.bed'], galaxyFn)
        GalaxyInterface.extractTrackManyBins(
            genome, tfTrackName, regSpec, binSpec, True, 'bed', False, False,
            extractedTfbsFileNamer.getDiskPath())
        print '<p>', extractedTfbsFileNamer.getLink(
            'Inspect bed-file'
        ), 'of all TF binding sites occurring within your%s gene region%s of interest.</p>' % (
            expansionStr, genesPlural)
Exemplo n.º 8
0
    def findTFsOccurringInRegions(cls, genome, tfSource, regionsBedFn,
                                  upFlankSize, downFlankSize, galaxyFn):
        uniqueWebPath = GalaxyRunSpecificFile([], galaxyFn).getDiskPath()
        #assert genome == 'hg18' #other genomes not supported. TF id links do not specify genome for pre-selection of analysis

        tfTrackNameMappings = TfInfo.getTfTrackNameMappings(genome)
        assert tfTrackNameMappings != {}, 'No TF info for genome: %s' % genome

        tfTrackName = tfTrackNameMappings[tfSource]

        if (upFlankSize == downFlankSize == 0):
            flankedRegionsFn = regionsBedFn
        else:
            flankedRegionsFn = uniqueWebPath + os.sep + 'flankedRegs.bed'
            GalaxyInterface.expandBedSegments(regionsBedFn, flankedRegionsFn,
                                              genome, upFlankSize,
                                              downFlankSize)

        regSpec, binSpec = 'bed', flankedRegionsFn
        res = cls._runCategoryPointCount(genome, regSpec, binSpec, tfTrackName)

        tfNames = res.getResDictKeys()
        #print 'RES: ', res.getGlobalResult()[tfNames[0]], type(res.getGlobalResult()[tfNames[0]])
        pwm2tfids = safeshelve.open(
            os.sep.join([HB_SOURCE_CODE_BASE_DIR, 'data', 'pwm2TFids.shelf']),
            'r')
        tf2class = safeshelve.open(
            os.sep.join([HB_SOURCE_CODE_BASE_DIR, 'data', 'TfId2Class.shelf']),
            'r')
        pwmName2id = safeshelve.open(
            os.sep.join([HB_SOURCE_CODE_BASE_DIR, 'data', 'pwmName2id.shelf']),
            'r')
        #print tfNames[0],tfNames[1], ' VS ', pwm2tfids.keys()[0], len(pwm2tfids)
        #tfs = list(reversed(sorted([(res.getGlobalResult()[tf], tf, '%s (%i hits (class %s))'%(tf, res.getGlobalResult()[tf]), '/'.join([tf2class[x] for x in pwm2tfids[tf]]) ) for tf in tfNames]))) #num hits, tfName, tfTextInclHits
        tfs = list(reversed(sorted([(res.getGlobalResult()[tf], tf, '%s (%i hits )'%(tf, res.getGlobalResult()[tf]) + \
                                     (' (class: %s)'%'/'.join(set([str(tf2class.get(x)) for x in pwm2tfids[pwmName2id[tf]] if x in tf2class]))\
                                      if (tf in pwmName2id and pwmName2id[tf] in pwm2tfids and any([x in tf2class for x in pwm2tfids[pwmName2id[tf]]]))\
                                    else '') ) \
                                    for tf in tfNames])) ) #num hits, tfName, tfTextInclHits

        tfsPlural = 's' if len(tfs) != 1 else ''
        print '<p>There are %i TF%s targeting your regions of interest, using "%s" as source of TF occurrences.</p>' % (
            len(tfs), tfsPlural, tfSource)

        expansionStr = ' flanked' if not (
            upFlankSize == downFlankSize == 0) else ''

        idHtmlFileNamer = GalaxyRunSpecificFile(['allTfIds.html'], galaxyFn)
        idHtmlFileNamer.writeTextToFile('<br>'.join([
            '<a href=/hbdev/hyper?track1=%s&track2=>%s</a>' %
            (quote(':'.join(tfTrackName + [tf[1]])), tf[2]) for tf in tfs
        ]))
        print '<p>', idHtmlFileNamer.getLink(
            'Inspect html file'
        ), ' of all TF IDs occurring 1 or more times within your%s regions of interest, with each TF ID linking to analysis with this TF pre-selected.</p>' % (
            expansionStr)

        idFileNamer = GalaxyRunSpecificFile(['allTfIds.txt'], galaxyFn)
        idFileNamer.writeTextToFile(
            os.linesep.join([tf[2] for tf in tfs]) + os.linesep)
        print '<p>', idFileNamer.getLink(
            'Inspect text file'
        ), ' listing all TF IDs occurring 1 or more times within your%s regions of interest.</p>' % (
            expansionStr)

        extractedTfbsFileNamer = GalaxyRunSpecificFile(
            ['tfbsInGeneRegions.bed'], galaxyFn)
        GalaxyInterface.extractTrackManyBins(
            genome, tfTrackName, regSpec, binSpec, True, 'bed', False, False,
            extractedTfbsFileNamer.getDiskPath(), True)
        print '<p>', extractedTfbsFileNamer.getLoadToHistoryLink(
            'Inspect bed-file'
        ), 'of all TF binding sites occurring within your%s regions of interest.</p>' % (
            expansionStr)

        for dummy, tf, dummy2 in tfs:
            extractedTfbsFileNamer = GalaxyRunSpecificFile(
                [tf + '_tfbsInGeneRegions.bed'], galaxyFn)
            GalaxyInterface.extractTrackManyBins(
                genome, tfTrackName + [tf], regSpec, binSpec, True, 'bed',
                False, False, extractedTfbsFileNamer.getDiskPath())
            print '<p>', extractedTfbsFileNamer.getLoadToHistoryLink(
                'Binding sites of the TF %s' % tf, 'bed'
            ), 'occurring within your%s regions of interest (bed-file).</p>' % (
                expansionStr)
    def MakeCircosConfFile(dataset, galaxyFn, outputFn):
        # Writes a Circos configuration to the run-specific file
        # 'circos.conf' and returns the command line that runs Circos on it.
        #
        # dataset maps a data file name to a dict of plot settings:
        #   'type' - 'line' or 'histogram' (written as a <plot> block, which
        #            also needs 'r0', 'r1', 'min' and 'max'), or 'highlight'
        #            (written as a <highlight> block, needs 'r0' and 'r1').
        #            Entries of any other type are ignored.
        # The image directory is the directory part of outputFn; the
        # karyotype file is fixed.
        confTemplate = """<<include etc/colors_fonts_patterns.conf>>
        <ideogram>
        <spacing>
        default = 0.005r
        break   = 0.5r
        axis_break_at_edge = yes
        axis_break         = yes
        axis_break_style   = 2
        <break_style 1>
        stroke_color = black
        fill_color   = blue
        thickness    = 0.25r
        stroke_thickness = 2
        </break>
        <break_style 2>
        stroke_color     = black
        stroke_thickness = 2
        thickness        = 1.5r
        </break>
        </spacing>
        
        #<<include ideogram.position.conf>>
        radius           = 0.85r
        thickness        = 30p
        fill             = yes
        fill_color       = black
        stroke_thickness = 2
        stroke_color     = black
        
        #<<include ideogram.label.conf>>
        show_label       = yes
        label_font       = default
        label_radius     = dims(ideogram,radius) + 0.075r
        label_size       = 36
        label_parallel   = yes
        label_case       = upper
        
        #<<include bands.conf>>
        show_bands            = yes
        fill_bands            = yes
        band_stroke_thickness = 2
        band_stroke_color     = white
        band_transparency     = 3
        
        
        </ideogram>
        
        
        #<<include ticks.conf>>
        show_ticks          = yes
        show_tick_labels    = yes
        
        <ticks>
        tick_separation      = 3p
        label_separation     = 5p
        radius               = dims(ideogram,radius_outer)
        multiplier           = 1e-6
        color          = black
        size           = 20p
        thickness      = 4p
        label_offset   = 5p
        format         = %%d
        
        <tick>
        spacing        = 1u
        show_label     = yes
        label_size     = 16p
        </tick>
        
        <tick>
        spacing        = 5u
        show_label     = yes
        label_size     = 18p
        </tick>
        
        <tick>
        spacing        = 10u
        show_label     = yes
        label_size     = 20p
        </tick>
        
        <tick>
        spacing        = 20u
        show_label     = yes
        label_size     = 24p
        </tick>
        </ticks>
        
        karyotype   = %s
        #data/karyotype/karyotype.human.hg19_mod.txt
        
        <image>
        
        dir   = %s
        file  = circos.png
        png   = yes
        svg   = no
        # radius of inscribed circle in image
        radius         = 1500p
        # by default angle=0 is at 3 o'clock position
        angle_offset      = -90
        #angle_orientation = counterclockwise
        auto_alpha_colors = yes
        auto_alpha_steps  = 5
        background = white
        
        </image>
        
        chromosomes_units = 1000000
        chromosomes_display_default = yes
        
        #chromosomes = hs1;hs2;hs3;hs4;hs5;hs6
        %s
        <<include etc/housekeeping.conf>>
        """

        plotsTemplate = "<plots>\n%s\n</plots>\n"
        # Filled with (file, plotType, r0, r1, minVal, maxVal)
        plotTemplate = "<plot>\nfile=%s\ntype=%s\nline\nr0=%s\nr1=%s\nmin=%s\nmax=%s\ncolor=black\nthickness=2\nextend_bin=no\naxis=yes\naxis_color=lgrey\naxis_thickness=2\naxis_spacing=0.1\n</plot>\n"
        highlightsTemplate = "<highlights>\n%s\n</highlights>\n"
        # Filled with (fn, r0, r1)
        highlightTemplate = "<highlight>\nfile=%s\nr0=%s\nr1= %s\n</highlight>\n"

        confFile = GalaxyRunSpecificFile(['circos.conf'], galaxyFn)
        imageDir = dirname(outputFn)

        plotBlocks = []
        highlightBlocks = []
        for dataFn, settings in dataset.items():
            dataType = settings['type']
            if dataType in ['line', 'histogram']:
                plotBlocks.append(plotTemplate %
                                  (dataFn, dataType, settings['r0'],
                                   settings['r1'], settings['min'],
                                   settings['max']))
            elif dataType in ['highlight']:
                highlightBlocks.append(highlightTemplate %
                                       (dataFn, settings['r0'],
                                        settings['r1']))

        # Only wrap a section when it actually has content.
        plotStr = ''.join(plotBlocks)
        if plotStr:
            plotStr = plotsTemplate % plotStr
        highlightStr = ''.join(highlightBlocks)
        if highlightStr:
            highlightStr = highlightsTemplate % highlightStr

        confText = confTemplate % ('data/karyotype/karyotype.human.hg19_mod.txt',
                                   imageDir, plotStr + highlightStr)
        confFile.writeTextToFile(confText)

        return 'circos -conf %s -noparanoid' % confFile.getDiskPath()
Exemplo n.º 10
0
class ExactlySpecifiedTF(object):
    def __init__(self, tf, chipSeqPeaks, pwm, tracks, galaxyFn):
        """Store the TF description and set up the run-specific output files.

        ``tracks`` must hold exactly two entries: the track of this TF and
        the mutation track. The PWM difference files live under
        'pwmDiffScore/<pwm><track parts joined by "_">/' and the fasta
        files under 'fastaFiles/<track parts joined by "_">/'.
        """
        self.tf = tf
        self.chipSeqPeaks = chipSeqPeaks
        self.pwm = pwm

        assert len(tracks) == 2
        self.track = tracks[0]
        self.mutationTrack = tracks[1]
        self.galaxyFn = galaxyFn

        trackId = '_'.join(self.track)
        # Note: the pwm name and the track id are joined without separator.
        pwmDiffDir = self.pwm + trackId

        self.bedPwmDiffScore = GalaxyRunSpecificFile(
            ['pwmDiffScore', pwmDiffDir, 'pwmDiff.bed'], galaxyFn)
        self.pwmDiffScore = GalaxyRunSpecificFile(
            ['pwmDiffScore', pwmDiffDir, 'pwmDiff.html'], galaxyFn)
        self.gtrackDiffScore = GalaxyRunSpecificFile(
            ['pwmDiffScore', pwmDiffDir, 'pwmDiff.gtrack'], galaxyFn)
        self.mutatedFasta = GalaxyRunSpecificFile(
            ['fastaFiles', trackId, 'mutatedFastseq.fasta'], galaxyFn)
        self.regularFasta = GalaxyRunSpecificFile(
            ['fastaFiles', trackId, 'regularFastseq.fasta'], galaxyFn)

        # PWM difference statistics start out unset.
        self.maxPwmDiff = None
        self.avgPwmDiff = None
        self.numPwmDiff = 0

    def getFastaFiles(self, genome):
        """Write the mutated and the original fasta file of the intersected regions.

        The TF track is intersected with the mutation track; the number of
        resulting region entries is stored as a string in
        ``self.intersectingPoints``. The sequences with and without the
        mutations applied are written to ``self.mutatedFasta`` and
        ``self.regularFasta``, ordered by chromosome name.
        """
        assert self.track
        assert self.mutationTrack

        regionDict, pointDict = self.IntersectData(
            genome, [self.track, self.mutationTrack])
        numRegionEntries = sum([len(regs) for regs in regionDict.values()])
        self.intersectingPoints = str(numRegionEntries)

        mutatedFastaDict = self.getMutatedSequence(genome, regionDict,
                                                   pointDict)
        regularFastaDict = self.getMutatedSequence(genome, regionDict)

        outputs = ((self.mutatedFasta, mutatedFastaDict),
                   (self.regularFasta, regularFastaDict))
        for fastaFile, fastaDict in outputs:
            chromTexts = []
            for chrom in sorted(fastaDict.keys()):
                chromTexts.append('\n'.join(fastaDict[chrom]))
            fastaFile.writeTextToFile('\n'.join(chromTexts))

    @classmethod
    def getMutatedSequence(cls, genome, regionDict, pointDict=None):
        """Return fasta entries for the given regions, optionally mutated.

        regionDict maps chromosome -> iterable of (start, end) pairs.
        pointDict, if given, maps chromosome -> list of
        [regionStart, offset, val] entries; for each region, entries with a
        matching regionStart replace the base at ``offset`` by ``val`` (or
        by the last character of ``val`` if it contains '>').

        Offsets outside the region (e.g. negative, for an element starting
        before the region) are skipped, so they can neither alter a base
        counted from the region end nor raise an IndexError.

        Returns a defaultdict(list) mapping chromosome -> list of
        '>chrom start+1-end\\nsequence' strings.
        """
        resultDict = defaultdict(list)
        fastaTrack = PlainTrack(['Sequence', 'DNA'])
        for chrom in regionDict.keys():
            for start, end in regionDict[chrom]:

                seqTv = fastaTrack.getTrackView(
                    GenomeRegion(genome, chrom, start, end))
                valList = list(seqTv.valsAsNumpyArray())
                if pointDict:
                    mutatedPoints = [
                        v[1:] for v in pointDict[chrom] if v[0] == start
                    ]
                    for index, val in mutatedPoints:
                        if not 0 <= index < len(valList):
                            continue
                        valList[index] = val[-1] if '>' in val else val
                resultDict[chrom].append(
                    '>%s %i-%i\n%s' %
                    (chrom, start + 1, end, ''.join(valList)))

        return resultDict

    @classmethod
    def IntersectData(cls, genome, tracks):
        """Intersect the elements of two tracks chromosome by chromosome.

        ``tracks`` must hold exactly two track names; their genome element
        sources are obtained with getGeSource. Elements of the first track
        are treated as regions (start, end), elements of the second as
        (start, end, val).

        Both lists are walked in sorted order with one shared position in
        the second list, which is not reset between regions.

        Returns (resultDict, pointDict), both defaultdict(list) keyed by
        chromosome. For every overlapping pair, resultDict gets
        [regionStart, regionEnd] and pointDict gets
        [regionStart, elementStart - regionStart, str(val)]. A region
        overlapped by several elements therefore occurs several times in
        resultDict.
        """
        from quick.util.CommonFunctions import getGeSource

        gs1, gs2 = [getGeSource(track, genome) for track in tracks]

        resultDict, pointDict = defaultdict(list), defaultdict(list)
        track1Dict, track2Dict = defaultdict(list), defaultdict(list)

        for ge in gs1:
            track1Dict[ge.chr].append((ge.start, ge.end))

        for ge in gs2:
            track2Dict[ge.chr].append((ge.start, ge.end, ge.val))

        for chrom in track1Dict.keys():
            counter = 0
            track2List = sorted(track2Dict[chrom])
            for start1, end1 in sorted(track1Dict[chrom]):
                while len(track2List) > counter:
                    start2, end2, val = track2List[counter]
                    # Element ends or starts inside the region, or spans
                    # the whole region.
                    overlaps = (start1 < end2 <= end1
                                or start1 <= start2 < end1
                                or (start2 < start1 and end2 > end1))
                    if overlaps:
                        resultDict[chrom].append([start1, end1])
                        pointDict[chrom].append(
                            [start1, start2 - start1,
                             str(val)])
                    elif start2 >= end1:
                        # Element lies after the region: continue with the
                        # next region from this same element.
                        break
                    counter += 1
        return resultDict, pointDict

    def getPwmScores(self, motifId, moticScanObj):
        """Score the reference vs. mutated sequences and write result files.

        Calls ``moticScanObj.scanMotifInTwoSequences`` on the disk paths of
        ``self.regularFasta`` and ``self.mutatedFasta``. The returned mapping
        is keyed by region strings of the form ``'<chrom> <start>-<end>'``;
        each value is a ``(regular, mutated)`` pair whose items unpack as
        ``(score, counterpart score, sequence, counterpart sequence,
        (hit start offset, hit end offset))``.

        When at least one region is returned, the method sets
        ``self.maxPwmDiff``, ``self.avgPwmDiff`` (both strings) and
        ``self.numPwmDiff``, and writes:

        * ``self.gtrackDiffScore`` - header plus one tab-separated line per
          region, ordered by decreasing difference;
        * ``self.pwmDiffScore`` - the HTML table from ``getHtmlPwmTable``;
        * ``self.bedPwmDiffScore`` - chrom/start/end/difference per region.

        When no region is returned, nothing is written and none of the
        attributes above are set.

        :param motifId: motif identifier forwarded to the scanner.
        :param moticScanObj: object providing ``scanMotifInTwoSequences``.
        """
        pwmRegDict = moticScanObj.scanMotifInTwoSequences(
            motifId, self.regularFasta.getDiskPath(),
            self.mutatedFasta.getDiskPath())
        diffResDict = defaultdict(list)
        lineTab = []
        for region in sorted(pwmRegDict):
            chrom, start = region.split()
            end = region.replace('-', ' ').split()[-1]
            start = int(start.split('-')[0])
            regular, mutated = pwmRegDict[region]
            difference = abs(regular[0] - mutated[0])
            reg, regMut, mut, mutReg = regular[:2] + mutated[:2]
            regSeq, regMutSeq, regPos = regular[2:]
            mutSeq, mutRegSeq, mutPos = mutated[2:]

            # Fragments shared by the GTrack line and the HTML table row.
            regScores = '[%f -> %f]' % (reg, regMut)
            mutScores = '[%f -> %f]' % (mut, mutReg)
            # Hit offsets are relative to the region start.
            regHit = '%s:%i-%i' % (chrom, start + regPos[0],
                                   start + regPos[1])
            mutHit = '%s:%i-%i' % (chrom, start + mutPos[0],
                                   start + mutPos[1])

            gtrackLine = '%s\t%f\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % (
                region.replace('-', ' ').replace(' ', '\t'), difference,
                regScores, mutScores, regHit, regSeq, regMutSeq, mutHit,
                mutSeq, mutRegSeq)
            diffResDict[difference].append(gtrackLine)
            lineTab.append([
                chrom,
                str(start),
                str(end),
                str(difference),
                regScores,
                mutScores,
                regHit,
                regSeq, regMutSeq,
                mutHit,
                mutSeq, mutRegSeq
            ])

        # NOTE(review): the statistics below are taken over the *distinct*
        # difference values (the dict keys), not over all regions - confirm
        # that this is intended.
        diffList = list(diffResDict)
        if diffList:
            self.maxPwmDiff = str(max(diffList))
            self.avgPwmDiff = str(sum(diffList) / len(diffList))
            self.numPwmDiff = len(diffList)
            # NOTE(review): the '###' column-spec line below names 11 columns
            # while every data line carries 12 tab-separated fields - confirm
            # against the GTrack consumers before changing it.
            line = '# GTrack file\n#The columns in this dataset are:\n#\t(ChIP-seq_peak)chr\n#\tstart\n#\tend\n#\tmax(difference in column 5, difference in column 6)\n#\t[best_reference_sequence_PWM_hit_score -> corresponding_mutated_sequence_score]\n#\t[best_mutated_sequence_PWM_hit_score -> corresponding_reference_sequence_score]\n#\tchr:start-end(best_reference_sequence_PWM_hit_motif)\n#\tbest_reference_sequence_PWM_hit_motif\n#\tcorresponding_mutated_sequence_motif\n#\tchr:start-stop(best_mutated_sequence_PWM_hit_motif)\n#\tbest_mutated_sequence_PWM_hit_motif\n#\tcorresponding_reference_sequence_motif)\n##track type: valued segments\n##value column: val\n###seqid\tstart\tend\tval\treference_sequence_PWM\tmutated_sequence_PWM_hit_score\tbest_reference_sequence_PWM_hit_motif\tcorresponding_mutated_sequence_motif\tchr:start-stop(best_mutated_sequence_PWM_hit_motif)\tbest_mutated_sequence_PWM_hit_motif\tcorresponding_reference_sequence_motif\n'
            self.gtrackDiffScore.writeTextToFile(line)
            self.pwmDiffScore.writeTextToFile(self.getHtmlPwmTable(lineTab))
            self.bedPwmDiffScore.writeTextToFile('\n'.join(
                ['\t'.join(v[:4]) for v in lineTab]))
            # Collect every data line first and append them in one
            # newline-joined write: appending each difference group
            # separately left no line break between consecutive groups.
            dataLines = []
            for k in sorted(diffList, reverse=True):
                dataLines.extend(diffResDict[k])
            self.gtrackDiffScore.writeTextToFile('\n'.join(dataLines),
                                                 mode='a')

    def getHtmlPwmTable(self, lineTab):
        """Return an HTML page holding a sortable table of PWM result rows.

        :param lineTab: iterable of rows; each row is passed unchanged to
            ``HtmlCore.tableLine`` and is expected to line up with the twelve
            column titles defined below.
        :return: the string form of the assembled ``HtmlCore`` page.
        """
        columnTitles = [
            'chrom', 'start', 'end', 'max PWM difference',
            'best reference seq_PWM score -> corresponding mut seq score',
            'best mut seq PWM score -> corresponding_ref seq score',
            'ref region', 'ref seq', 'corresponding mut seq', 'mut region',
            'mut seq', 'corresponding ref seq'
        ]
        page = HtmlCore()
        page.begin()
        page.tableHeader(columnTitles, sortable=True)
        # Every row is emitted; there is no per-row filtering.
        for cells in lineTab:
            page.tableLine(cells)
        page.tableFooter()
        page.end()
        return str(page)

    def makeHtmlStr(self):
        """Write the summary HTML page for this TF and return a link to it.

        The page lists the TF name, ChIP-seq peak source, PWM, number of
        SNV-intersected binding regions, highest and average binding
        difference, followed by links to the original FASTA, the mutated
        FASTA, the per-region PWM score table and the GTrack score file.
        Entries are separated by ``<br/>`` and a newline.

        :return: link to the written page, labelled with the TF name and the
            last element of ``self.track``.
        """
        htmlPage = GalaxyRunSpecificFile(
            ['html', '_'.join(self.track), 'page.html'], self.galaxyFn)

        entries = [
            'TF: ' + self.tf,
            'Chip-seq peaks: ' + self.chipSeqPeaks,
            'PWM: ' + self.pwm,
            'Number of SNV-intersected binding regions: ' +
            self.intersectingPoints,
            'Highest binding difference: ' + self.maxPwmDiff,
            'Avg binding difference: ' + self.avgPwmDiff,
            self.regularFasta.getLink('Original Fasta'),
            self.mutatedFasta.getLink('Mutated Fasta'),
            self.pwmDiffScore.getLink('PWM score for each region'),
            self.gtrackDiffScore.getLink(
                'Gtrack of PWM score for each region'),
        ]
        htmlPage.writeTextToFile('<br/>\n'.join(entries))

        linkLabel = self.tf + ':   ' + self.track[-1]
        return htmlPage.getLink(linkLabel)