Пример #1
0
    def __init__(self, geSource):
        """Wrap ``geSource`` and initialise empty statistics containers.

        The source is stored in decorated form (via ``_decorateGESource``),
        while the track format is derived from the ``geSource`` argument as
        it was passed in. ``_hasCalculatedStats`` starts out as False.
        """
        self._geSource = self._decorateGESource(geSource)
        self._boundingRegionsAndGEsCorresponds = None

        # Build the track format once and query it for both type names,
        # rather than constructing an identical instance for each query.
        trackFormat = TrackFormat.createInstanceFromGeSource(geSource)
        self._areValsCategorical = trackFormat.getValTypeName() == 'Category'
        self._areEdgeWeightsCategorical = trackFormat.getWeightTypeName() == 'Category'
        self._valCategories = set()
        self._edgeWeightCategories = set()

        # Statistics containers; the factory passed to OrderedDefaultDict
        # presumably supplies the value for keys not seen before.
        self._numElements = OrderedDefaultDict(int)
        self._maxStrLens = OrderedDefaultDict(partial(self._initMaxStrLens, self._getMaxStrLensKeys()))
        self._maxNumEdges = OrderedDefaultDict(int)

        self._hasCalculatedStats = False
Пример #2
0
    def __init__(self, geSource):
        """Wrap ``geSource`` and initialise empty statistics containers.

        The source is stored in decorated form (via ``_decorateGESource``),
        while the track format is derived from the ``geSource`` argument as
        it was passed in. ``_hasCalculatedStats`` starts out as False.
        """
        self._geSource = self._decorateGESource(geSource)
        self._boundingRegionsAndGEsCorrespond = None

        # A single TrackFormat instance answers both type-name queries;
        # there is no need to construct it once per query.
        trackFormat = TrackFormat.createInstanceFromGeSource(geSource)
        self._areValsCategorical = trackFormat.getValTypeName() == 'Category'
        self._areEdgeWeightsCategorical = trackFormat.getWeightTypeName() == 'Category'
        self._valCategories = set()
        self._edgeWeightCategories = set()

        # Statistics containers; the factory passed to OrderedDefaultDict
        # presumably supplies the value for keys not seen before.
        self._numElements = OrderedDefaultDict(int)
        self._maxStrLens = OrderedDefaultDict(partial(self._initMaxStrLens, self._getMaxStrLensKeys()))
        self._maxNumEdges = OrderedDefaultDict(int)

        self._hasCalculatedStats = False
Пример #3
0
    def _composeContents(self, out, hbColumns, columns, geSource,
                         onlyNonDefault=True, singleDataLine=False):
        """Write header lines, the column specification line and data lines to ``out``.

        Iterates over the (bounding region, element list) pairs produced by
        ``iterateOverBRTuplesWithContainedGEs(geSource, ...)``. A bounding
        region line is written whenever the region is not None, followed by
        one data line per element that remains after
        ``_removeStartElementIfApplicable``.

        When ``singleDataLine`` is true, writing stops after the first pair,
        and after at most one data line within it.

        Note that the track format is built from ``self._geSource``, not from
        the ``geSource`` argument, and that the "last element" flag compares
        against the length of the unfiltered element list.
        """
        trackFormat = TrackFormat.createInstanceFromGeSource(self._geSource)
        out.write(self._composeHeaderLines(onlyNonDefault))
        out.write(self._composeColSpecLine(columns))

        brTuplesWithGEs = iterateOverBRTuplesWithContainedGEs(
            geSource, onlyYieldTwoGEs=singleDataLine)

        for boundingRegion, elements in brTuplesWithGEs:
            if boundingRegion is not None:
                out.write(self._composeBoundingRegionLine(boundingRegion))

            dataElements = self._removeStartElementIfApplicable(trackFormat, elements)
            # Line numbers passed on to _composeDataLine start at 1.
            for lineNum, element in enumerate(dataElements, 1):
                isLast = lineNum == len(elements)
                out.write(self._composeDataLine(element, hbColumns, lineNum, isLast))

                if singleDataLine:
                    # Nothing follows the loops, so returning equals breaking out of both.
                    return

            if singleDataLine:
                return
Пример #4
0
 def _getGESourceManagerFromGESource(self, geSource):
     """Return a RegionBasedGESourceManager for ``geSource`` over ``self._regionList``.

     Sources whose track format does not report a dense representation get a
     manager with ``calcStatsInExtraPass`` and ``countElsInBoundingRegions``
     both True. Dense representations get both flags False, and are only
     accepted for the value types 'Number', 'Number (integer)' and
     'Case-control'.

     Raises:
         NotSupportedError: for a dense representation with any other value type.
     """
     trackFormat = TrackFormat.createInstanceFromGeSource(geSource)

     if not trackFormat.reprIsDense():
         return RegionBasedGESourceManager(geSource, self._regionList,
                                           calcStatsInExtraPass=True,
                                           countElsInBoundingRegions=True)

     supportedDenseValTypes = ('Number', 'Number (integer)', 'Case-control')
     if trackFormat.getValTypeName() not in supportedDenseValTypes:
         raise NotSupportedError

     return RegionBasedGESourceManager(geSource, self._regionList,
                                       calcStatsInExtraPass=False,
                                       countElsInBoundingRegions=False)
Пример #5
0
 def _getGESourceManagerFromGESource(self, geSource):
     """Return a RegionBasedGESourceManager for ``geSource`` over ``self._regionList``.

     Sources whose track format does not report a dense representation get a
     manager with ``calcStatsInExtraPass`` and ``countElsInBoundingRegions``
     both True. Dense representations get both flags False, and are only
     accepted for the value types 'Number', 'Number (integer)' and
     'Case-control'.

     Raises:
         NotSupportedError: for a dense representation with any other value type.
     """
     trackFormat = TrackFormat.createInstanceFromGeSource(geSource)

     if not trackFormat.reprIsDense():
         return RegionBasedGESourceManager(geSource, self._regionList,
                                           calcStatsInExtraPass=True,
                                           countElsInBoundingRegions=True)

     supportedDenseValTypes = ('Number', 'Number (integer)', 'Case-control')
     if trackFormat.getValTypeName() not in supportedDenseValTypes:
         raise NotSupportedError

     return RegionBasedGESourceManager(geSource, self._regionList,
                                       calcStatsInExtraPass=False,
                                       countElsInBoundingRegions=False)
Пример #6
0
 def _composeContents(self, out, hbColumns, columns, geSource,
                      onlyNonDefault=True, singleDataLine=False):
     """Write header lines, the column specification line and data lines to ``out``.

     Iterates over the (bounding region, element list) pairs produced by
     ``iterateOverBRTuplesWithContainedGEs(geSource, ...)``. A bounding
     region line is written whenever the region is not None, followed by
     one data line per element that remains after
     ``_removeStartElementIfApplicable``.

     When ``singleDataLine`` is true, writing stops after the first pair,
     and after at most one data line within it.

     Note that the track format is built from ``self._geSource``, not from
     the ``geSource`` argument, and that the "last element" flag compares
     against the length of the unfiltered element list.
     """
     trackFormat = TrackFormat.createInstanceFromGeSource(self._geSource)
     out.write(self._composeHeaderLines(onlyNonDefault))
     out.write(self._composeColSpecLine(columns))

     brTuplesWithGEs = iterateOverBRTuplesWithContainedGEs(
         geSource, onlyYieldTwoGEs=singleDataLine)

     for boundingRegion, elements in brTuplesWithGEs:
         if boundingRegion is not None:
             out.write(self._composeBoundingRegionLine(boundingRegion))

         dataElements = self._removeStartElementIfApplicable(trackFormat, elements)
         # Line numbers passed on to _composeDataLine start at 1.
         for lineNum, element in enumerate(dataElements, 1):
             isLast = lineNum == len(elements)
             out.write(self._composeDataLine(element, hbColumns, lineNum, isLast))

             if singleDataLine:
                 # Nothing follows the loops, so returning equals breaking out of both.
                 return

         if singleDataLine:
             return
Пример #7
0
    def testSorting(self):
        geSourceTest = self._commonSetup()

        for caseName in geSourceTest.cases:
            if not caseName.startswith("gtrack"):
                continue

            if "no_sort" in caseName:
                print "Test case skipped: " + caseName
                continue

            print caseName
            print

            case = geSourceTest.cases[caseName]
            testFn = self._writeTestFile(case)
            print open(testFn).read()
            print

            sortedContents = sortGtrackFileAndReturnContents(testFn, case.genome)
            print sortedContents

            sourceClass = GenomeElementSource if case.sourceClass is None else case.sourceClass
            forPreProcessor = True if case.sourceClass is None else False
            sortedGeSource = GEDependentAttributesHolder(
                sourceClass(
                    "sortedFile.gtrack",
                    case.genome,
                    forPreProcessor=forPreProcessor,
                    printWarnings=False,
                    strToUseInsteadOfFn=sortedContents,
                )
            )

            reprIsDense = TrackFormat.createInstanceFromGeSource(sortedGeSource).reprIsDense()

            if not reprIsDense:
                self.assertEquals(sorted(case.assertElementList), [ge for ge in sortedGeSource])
            else:
                for ge in sortedGeSource:
                    pass

            self.assertEquals(
                sorted(case.boundingRegionsAssertList), [br for br in sortedGeSource.getBoundingRegionTuples()]
            )
Пример #8
0
    def _compose(self, out):
        """Write the track held by ``self._geSource`` to ``out`` in wiggle format.

        Output starts with a 'track type=wiggle_0' line, including a name
        built from the track name when one is available. Each non-empty
        bounding region then gets a fixed-step or variable-step declaration
        line followed by one line per element. In variable-step mode a new
        declaration line is written whenever the chromosome or the variable
        span of an element differs from the current declaration.
        """
        trackName = self._geSource.getTrackName()
        name = None
        if trackName is not None:
            # NOTE(review): the track name is fetched from the source again here.
            name = ':'.join(self._geSource.getTrackName()).replace(' ', '_')

        trackLine = 'track type=wiggle_0'
        if name is not None:
            trackLine += ' name=%s' % name
        print >> out, trackLine

        trackFormat = TrackFormat.createInstanceFromGeSource(self._geSource)
        span = self._geSource.getFixedLength()
        step = self._geSource.getFixedGapSize() + span

        isFixedStep = trackFormat.reprIsDense() or step > 1 or (step == 1 and span != 1)

        for brt, geList in iterateOverBRTuplesWithContainedGEs(self._geSource):
            if len(geList) == 0:
                continue

            if isFixedStep:
                self._composeFixedStepDeclarationLine(out, brt.region, step, span)
            else:
                curChr, curSpan = self._composeVariableStepDeclarationLine(out, geList[0])

            for index, ge in enumerate(geList):
                # The first element of a dense interval track is left out when
                # the source reports that it adds a start element itself.
                isAddedStartElement = (index == 0
                                       and trackFormat.isDense()
                                       and trackFormat.isInterval()
                                       and self._geSource.addsStartElementToDenseIntervals())
                if isAddedStartElement:
                    continue

                val = self._commonFormatNumberVal(ge.val)

                if isFixedStep:
                    cols = [val]
                else:
                    needsNewDeclaration = (ge.chr != curChr
                                           or self._getVariableSpan(ge) != curSpan)
                    if needsNewDeclaration:
                        curChr, curSpan = self._composeVariableStepDeclarationLine(out, ge)
                    # start is written as start + 1 (presumably 0-based to 1-based; confirm).
                    cols = [str(ge.start + 1), val]

                print >> out, '\t'.join(str(col) for col in cols)
Пример #9
0
    def _allGESourceManagers(self, trackName, allowOverlaps):
        """Yield the GESource managers to use when preprocessing ``trackName``.

        If ``allowOverlaps`` equals False and the metadata collector reports
        the overlap rule as finalized, a single manager obtained from the
        track itself is yielded. Otherwise one manager is yielded for every
        GESource of the track that ``PreProcessUtils.shouldPreProcessGESource``
        accepts.

        With ``allowOverlaps`` equal to True, the generator ends altogether
        as soon as a GESource is dense or has no overlapping elements.
        ``self._status`` is updated before each candidate is considered.
        """
        collector = PreProcMetaDataCollector(self._genome, trackName)

        # '==' is kept on purpose: 'not allowOverlaps' would also match None.
        if allowOverlaps == False and collector.overlapRuleHasBeenFinalized(True):
            statusParts = [
                'Trying to prepare preprocessing for track "%s"' % ':'.join(trackName),
                ' (allowOverlaps: %s)' % allowOverlaps,
            ]
            self._status = ''.join(statusParts)
            yield self._getGESourceManagerFromTrack(trackName)
            return

        for geSource in self._allGESources(trackName):
            if allowOverlaps == True:
                trackFormat = TrackFormat.createInstanceFromGeSource(geSource)
                if trackFormat.isDense() or geSource.hasNoOverlappingElements():
                    # Ends the whole generator, not just this GESource.
                    return

            statusParts = ['Trying to prepare preprocessing for track "%s"' % ':'.join(trackName)]
            if geSource.hasOrigFile():
                statusParts.append(' (filename: "%s")' % geSource.getFileName())
            statusParts.append(' (allowOverlaps: %s)' % allowOverlaps)
            self._status = ''.join(statusParts)

            if PreProcessUtils.shouldPreProcessGESource(trackName, geSource, allowOverlaps):
                yield self._getGESourceManagerFromGESource(geSource)
Пример #10
0
    def _allGESourceManagers(self, trackName, allowOverlaps):
        """Yield the GESource managers to use when preprocessing ``trackName``.

        If ``allowOverlaps`` equals False and the metadata collector reports
        the overlap rule as finalized, a single manager obtained from the
        track itself is yielded. Otherwise one manager is yielded for every
        GESource of the track that ``PreProcessUtils.shouldPreProcessGESource``
        accepts.

        With ``allowOverlaps`` equal to True, the generator ends altogether
        as soon as a GESource is dense or has no overlapping elements.
        ``self._status`` is updated before each candidate is considered.
        """
        collector = PreProcMetaDataCollector(self._genome, trackName)

        # '==' is kept on purpose: 'not allowOverlaps' would also match None.
        if allowOverlaps == False and collector.overlapRuleHasBeenFinalized(True):
            statusParts = [
                'Trying to prepare preprocessing for track "%s"' % ':'.join(trackName),
                ' (allowOverlaps: %s)' % allowOverlaps,
            ]
            self._status = ''.join(statusParts)
            yield self._getGESourceManagerFromTrack(trackName)
            return

        for geSource in self._allGESources(trackName):
            if allowOverlaps == True:
                trackFormat = TrackFormat.createInstanceFromGeSource(geSource)
                if trackFormat.isDense() or geSource.hasNoOverlappingElements():
                    # Ends the whole generator, not just this GESource.
                    return

            statusParts = ['Trying to prepare preprocessing for track "%s"' % ':'.join(trackName)]
            if geSource.hasOrigFile():
                statusParts.append(' (filename: "%s")' % geSource.getFileName())
            statusParts.append(' (allowOverlaps: %s)' % allowOverlaps)
            self._status = ''.join(statusParts)

            if PreProcessUtils.shouldPreProcessGESource(trackName, geSource, allowOverlaps):
                yield self._getGESourceManagerFromGESource(geSource)
Пример #11
0
    def _compose(self, out):
        """Write the track held by ``self._geSource`` to ``out`` in wiggle format.

        Output starts with a 'track type=wiggle_0' line, including a name
        built from the track name when one is available. Each non-empty
        bounding region then gets a fixed-step or variable-step declaration
        line followed by one line per element. In variable-step mode a new
        declaration line is written whenever the chromosome or the variable
        span of an element differs from the current declaration.
        """
        trackName = self._geSource.getTrackName()
        name = None
        if trackName is not None:
            # NOTE(review): the track name is fetched from the source again here.
            name = ':'.join(self._geSource.getTrackName()).replace(' ', '_')

        trackLine = 'track type=wiggle_0'
        if name is not None:
            trackLine += ' name=%s' % name
        print >>out, trackLine

        trackFormat = TrackFormat.createInstanceFromGeSource(self._geSource)
        span = self._geSource.getFixedLength()
        step = self._geSource.getFixedGapSize() + span

        isFixedStep = trackFormat.reprIsDense() or step > 1 or (step == 1 and span != 1)

        for brt, geList in iterateOverBRTuplesWithContainedGEs(self._geSource):
            if len(geList) == 0:
                continue

            if isFixedStep:
                self._composeFixedStepDeclarationLine(out, brt.region, step, span)
            else:
                curChr, curSpan = self._composeVariableStepDeclarationLine(out, geList[0])

            for index, ge in enumerate(geList):
                # The first element of a dense interval track is left out when
                # the source reports that it adds a start element itself.
                isAddedStartElement = (index == 0
                                       and trackFormat.isDense()
                                       and trackFormat.isInterval()
                                       and self._geSource.addsStartElementToDenseIntervals())
                if isAddedStartElement:
                    continue

                val = self._commonFormatNumberVal(ge.val)

                if isFixedStep:
                    cols = [val]
                else:
                    needsNewDeclaration = (ge.chr != curChr
                                           or self._getVariableSpan(ge) != curSpan)
                    if needsNewDeclaration:
                        curChr, curSpan = self._composeVariableStepDeclarationLine(out, ge)
                    # start is written as start + 1 (presumably 0-based to 1-based; confirm).
                    cols = [str(ge.start + 1), val]

                print >>out, '\t'.join(str(col) for col in cols)
Пример #12
0
            testFn = self._writeTestFile(case)
            print open(testFn).read()
            print

            sortedContents = sortGtrackFileAndReturnContents(
                testFn, case.genome)
            print sortedContents

            sourceClass = GenomeElementSource if case.sourceClass is None else case.sourceClass
            forPreProcessor = True if case.sourceClass is None else False
            sortedGeSource = GEDependentAttributesHolder(sourceClass('sortedFile.gtrack', case.genome, \
                                                                     forPreProcessor=forPreProcessor, \
                                                                     printWarnings=False, \
                                                                     strToUseInsteadOfFn=sortedContents))

            reprIsDense = TrackFormat.createInstanceFromGeSource(
                sortedGeSource).reprIsDense()

            if not reprIsDense:
                self.assertEquals(sorted(case.assertElementList),
                                  [ge for ge in sortedGeSource])
            else:
                for ge in sortedGeSource:
                    pass

            self.assertEquals(
                sorted(case.boundingRegionsAssertList),
                [br for br in sortedGeSource.getBoundingRegionTuples()])

    def runTest(self):
        """Do nothing and return None.

        NOTE(review): presumably present so the test case class can be
        instantiated without naming a test method; confirm against callers.
        """
        return None
Пример #13
0
 def _init(self):
     """Determine whether every value of the source lies in the range 0..1000.

     ``_allValsAreBedVals`` is set to True only if the track format reports
     the value type 'Number (integer)' and all elements of ``self._geSource``
     have ``0 <= val <= 1000``; otherwise it stays False. The source is only
     iterated for integer-valued tracks.
     """
     self._allValsAreBedVals = False

     valTypeName = TrackFormat.createInstanceFromGeSource(self._geSource).getValTypeName()
     if valTypeName != 'Number (integer)':
         return

     self._allValsAreBedVals = all(0 <= element.val <= 1000 for element in self._geSource)
Пример #14
0
 def _init(self):
     """Determine whether every value of the source lies in the range 0..1000.

     ``_allValsAreBedVals`` is set to True only if the track format reports
     the value type 'Number (integer)' and all elements of ``self._geSource``
     have ``0 <= val <= 1000``; otherwise it stays False. The source is only
     iterated for integer-valued tracks.
     """
     self._allValsAreBedVals = False

     valTypeName = TrackFormat.createInstanceFromGeSource(self._geSource).getValTypeName()
     if valTypeName != 'Number (integer)':
         return

     self._allValsAreBedVals = all(0 <= element.val <= 1000 for element in self._geSource)