Пример #1
0
def evaluatePvalueAndNullDistributionList(observedAndMcSamplesTuple, tail, rawStatisticMainClassName):
    """Evaluate p-value and null distribution once per reference track title.

    observedAndMcSamplesTuple is indexed as [0] = observed result and
    [1] = iterable of Monte Carlo sample results. The observed result and each
    sample are read through .values(); every value must answer isPairedTs(),
    expose .result, and give its track title via ['reference'].metadata['title'].

    tail and rawStatisticMainClassName are passed unchanged to
    evaluatePvalueAndNullDistribution.

    Returns an OrderedDict mapping each observed track title to the value
    returned by evaluatePvalueAndNullDistribution for the pair
    (observed value, list of Monte Carlo sample values for that title).

    Raises Exception when not every observed value is a paired track structure.
    """
    resultsDict = OrderedDict()
    #TODO: What is received is not a list of tuples, it is a tuple of the real result which is a
    # TrackStructure whose result is a list of raw values and list of such track structures.
    # Need to find a way to handle it.

    observedResult = observedAndMcSamplesTuple[0]
    mcSamplesTsList = observedAndMcSamplesTuple[1]

    #TODO: What about categorial ts results?
    # Built as a full list on purpose: isPairedTs() is called on every value.
    pairedFlags = [ts.isPairedTs() for ts in observedResult.values()]
    observedByTitle = OrderedDict()
    mcSamplesByTitle = OrderedDefaultDict(list)

    if not all(pairedFlags): #isFlat?
        raise Exception('not implemented yet!')

    # Observed value per reference track; titles must be unique.
    for observedPairedTs in observedResult.values():
        title = observedPairedTs['reference'].metadata['title']
        assert title not in observedByTitle, "%s already in observed results dict" % title
        observedByTitle[title] = observedPairedTs.result

    # Group the Monte Carlo sample values by reference track title.
    for sampleTs in mcSamplesTsList:
        for samplePairedTs in sampleTs.values():
            title = samplePairedTs['reference'].metadata['title']
            mcSamplesByTitle[title].append(samplePairedTs.result)

    for title, observation in observedByTitle.iteritems():
        observedAndSamples = (observation, mcSamplesByTitle[title])
        resultsDict[title] = evaluatePvalueAndNullDistribution(
            observedAndSamples, tail, rawStatisticMainClassName)

    return resultsDict
Пример #2
0
    def __init__(self, geSource):
        """Store the decorated source and initialise empty statistics containers."""
        self._geSource = self._decorateGESource(geSource)
        self._boundingRegionsAndGEsCorrespond = None

        # The categorical flags are read from the track format of the
        # undecorated geSource, one TrackFormat instance per flag.
        valTypeName = TrackFormat.createInstanceFromGeSource(geSource).getValTypeName()
        self._areValsCategorical = (valTypeName == 'Category')
        weightTypeName = TrackFormat.createInstanceFromGeSource(geSource).getWeightTypeName()
        self._areEdgeWeightsCategorical = (weightTypeName == 'Category')
        self._valCategories = set()
        self._edgeWeightCategories = set()

        self._numElements = OrderedDefaultDict(int)
        # The key list is computed once, here, and bound into the factory.
        maxStrLensKeys = self._getMaxStrLensKeys()
        maxStrLensFactory = partial(self._initMaxStrLens, maxStrLensKeys)
        self._maxStrLens = OrderedDefaultDict(maxStrLensFactory)
        self._maxNumEdges = OrderedDefaultDict(int)

        self._hasCalculatedStats = False
Пример #3
0
class GESourceManager(object):
    """Wraps a genome element source and caches summary statistics about it.

    The source is decorated once at construction. The statistics (element
    count per chromosome, value and edge weight categories, maximum string
    lengths and maximum edge counts) are collected lazily: the first
    statistics getter that is called triggers one extra pass over the source.
    """

    def __init__(self, geSource):
        """Store the decorated source and initialise empty statistics containers."""
        self._geSource = self._decorateGESource(geSource)
        # Stays None until getBoundingRegionTuples() has been called.
        self._boundingRegionsAndGEsCorrespond = None

        self._areValsCategorical = TrackFormat.createInstanceFromGeSource(geSource).getValTypeName() == 'Category'
        self._areEdgeWeightsCategorical = TrackFormat.createInstanceFromGeSource(geSource).getWeightTypeName() == 'Category'
        self._valCategories = set()
        self._edgeWeightCategories = set()

        # All three containers are keyed by the elements' chr attribute.
        self._numElements = OrderedDefaultDict(int)
        self._maxStrLens = OrderedDefaultDict(partial(self._initMaxStrLens, self._getMaxStrLensKeys()))
        self._maxNumEdges = OrderedDefaultDict(int)

        # The statistics pass is deferred until a statistics getter needs it.
        self._hasCalculatedStats = False

    def _decorateGESource(self, geSource):
        """Return geSource wrapped in a GEDependentAttributesHolder."""
        return GEDependentAttributesHolder(geSource)

    def _getMaxStrLensKeys(self):
        """Return the prefixes for which maximum string lengths are tracked.

        'val' and 'weights' are included only when present with data type 'S';
        'id' and 'edges' whenever present; plus every prefix that is not in
        RESERVED_PREFIXES.
        """
        prefixSet = set(self._geSource.getPrefixList())

        return (['val'] if 'val' in prefixSet and self._geSource.getValDataType() == 'S' else []) + \
               (['id'] if 'id' in prefixSet else []) + \
               (['edges'] if 'edges' in prefixSet else []) + \
               (['weights'] if 'weights' in prefixSet and self._geSource.getEdgeWeightDataType() == 'S' else []) + \
               [x for x in prefixSet if x not in RESERVED_PREFIXES]

    @staticmethod
    def _initMaxStrLens(keys):
        """Return a new dict mapping every key in keys to 0."""
        return dict((key, 0) for key in keys)

    def _calcStatisticsInExtraPass(self):
        """Collect all statistics in one pass over the source (at most once).

        Warnings from the source are switched off during the pass. The previous
        setting is restored in a finally clause, so it is also restored when the
        pass raises (e.g. the NotImplementedError for slice sources).

        Raises NotImplementedError for a slice source that has any prefix
        requiring string length calculation.
        """
        if self._hasCalculatedStats:
            return

        prevPrintWarnings = self._geSource.getPrintWarnings()
        self._geSource.setPrintWarnings(False)
        try:
            if self._geSource.isSliceSource():
                self._countElementsInSlices()
            else:
                self._collectStatsFromElements()
        finally:
            self._geSource.setPrintWarnings(prevPrintWarnings)

        # Only a completed pass marks the statistics as calculated.
        self._hasCalculatedStats = True

    def _countElementsInSlices(self):
        """Add up element counts per chromosome for a slice-based source."""
        if len(self._getMaxStrLensKeys()):
            raise NotImplementedError('Dimension calculation not yet implemented for slice-based GenomeElementSources.')

        prefixList = self._geSource.getPrefixList()
        for el in self._geSource:
            # The length of the first prefix's content is taken as the number
            # of elements in the slice.
            self._numElements[el.chr] += len(getattr(el, prefixList[0]))

    def _collectStatsFromElements(self):
        """Update counts, categories, string lengths and edge counts per element."""
        for el in self._geSource:
            chrName = el.chr
            self._numElements[chrName] += 1

            # Blank elements are counted but contribute no other statistics.
            if el.isBlankElement:
                continue

            if self._areValsCategorical:
                self._valCategories.add(el.val)

            if self._areEdgeWeightsCategorical:
                self._edgeWeightCategories |= set(el.weights)

            for prefix in self._maxStrLens[chrName]:
                content = getattr(el, prefix, None)
                if content is None:
                    continue

                # Lengths never go below 1, even for empty content.
                if isinstance(content, basestring):
                    contentLen = max(1, len(content))
                else:
                    contentLen = max([1] + [len(x) for x in flatten(content)])

                self._maxStrLens[chrName][prefix] = \
                    max(self._maxStrLens[chrName][prefix], contentLen)

                if prefix == 'edges':
                    self._maxNumEdges[chrName] = max(self._maxNumEdges[chrName], len(el.edges))

    def getGESource(self):
        """Return the decorated genome element source."""
        return self._geSource

    def getBoundingRegionTuples(self):
        """Return bounding region tuples, synthesising them when the source has none.

        Tuples from the source whose region has chr None are dropped. If none
        remain, one tuple per chromosome is built, covering 0 to the chromosome
        length and carrying that chromosome's element count. Also records
        whether the returned tuples came from the source (True) or were
        synthesised (False), as reported by boundingRegionsAndGEsCorrespond().
        """
        boundingRegionTuples = [brTuple for brTuple in self._getBoundingRegionTuples()
                                if brTuple.region.chr is not None]

        if len(boundingRegionTuples) == 0:
            from gold.origdata.GenomeElementSource import BoundingRegionTuple
            from gold.track.GenomeRegion import GenomeRegion
            from quick.util.GenomeInfo import GenomeInfo

            geChrList = self.getAllChrs()
            boundingRegionTuples = [
                BoundingRegionTuple(
                    GenomeRegion(chr=chrName, start=0,
                                 end=GenomeInfo.getChrLen(self._geSource.genome, chrName)),
                    self.getNumElementsForChr(chrName))
                for chrName in geChrList]
            self._boundingRegionsAndGEsCorrespond = False
        else:
            self._boundingRegionsAndGEsCorrespond = True

        return boundingRegionTuples

    def _getBoundingRegionTuples(self):
        """Return the bounding region tuples exactly as the source reports them."""
        return self._geSource.getBoundingRegionTuples()

    def boundingRegionsAndGEsCorrespond(self):
        """Return the flag set by getBoundingRegionTuples(); asserts it was called."""
        assert self._boundingRegionsAndGEsCorrespond is not None
        return self._boundingRegionsAndGEsCorrespond

    def getPrefixList(self):
        """Delegate to the source's getPrefixList()."""
        return self._geSource.getPrefixList()

    def getValDataType(self):
        """Delegate to the source's getValDataType()."""
        return self._geSource.getValDataType()

    def getValDim(self):
        """Delegate to the source's getValDim()."""
        return self._geSource.getValDim()

    def getEdgeWeightDataType(self):
        """Delegate to the source's getEdgeWeightDataType()."""
        return self._geSource.getEdgeWeightDataType()

    def getEdgeWeightDim(self):
        """Delegate to the source's getEdgeWeightDim()."""
        return self._geSource.getEdgeWeightDim()

    def isSorted(self):
        """Delegate to the source's isSorted()."""
        return self._geSource.isSorted()

    def getAllChrs(self):
        """Return the chromosomes for which elements have been counted."""
        self._calcStatisticsInExtraPass()
        return self._numElements.keys()

    def getNumElements(self):
        """Return the total element count over all chromosomes."""
        self._calcStatisticsInExtraPass()
        return sum(self._numElements.values())

    def getNumElementsForChr(self, chr):
        """Return the element count for chr."""
        self._calcStatisticsInExtraPass()
        return self._numElements[chr]

    def getValCategories(self):
        """Return the set of values seen (filled only when values are categorical)."""
        self._calcStatisticsInExtraPass()
        return self._valCategories

    def getEdgeWeightCategories(self):
        """Return the set of edge weights seen (filled only when weights are categorical)."""
        self._calcStatisticsInExtraPass()
        return self._edgeWeightCategories

    def getMaxNumEdges(self):
        """Return the largest per-chromosome maximum edge count.

        Raises ValueError (from max()) when no edge counts have been recorded.
        """
        self._calcStatisticsInExtraPass()
        return max(self._maxNumEdges.values())

    def getMaxNumEdgesForChr(self, chr):
        """Return the maximum edge count recorded for chr."""
        self._calcStatisticsInExtraPass()
        return self._maxNumEdges[chr]

    def getMaxStrLens(self):
        """Return a dict of the per-prefix maximum string lengths over all chromosomes.

        Raises TypeError (from reduce()) when no string lengths have been recorded.
        """
        self._calcStatisticsInExtraPass()
        return reduce(lambda x, y: dict((key, max(x[key], y[key])) for key in x.keys()),
                      self._maxStrLens.values())

    def getMaxStrLensForChr(self, chr):
        """Return the per-prefix maximum string lengths recorded for chr."""
        self._calcStatisticsInExtraPass()
        return self._maxStrLens[chr]

    def getMaxChrStrLen(self):
        """Return the length of the longest chromosome name among the string length keys.

        Raises ValueError (from max()) when no string lengths have been recorded.
        """
        self._calcStatisticsInExtraPass()
        return max(len(chrName) for chrName in self._maxStrLens.keys())
 def __init__(self, statistic=None):
     """Create an empty spec, optionally holding a single statistic."""
     # A falsy statistic (including the default None) leaves the list empty.
     self._statClassList = []
     if statistic:
         self._statClassList.append(statistic)
     self._analysisOptionsDict = OrderedDefaultDict(list)
     self._analysisParts = []
class AnalysisSpec(object):
    """An analysis definition: statistic class(es) plus an ordered list of parts.

    Only supports a single stat, at least for now.
    Takes a MagicStatFactory, as this will resolve into either an unsplittable
    or a splittable statistic according to what's suited.
    Note: maybe MagicStatFactory should have a synonymous class name that would
    appear less intrusive in a setting like this?

    Options are kept twice: in order of addition in _analysisParts, and grouped
    by label key in _analysisOptionsDict.
    """
    #@takes(AnalysisSpec, MagicStatFactory)
    def __init__(self, statistic=None):
        """Create an empty spec; a truthy statistic becomes its only stat class."""
        self._statClassList = [statistic] if statistic else []
        self._analysisParts = []
        self._analysisOptionsDict = OrderedDefaultDict(list)

    def integrateParsedAnalysis(self, other):
        """Replace this spec's contents with copies of those of other.

        The option objects themselves are shared, but every container is
        copied, including the per-label option lists, so that adding options
        to one spec afterwards does not alter the other.
        """
        assert isinstance(other, AnalysisSpec)
        self._statClassList = copy(other._statClassList)
        self._analysisParts = copy(other._analysisParts)

        optionsDict = copy(other._analysisOptionsDict)
        # copy() is shallow: give this spec its own list for every label key.
        for labelKey in list(optionsDict.keys()):
            optionsDict[labelKey] = list(optionsDict[labelKey])
        self._analysisOptionsDict = optionsDict

    #@takes(str, str)
    def addParameter(self, paramName, paramValue):
        """Append the option '[paramName=paramValue]' to the spec."""
        self._appendAnalysisOption('[%s=%s]' % (paramName, paramValue))

    def _appendAnalysisOption(self, optionLine):
        """Parse optionLine into an AnalysisOption and register it in both containers."""
        analysisOption = AnalysisOption(optionLine)
        self._analysisParts.append(analysisOption)
        self._analysisOptionsDict[analysisOption.getLabelKey()].append(analysisOption)

    def _removeAnalysisOption(self, optionLabelKey):
        """Remove every option registered under optionLabelKey.

        Raises ShouldNotOccurError if no option is registered under that key.
        """
        for option in self._allAnalysisOptions([optionLabelKey], raiseIfEmpty=True):
            self._analysisParts.remove(option)
        # Mappings delete entries with del; they have no remove() method.
        del self._analysisOptionsDict[optionLabelKey]

    def _allAnalysisOptions(self, labelKeys=None, onlyWithLabelText=False, onlyActivated=False, raiseIfEmpty=False):
        """Yield the options registered under labelKeys (default: all keys).

        onlyWithLabelText skips options whose label text is ''.
        onlyActivated skips options that are not activated for this spec.
        With raiseIfEmpty, ShouldNotOccurError is raised once iteration is
        finished if nothing was yielded.
        """
        if labelKeys is None:
            labelKeys = self._analysisOptionsDict.keys()

        empty = True
        for key in labelKeys:
            # get() is used so that unknown keys are not added to the dict.
            options = self._analysisOptionsDict.get(key)
            if options is None:
                continue
            if onlyWithLabelText:
                options = [opt for opt in options if opt.getLabelText() != '']
            if onlyActivated:
                options = [opt for opt in options if opt.isActivated(self)]
            if options:
                empty = False
                for opt in options:
                    yield opt

        if empty and raiseIfEmpty:
            raise ShouldNotOccurError

    def getDefAfterChoices(self, filterByActivation=False):
        """Return the analysis definition text followed by ' -> ' and the stat class names.

        Options contribute getDefAfterChoice(); any other part contributes
        str(part). With filterByActivation, options that are not activated for
        this spec are left out.
        """
        pieces = []
        for part in self._analysisParts:
            if isinstance(part, AnalysisOption):
                if filterByActivation and not part.isActivated(self):
                    continue
                pieces.append(part.getDefAfterChoice())
            else:
                pieces.append(str(part))

        statNames = ','.join([statClass.__name__ for statClass in self._statClassList])
        return ''.join(pieces) + ' -> ' + statNames