예제 #1
0
 def __init__(self, dbName):
     """Store the database name, get a handle to that database, and build the cube service.

     dbName -- name of the MongoDB database, also passed on to CubeService.
     """
     self.dbName = dbName
     # A default-configured MongoClient is used; indexing it by name yields the database handle.
     self.db = MongoClient()[dbName]
     self.cubeService = CubeService(dbName)
예제 #2
0
파일: cubify.py 프로젝트: rtansf/cubify
 def __init__(self, dbName="cubify"):
     """Create the cube service and the cube set service for database ``dbName``.

     dbName -- database name handed to both services; defaults to "cubify".
     """
     self.cubeService = CubeService(dbName)
     self.cubeSetService = CubeSetService(dbName)
예제 #3
0
class CubeSetService:
    """Manage cube sets: a source cube together with its binned and aggregated cubes.

    Cube set documents are stored in the ``cubeset`` collection of the database
    named ``dbName``. All work on the cubes themselves is delegated to
    ``CubeService``.
    """

    def __init__(self, dbName):
        """Store ``dbName``, get a handle to that database, and create the CubeService."""
        self.dbName = dbName
        client = MongoClient()
        self.db = client[dbName]
        self.cubeService = CubeService(dbName)

    def __updateCubeSetProperty__(self, cubeSetName, update):
        """Apply the update document ``update`` to the cube set named ``cubeSetName``."""
        self.db['cubeset'].update_one({"name": cubeSetName}, update)

    def __rebinAndReaggregate__(self, cubeSet, sourceCube):
        """Re-bin ``sourceCube`` into the cube set's binned cube and rebuild its agg cubes.

        Shared by addRowsToSourceCube and removeRowsFromSourceCube. Does nothing
        when the cube set has no ``binnedCube`` entry.
        """
        if 'binnedCube' not in cubeSet:
            return

        binnedCubeName = cubeSet['binnedCube']
        # The binned cube is fetched before re-binning so its stored binnings can be reused.
        binnedCube = self.cubeService.getCube(binnedCubeName)
        self.cubeService.rebinCubeCustom(binnedCube['binnings'], sourceCube, binnedCubeName)

        # Every agg cube carries its own definition under 'agg'; replay them one by one.
        for aggCubeName in cubeSet.get('aggCubes') or []:
            aggCube = self.cubeService.getCube(aggCubeName)
            self.cubeService.aggregateCubeCustom(binnedCube, [aggCube['agg']])

    def getCubeSet(self, cubeSetName):
        """Return the cube set document named ``cubeSetName``, or None if there is none."""
        return self.db['cubeset'].find_one({"name": cubeSetName})

    def createCubeSet(self, owner, cubeSetName, csvFilePath, binnings=None, aggs=None):
        """Create a cube set from a CSV file, including binning and optional aggregation.

        owner       -- stored in the cube set document as 'owner'.
        cubeSetName -- unique name; also the prefix of the derived cube names.
        csvFilePath -- CSV file the source cube is created from.
        binnings    -- custom binnings; when None the default binCube is used.
        aggs        -- optional list of aggregation definitions, each with a 'name' key.

        Returns the cube set document that was inserted.
        Raises ValueError if a cube set with this name already exists.
        """
        if self.getCubeSet(cubeSetName) is not None:
            raise ValueError('Cube Set with ' + cubeSetName + ' already exists')

        sourceCubeName = cubeSetName + "_source"
        sourceCube = self.cubeService.createCubeFromCsv(csvFilePath, sourceCubeName)

        binnedCubeName = cubeSetName + "_binned"
        if binnings is not None:
            binnedCube = self.cubeService.binCubeCustom(binnings, sourceCube, binnedCubeName)
        else:
            binnedCube = self.cubeService.binCube(sourceCube, binnedCubeName)

        if aggs is not None:
            self.cubeService.aggregateCubeCustom(binnedCube, aggs)

        # Now save the cube set
        cubeSet = {
            'name': cubeSetName,
            'owner': owner,
            'csvFilePath': csvFilePath,
            'createdOn': datetime.utcnow(),
            'sourceCube': sourceCubeName,
            'binnedCube': binnedCubeName,
        }
        if aggs is not None:
            cubeSet['aggCubes'] = [binnedCubeName + "_" + agg['name'] for agg in aggs]

        self.db['cubeset'].insert_one(cubeSet)
        return cubeSet

    def addRowsToSourceCube(self, cubeSet, csvFilePath):
        """Append the rows of ``csvFilePath`` to the source cube, then re-bin and re-aggregate.

        Does nothing when ``cubeSet`` is None.
        """
        if cubeSet is None:
            return

        existingSourceCube = self.cubeService.getCube(cubeSet['sourceCube'])
        self.cubeService.appendToCubeFromCsv(csvFilePath, existingSourceCube)
        self.__rebinAndReaggregate__(cubeSet, existingSourceCube)

    def removeRowsFromSourceCube(self, cubeSet, filter):
        """Delete the source cube rows matching ``filter``, then re-bin and re-aggregate.

        Does nothing when ``cubeSet`` is None.
        """
        if cubeSet is None:
            return

        self.cubeService.deleteCubeRows(cubeSet['sourceCube'], filter)
        # Fetch the source cube after the delete so re-binning sees the new state.
        existingSourceCube = self.cubeService.getCube(cubeSet['sourceCube'])
        self.__rebinAndReaggregate__(cubeSet, existingSourceCube)

    def deleteCubeSet(self, cubeSetName):
        """Delete the cube set named ``cubeSetName`` and its source, binned and agg cubes.

        Does nothing when no such cube set exists.
        """
        existing = self.getCubeSet(cubeSetName)
        if existing is None:
            return

        # Delete source, binned and agg cubes
        self.cubeService.deleteCube(existing['sourceCube'])
        if 'binnedCube' in existing:
            self.cubeService.deleteCube(existing['binnedCube'])
            for aggCubeName in existing.get('aggCubes') or []:
                self.cubeService.deleteCube(aggCubeName)

        # Collection.remove() no longer exists in PyMongo 4; delete_many keeps
        # its "delete every matching document" behaviour.
        self.db['cubeset'].delete_many({"name": cubeSetName})

    def getSourceCubeRows(self, cubeSet):
        """Return the rows of the source cube (whatever getCubeRowsForCube yields).

        Returns an empty list when ``cubeSet`` is None.
        """
        if cubeSet is None:
            return []

        return self.cubeService.getCubeRowsForCube(cubeSet['sourceCube'])

    def getBinnedCubeRows(self, cubeSet):
        """Return the rows of the binned cube, re-reading the cube set first.

        Returns an empty list when ``cubeSet`` is None, when the cube set no
        longer exists, or when it has no binned cube.
        """
        if cubeSet is None:
            return []

        # Refresh the cube set; it may have been deleted since the caller read it.
        cubeSet = self.getCubeSet(cubeSet['name'])
        if cubeSet is None:
            return []

        binnedCubeName = cubeSet.get('binnedCube')
        if binnedCubeName is None:
            return []
        return self.cubeService.getCubeRowsForCube(binnedCubeName)

    def getAggregatedCubeRows(self, cubeSet, aggName):
        """Return the rows of one aggregated cube, or of all of them.

        aggName -- 'ALL' collects the rows of every agg cube into one list;
                   otherwise the agg cube named '<binnedCube>_<aggName>' is used.

        Returns an empty list when ``cubeSet`` is None, when the cube set no
        longer exists, when it has no agg cubes, or when no agg cube matches.
        """
        if cubeSet is None:
            return []

        # Refresh the cube set; it may have been deleted since the caller read it.
        cubeSet = self.getCubeSet(cubeSet['name'])
        if cubeSet is None:
            return []

        aggCubeNames = cubeSet.get('aggCubes') or []
        if not aggCubeNames:
            return []

        if aggName == 'ALL':
            rows = []
            for aggCubeName in aggCubeNames:
                rows.extend(self.cubeService.getCubeRowsForCube(aggCubeName))
            return rows

        wantedName = cubeSet['binnedCube'] + "_" + aggName
        if wantedName in aggCubeNames:
            return self.cubeService.getCubeRowsForCube(wantedName)
        return []

    def exportSourceCubeToCsv(self, cubeSet, csvFilePath):
        """Export the source cube to ``csvFilePath``. Does nothing when ``cubeSet`` is None."""
        if cubeSet is None:
            return

        sourceCube = self.cubeService.getCube(cubeSet['sourceCube'])
        self.cubeService.exportCubeToCsv(sourceCube, csvFilePath)

    def exportBinnedCubeToCsv(self, cubeSet, csvFilePath):
        """Export the binned cube to ``csvFilePath``. Does nothing when ``cubeSet`` is None."""
        if cubeSet is None:
            return

        binnedCube = self.cubeService.getCube(cubeSet['binnedCube'])
        self.cubeService.exportCubeToCsv(binnedCube, csvFilePath)

    def exportAggCubesToCsv(self, cubeSet, directoryPath):
        """Export every agg cube to '<directoryPath>/<cubeSetName>_agg_<aggCubeName>.csv'.

        The cube set is re-read first. Does nothing when ``cubeSet`` is None,
        when the cube set no longer exists, or when it has no agg cubes.
        """
        if cubeSet is None:
            return

        # Refresh the cube set; it may have been deleted since the caller read it.
        cubeSet = self.getCubeSet(cubeSet['name'])
        if cubeSet is None:
            return

        for aggCubeName in cubeSet.get('aggCubes') or []:
            aggCube = self.cubeService.getCube(aggCubeName)
            csvFilePath = directoryPath + "/" + cubeSet['name'] + "_agg_" + aggCube['name'] + '.csv'
            self.cubeService.exportCubeToCsv(aggCube, csvFilePath)

    def exportToCsv(self, cubeSet, directoryPath):
        """Export the source, binned and agg cubes of the cube set into ``directoryPath``.

        Files are named '<cubeSetName>_source.csv', '<cubeSetName>_binned.csv'
        and '<cubeSetName>_agg_<aggCubeName>.csv'. The cube set is re-read
        first. Does nothing when ``cubeSet`` is None or no longer exists.
        """
        if cubeSet is None:
            return
        cubeSetName = cubeSet['name']
        cubeSet = self.getCubeSet(cubeSetName)  # Refresh
        if cubeSet is None:
            return

        self.exportSourceCubeToCsv(cubeSet, directoryPath + "/" + cubeSetName + "_source" + ".csv")
        # The binned file must contain the binned cube, not a second copy of the source cube.
        if cubeSet.get('binnedCube') is not None:
            self.exportBinnedCubeToCsv(cubeSet, directoryPath + "/" + cubeSetName + "_binned" + ".csv")
        # A cube set without aggregations simply has no agg files.
        for aggCubeName in cubeSet.get('aggCubes') or []:
            aggCube = self.cubeService.getCube(aggCubeName)
            self.cubeService.exportCubeToCsv(aggCube, directoryPath + "/" + cubeSetName + "_agg_" + aggCube['name'] + '.csv')

    def performBinning(self, cubeSet, binnings):
        """Bin (or re-bin) the source cube of ``cubeSet`` and return the binned cube.

        binnings -- custom binnings; when None the default binning is used.

        When the cube set already names a binned cube it is re-binned in place.
        Otherwise '<cubeSetName>_binned' is created and recorded both in the
        stored cube set and in the ``cubeSet`` dict passed in.
        Returns None when ``cubeSet`` is None.
        """
        if cubeSet is None:
            return None

        sourceCube = self.cubeService.getCube(cubeSet['sourceCube'])

        binnedCubeName = cubeSet.get('binnedCube')
        if binnedCubeName is not None:
            # Re-binning an existing binned cube
            if binnings is not None:
                self.cubeService.rebinCubeCustom(binnings, sourceCube, binnedCubeName)
            else:
                self.cubeService.rebinCube(sourceCube, binnedCubeName)
        else:
            cubeSetName = cubeSet['name']
            binnedCubeName = cubeSetName + "_binned"
            if binnings is not None:
                self.cubeService.binCubeCustom(binnings, sourceCube, binnedCubeName)
            else:
                self.cubeService.binCube(sourceCube, binnedCubeName, [])
            self.__updateCubeSetProperty__(cubeSetName, {"$set": {"binnedCube": binnedCubeName}})
            # Keep the caller's copy in step, as performAggregation does for 'aggCubes'.
            cubeSet['binnedCube'] = binnedCubeName

        return self.cubeService.getCube(binnedCubeName)

    def __getGroupByDimensionsList__(self, dimensions):
        """Return the successively shorter prefixes of ``dimensions``, longest first.

        For ['d1', 'd2', 'd3'] this returns [['d1', 'd2', 'd3'], ['d1', 'd2'], ['d1']].
        An empty input yields [[]] (one empty group-by list), as it always has.
        """
        dims = list(dimensions)
        if not dims:
            return [[]]
        return [dims[:count] for count in range(len(dims), 0, -1)]

    def performAggregation(self, cubeSet, dimensions):
        """Aggregate the binned cube once per prefix of the ordered ``dimensions`` list.

        For ['d1', 'd2', 'd3'] the group-by lists passed to aggregateCubeComplex
        are ['d1', 'd2', 'd3'], ['d1', 'd2'] and ['d1']. The resulting agg cube
        names replace 'aggCubes' in the stored cube set and in ``cubeSet``.

        Returns the agg cubes, or an empty list when ``cubeSet`` is None.
        """
        if cubeSet is None:
            return []

        binnedCube = self.cubeService.getCube(cubeSet['binnedCube'])
        groupByDimensionsList = self.__getGroupByDimensionsList__(dimensions)
        aggCubes = self.cubeService.aggregateCubeComplex(binnedCube, groupByDimensionsList)

        aggCubeNames = [aggCube['name'] for aggCube in aggCubes]
        cubeSet['aggCubes'] = aggCubeNames
        self.__updateCubeSetProperty__(cubeSet['name'], {"$set": {"aggCubes": aggCubeNames}})

        return aggCubes

    def performAggregationCustom(self, cubeSet, aggs):
        """Run the custom aggregations ``aggs`` on the binned cube.

        aggs -- list of aggregation definitions, each with a 'name' key. The agg
                cube names recorded are '<binnedCube>_<agg name>'; they replace
                'aggCubes' in the stored cube set and in ``cubeSet``.

        Returns what aggregateCubeCustom returns, or an empty list when
        ``cubeSet`` is None.
        """
        if cubeSet is None:
            return []

        binnedCubeName = cubeSet['binnedCube']
        binnedCube = self.cubeService.getCube(binnedCubeName)
        aggCubes = self.cubeService.aggregateCubeCustom(binnedCube, aggs)

        aggCubeNames = [binnedCubeName + "_" + agg['name'] for agg in aggs]
        cubeSet['aggCubes'] = aggCubeNames
        self.__updateCubeSetProperty__(cubeSet['name'], {"$set": {"aggCubes": aggCubeNames}})

        return aggCubes
예제 #4
0
파일: cubify.py 프로젝트: rtansf/cubify
class Cubify:
    """Facade over CubeService and CubeSetService: every method forwards to one of them."""

    def __init__(self, dbName="cubify"):
        """Create both services for the database ``dbName`` (default "cubify")."""
        self.cubeService = CubeService(dbName)
        self.cubeSetService = CubeSetService(dbName)

    ### Cubes

    def createCubeFromCsv(self, csvFilePath, cubeName, inMemory=False):
        """Forward to CubeService.createCubeFromCsv and return its result."""
        return self.cubeService.createCubeFromCsv(csvFilePath, cubeName, inMemory)

    def createCubeFromCube(self, fromCube, filter, toCubeName):
        """Forward to CubeService.createCubeFromCube and return its result."""
        return self.cubeService.createCubeFromCube(fromCube, filter, toCubeName)

    def deleteCube(self, cubeName):
        """Forward to CubeService.deleteCube."""
        self.cubeService.deleteCube(cubeName)

    def getCube(self, cubeName):
        """Forward to CubeService.getCube and return its result."""
        return self.cubeService.getCube(cubeName)

    def queryCubeRows(self, cube, filter):
        """Forward to CubeService.queryCubeRows and return its result."""
        return self.cubeService.queryCubeRows(cube, filter)

    def getCubeRows(self, cube):
        """Forward to CubeService.getCubeRows and return its result."""
        return self.cubeService.getCubeRows(cube)

    def exportCubeToCsv(self, cube, csvFilePath):
        """Forward to CubeService.exportCubeToCsv and return its result."""
        return self.cubeService.exportCubeToCsv(cube, csvFilePath)

    def addColumn(self, cube, newColumnName, type, expression=None, func=None):
        """Forward to CubeService.addColumn."""
        self.cubeService.addColumn(cube, newColumnName, type, expression, func)

    def binCube(self, sourceCube, binnedCubeName, toBeBinned=None, hints=None):
        """Forward to CubeService.binCube and return its result.

        hints -- when omitted, a fresh empty dict is passed on, so calls never
                 share one default dict.
        """
        if hints is None:
            hints = {}
        return self.cubeService.binCube(sourceCube, binnedCubeName, toBeBinned, hints)

    def rebinCube(self, sourceCube, binnedCubeName):
        """Forward to CubeService.rebinCube and return its result."""
        # Must go through cubeService; calling self.rebinCube here would recurse forever.
        return self.cubeService.rebinCube(sourceCube, binnedCubeName)

    def binCubeCustom(self, binnings, sourceCube, binnedCubeName):
        """Forward to CubeService.binCubeCustom and return its result."""
        return self.cubeService.binCubeCustom(binnings, sourceCube, binnedCubeName)

    def rebinCubeCustom(self, binnings, sourceCube, binnedCubeName):
        """Forward to CubeService.rebinCubeCustom and return its result."""
        return self.cubeService.rebinCubeCustom(binnings, sourceCube, binnedCubeName)

    def aggregateCube(self, cube, groupByDimensions, measures=None):
        """Forward to CubeService.aggregateCube and return its result."""
        return self.cubeService.aggregateCube(cube, groupByDimensions, measures)

    def aggregateCubeCustom(self, cube, aggs):
        """Forward to CubeService.aggregateCubeCustom and return its result."""
        return self.cubeService.aggregateCubeCustom(cube, aggs)

    ####  CubeSets

    def createCubeSet(self, owner, cubeSetName, csvFilePath, binnings=None, aggs=None):
        """Forward to CubeSetService.createCubeSet and return its result."""
        return self.cubeSetService.createCubeSet(owner, cubeSetName, csvFilePath, binnings, aggs)

    def deleteCubeSet(self, cubeSetName):
        """Forward to CubeSetService.deleteCubeSet and return its result."""
        return self.cubeSetService.deleteCubeSet(cubeSetName)

    def getCubeSet(self, cubeSetName):
        """Forward to CubeSetService.getCubeSet and return its result."""
        return self.cubeSetService.getCubeSet(cubeSetName)

    def addRowsToSourceCube(self, cubeSet, csvFilePath):
        """Forward to CubeSetService.addRowsToSourceCube and return its result."""
        return self.cubeSetService.addRowsToSourceCube(cubeSet, csvFilePath)

    def removeRowsFromSourceCube(self, cubeSet, filter):
        """Forward to CubeSetService.removeRowsFromSourceCube and return its result."""
        return self.cubeSetService.removeRowsFromSourceCube(cubeSet, filter)

    def performBinning(self, cubeSet, binnings):
        """Forward to CubeSetService.performBinning and return its result."""
        return self.cubeSetService.performBinning(cubeSet, binnings)

    def performAggregation(self, cubeSetName, dimensions):
        """Forward to CubeSetService.performAggregation and return its result.

        NOTE: despite its name, ``cubeSetName`` is passed through as the
        service's ``cubeSet`` argument, which the service indexes like a cube
        set document. The parameter name is kept for keyword-argument callers.
        """
        return self.cubeSetService.performAggregation(cubeSetName, dimensions)

    def performAggregationCustom(self, cubeSetName, aggs):
        """Forward to CubeSetService.performAggregationCustom and return its result.

        NOTE: despite its name, ``cubeSetName`` is passed through as the
        service's ``cubeSet`` argument, which the service indexes like a cube
        set document. The parameter name is kept for keyword-argument callers.
        """
        return self.cubeSetService.performAggregationCustom(cubeSetName, aggs)

    def getSourceCubeRows(self, cubeSet):
        """Forward to CubeSetService.getSourceCubeRows and return its result."""
        return self.cubeSetService.getSourceCubeRows(cubeSet)

    def getBinnedCubeRows(self, cubeSet):
        """Forward to CubeSetService.getBinnedCubeRows and return its result."""
        return self.cubeSetService.getBinnedCubeRows(cubeSet)

    def getAggregatedCubeRows(self, cubeSet, aggName):
        """Forward to CubeSetService.getAggregatedCubeRows and return its result."""
        return self.cubeSetService.getAggregatedCubeRows(cubeSet, aggName)

    def exportSourceCubeToCsv(self, cubeSet, csvFilePath):
        """Forward to CubeSetService.exportSourceCubeToCsv."""
        self.cubeSetService.exportSourceCubeToCsv(cubeSet, csvFilePath)

    def exportBinnedCubeToCsv(self, cubeSet, csvFilePath):
        """Forward to CubeSetService.exportBinnedCubeToCsv."""
        self.cubeSetService.exportBinnedCubeToCsv(cubeSet, csvFilePath)

    def exportAggCubesToCsv(self, cubeSet, directoryPath):
        """Forward to CubeSetService.exportAggCubesToCsv."""
        self.cubeSetService.exportAggCubesToCsv(cubeSet, directoryPath)

    def exportToCsv(self, cubeSet, directoryPath):
        """Forward to CubeSetService.exportToCsv."""
        self.cubeSetService.exportToCsv(cubeSet, directoryPath)