示例#1
0
    def testGetStats(self):
        cubeName = 'test-' + str(uuid.uuid4())
        shutil.copyfile('testdata.csv', cubeName + '.csv')
        cs = CubeService('testdb')
        cs.createCubeFromCsv(cubeName + '.csv', cubeName, cubeName)
        cubeCells = cs.getCubeCells(cubeName)
        stats = cs.getStats(cubeCells)
        for stat in stats:
            self.assertTrue(stat in ['Price', 'Qty'])
            if stat == 'Price':
                self.assertTrue(math.ceil(stats[stat]['std'] * 100) / 100 == 6.42)
                self.assertTrue(stats[stat]['min'] == 1.5)
                self.assertTrue(stats[stat]['max'] == 20.5)
                self.assertTrue(stats[stat]['median'] == 12.5)
                self.assertTrue(stats[stat]['total'] == 164.0)
                self.assertTrue(math.ceil(stats[stat]['mean'] * 100) / 100 == 11.72)
            elif stat == 'Qty':
                self.assertTrue(math.ceil(stats[stat]['std'] * 100) / 100 == 1.81)
                self.assertTrue(stats[stat]['min'] == 1.0)
                self.assertTrue(stats[stat]['max'] == 7.0)
                self.assertTrue(stats[stat]['median'] == 3.0)
                self.assertTrue(stats[stat]['total'] == 44.0)
                self.assertTrue(math.ceil(stats[stat]['mean'] * 100) / 100 == 3.15)

        os.remove(cubeName + '.csv')
示例#2
0
    def testBinning(self):
        cubeName = 'test-' + str(uuid.uuid4())
        shutil.copyfile('testdata.csv', cubeName + '.csv')
        cs = CubeService('testdb')
        cs.createCubeFromCsv(cubeName + '.csv', cubeName, cubeName)
        with open('test_binnings.json') as binnings_file:
            binnings = json.load(binnings_file)
        cs.binCube(binnings, cubeName, cubeName + '_b', cubeName + '_b')

        binnedCubeCells = cs.getCubeCells(cubeName + '_b')
        dimkeys = []
        for binnedCubeCell in binnedCubeCells:
            dimkeys.append(binnedCubeCell['dimensionKey'])
        dimkeys.sort()

        self.assertTrue(dimkeys[0] == '#CustomerId:C1#PriceBin:0-5#ProductId:P1#QtyBin:0-5#Region:West#State:CA#Year:Year2014#Date:2014-10-11')
        self.assertTrue(dimkeys[1] == '#CustomerId:C1#PriceBin:0-5#ProductId:P1#QtyBin:0-5#Region:West#State:CA#Year:Year2015#Date:2015-10-11')
        self.assertTrue(dimkeys[2] == '#CustomerId:C1#PriceBin:10+#ProductId:P1#QtyBin:0-5#Region:West#State:CA#Year:Year2014#Date:2014-10-10')
        self.assertTrue(dimkeys[3] == '#CustomerId:C1#PriceBin:10+#ProductId:P1#QtyBin:0-5#Region:West#State:CA#Year:Year2015#Date:2015-10-10')
        self.assertTrue(dimkeys[4] == '#CustomerId:C1#PriceBin:10+#ProductId:P2#QtyBin:0-5#Region:West#State:CA#Year:Year2014#Date:2014-10-11')
        self.assertTrue(dimkeys[5] == '#CustomerId:C1#PriceBin:10+#ProductId:P2#QtyBin:0-5#Region:West#State:CA#Year:Year2015#Date:2015-10-11')
        self.assertTrue(dimkeys[6] == '#CustomerId:C2#PriceBin:0-5#ProductId:P1#QtyBin:0-5#Region:NorthEast#State:NY#Year:Year2014#Date:2014-10-11')
        self.assertTrue(dimkeys[7] == '#CustomerId:C2#PriceBin:0-5#ProductId:P1#QtyBin:0-5#Region:NorthEast#State:NY#Year:Year2015#Date:2015-10-11')
        self.assertTrue(dimkeys[8] == '#CustomerId:C2#PriceBin:10+#ProductId:P1#QtyBin:0-5#Region:NorthEast#State:NY#Year:Year2014#Date:2014-10-10')
        self.assertTrue(dimkeys[9] == '#CustomerId:C2#PriceBin:10+#ProductId:P1#QtyBin:0-5#Region:NorthEast#State:NY#Year:Year2015#Date:2015-10-10')
        self.assertTrue(dimkeys[10] == '#CustomerId:C2#PriceBin:5-10#ProductId:P2#QtyBin:0-5#Region:NorthEast#State:NY#Year:Year2014#Date:2014-10-10')
        self.assertTrue(dimkeys[11] == '#CustomerId:C2#PriceBin:5-10#ProductId:P2#QtyBin:0-5#Region:NorthEast#State:NY#Year:Year2015#Date:2015-10-10')
        self.assertTrue(dimkeys[12] == '#CustomerId:C3#PriceBin:10+#ProductId:P1#QtyBin:5+#Region:NorthEast#State:MA#Year:Year2014#Date:2014-10-11')
        self.assertTrue(dimkeys[13] == '#CustomerId:C3#PriceBin:10+#ProductId:P1#QtyBin:5+#Region:NorthEast#State:MA#Year:Year2015#Date:2015-10-11')

        os.remove(cubeName + '.csv')
示例#3
0
    def testAggregation(self):
        cubeName = 'test-' + str(uuid.uuid4())
        shutil.copyfile('testdata.csv', cubeName + '.csv')
        cs = CubeService('testdb')
        cs.createCubeFromCsv(cubeName + '.csv',cubeName, cubeName)
        with open('test_binnings.json') as binnings_file:
            binnings = json.load(binnings_file)
        cs.binCube(binnings, cubeName, cubeName + '_b', cubeName + '_b')

        aggs = []
        with open('test_agg.json') as agg_file:
            aggs = json.load(agg_file)

        cs.aggregateCube(cubeName + '_b', aggs)

        aggCubeCells = cs.getCubeCells(cubeName + '_b_agg1')
        self.assertTrue (aggCubeCells.count() == 4)
        for aggCubeCell in aggCubeCells:
            self.assertTrue(len(aggCubeCell['dimensions']) == 2)
            print aggCubeCell

        print '---------'

        aggCubeCells = cs.getCubeCells(cubeName + '_b_agg2')
        self.assertTrue (aggCubeCells.count() == 2)
        print aggCubeCells.count()
        for aggCubeCell in aggCubeCells:
            self.assertTrue(len(aggCubeCell['dimensions']) == 1)
            print aggCubeCell

        print '---------'

        aggCubeCells = cs.getCubeCells(cubeName + '_b_agg3')
        self.assertTrue (aggCubeCells.count() == 2)
        print aggCubeCells.count()
        for aggCubeCell in aggCubeCells:
            self.assertTrue(len(aggCubeCell['dimensions']) == 1)
            print aggCubeCell

        print '---------'

        os.remove(cubeName + '.csv')
示例#4
0
class CubeSetService:

    def __init__(self, dbName):
        self.dbName = dbName
        client = MongoClient()
        self.db = client[dbName]
        self.cubeService = CubeService(dbName)

    #
    # Update an arbitrary field in cubeset
    #
    def __updateCubeSetProperty__(self, cubeSetName, update):
        self.db['cubeset'].update_one({ "name" : cubeSetName}, update)


    def getCubeSet(self, cubeSetName):
        return self.db['cubeset'].find_one({ "name": cubeSetName})


    def createCubeSet(self, owner, cubeSetName, cubeSetDisplayName, csvFileName, binnings, aggs):

        # Make sure cubeSetName is unique
        existing = self.getCubeSet(cubeSetName)
        if existing != None:
            raise ValueError('Cube Set with ' + cubeSetName + ' already exists')

        sourceCubeName = cubeSetName + "_source"
        self.cubeService.createCubeFromCsv(csvFileName, sourceCubeName, sourceCubeName)
        if binnings != None:
            binnedCubeName = cubeSetName + "_binned"
            self.cubeService.binCube(binnings, sourceCubeName, binnedCubeName, binnedCubeName)
            if aggs != None:
                self.cubeService.aggregateCube(binnedCubeName, aggs)
        
        # Now save the cubeSet
        cubeSet = {}
        cubeSet['name'] = cubeSetName
        cubeSet['displayName'] = cubeSetDisplayName
        cubeSet['owner'] = owner
        cubeSet['csvFileName'] = csvFileName
        cubeSet['createdOn'] = datetime.utcnow()
        cubeSet['sourceCube'] = sourceCubeName
        if binnings != None:
            cubeSet['binnedCube'] = binnedCubeName
            if aggs != None:
                aggCubeNames = []
                for agg in aggs:
                    aggCubeNames.append(binnedCubeName + "_" + agg['name'])
                cubeSet['aggCubes'] = aggCubeNames
    
        self.db['cubeset'].insert_one(cubeSet);        

    #
    # Add cells to source cube
    #
    def addCellsToSourceCube(self, cubeSetName, csvFileName):
        existing = self.getCubeSet(cubeSetName)
        if existing == None:
            raise ValueError('Cube Set with ' + cubeSetName + ' does not exist')

        self.cubeService.appendToCubeFromCsv(csvFileName, existing['sourceCube'])

        #re-bin
        if 'binnedCube' in existing:
            binnedCubeName = existing['binnedCube']
            binnedCube = self.cubeService.getCube(binnedCubeName)
            self.cubeService.rebinCube(binnedCube['binnings'], existing['sourceCube'], existing['binnedCube'])

            #re-aggregate
            if 'aggCubes' in existing:
                for aggCubeName in existing['aggCubes']:
                   aggCube = self.cubeService.getCube(aggCubeName)
                   aggs = []
                   aggs.append(aggCube['agg'])
                   self.cubeService.aggregateCube(binnedCubeName, aggs)

    #
    # Remove cells from source
    #
    def removeCellsFromSourceCube(self, cubeSetName, filter):
        existing = self.getCubeSet(cubeSetName)
        if existing == None:
            raise ValueError('Cube Set with ' + cubeSetName + ' does not exist')
        self.cubeService.deleteCubeCells(existing['sourceCube'], filter)

        #re-bin
        if 'binnedCube' in existing:
            binnedCubeName = existing['binnedCube']
            binnedCube = self.cubeService.getCube(binnedCubeName)
            self.cubeService.rebinCube(binnedCube['binnings'], existing['sourceCube'], existing['binnedCube'])

            #re-aggregate
            if 'aggCubes' in existing:
                for aggCubeName in existing['aggCubes']:
                   aggCube = self.cubeService.getCube(aggCubeName)
                   aggs = []
                   aggs.append(aggCube['agg'])
                   self.cubeService.aggregateCube(binnedCubeName, aggs)

    #
    # Update cubeset display name
    #
    def updateCubeSetDisplayName(self, cubeSetName, displayName):
        self.__updateCubeSetProperty__(cubeSetName, { "$set": {"displayName" : displayName}})


    def deleteCubeSet(self, cubeSetName):

        existing = self.getCubeSet(cubeSetName)
        if existing == None:
            raise ValueError('Cube Set with ' + cubeSetName + ' does not exist')

        # Delete source, binned and agg cubes
        self.cubeService.deleteCube(existing['sourceCube'])
        if 'binnedCube' in existing:
            self.cubeService.deleteCube(existing['binnedCube'])
            if 'aggCubes' in existing:
                for aggCube in existing['aggCubes']:
                    self.cubeService.deleteCube(aggCube)
                   
        # Delete cubeset
        self.db['cubeset'].remove({ "name": cubeSetName })


    def getSourceCubeCells(self, cubeSetName):
        existing = self.getCubeSet(cubeSetName)
        if existing == None:
            raise ValueError('Cube Set with ' + cubeSetName + ' does not exist')

        return self.cubeService.getCubeCells(existing['sourceCube'])

    def getBinnedCubeCells(self, cubeSetName):
        existing = self.getCubeSet(cubeSetName)
        if existing == None:
            raise ValueError('Cube Set with ' + cubeSetName + ' does not exist')

        if existing['binnedCube'] != None:
            return self.cubeService.getCubeCells(existing['binnedCube'])
        else:
            return None

    def getAggregatedCubeCells(self, cubeSetName, aggName):
        existing = self.getCubeSet(cubeSetName)
        if existing == None:
            raise ValueError('Cube Set with ' + cubeSetName + ' does not exist')

        if 'aggCubes' in existing:
            for aggCube in existing['aggCubes']:
               if existing['binnedCube'] + "_" + aggName == aggCube:
                   return self.cubeService.getCubeCells(aggCube)

        return None

    def performBinning(self, cubeSetName, binnings):
        existing = self.getCubeSet(cubeSetName)
        if existing == None:
            raise ValueError('Cube Set with ' + cubeSetName + ' does not exist')
        
        # Are we rebinning?
        if existing['binnedCube'] != None:
            self.cubeService.rebinCube(binnings, existing['sourceCube'], existing['binnedCube'])
        else: 
            binnedCubeName = cubeSetName + "_binned"
            self.cubeService.binCube(binnings, binnedCubeName, binnedCubeName)
            self.__updateCubeSetProperty__(cubeSetName, { "$set": {"binnedCube" : binnedCubeName}})

    def performAggregation(self, cubeSetName, aggs):
        existing = self.getCubeSet(cubeSetName)
        if existing == None:
            raise ValueError('Cube Set with ' + cubeSetName + ' does not exist')

        binnedCubeName = existing['binnedCube']
        self.cubeService.aggregateCube(binnedCubeName, aggs)

        aggCubeNames = []
        for agg in aggs:
              aggCubeNames.append(binnedCubeName + "_" + agg['name'])
 
        self.__updateCubeSetProperty__(cubeSetName, { "$set": {"aggCubes" : aggCubeNames}})