Example #1
class PythonApiTest(unittest.TestCase):
    def grouper(self, iterable, n, fillvalue=None):
        args = [iter(iterable)] * n
        return izip_longest(fillvalue=fillvalue, *args)

    def setUp(self):
        #Initialize all objects
        self.cos = CaffeOnSpark(sc)
        cmdargs = conf.get('spark.pythonargs')
        self.args = dict(self.grouper(cmdargs.split(), 2))
        self.cfg = Config(sc, self.args)
        self.train_source = DataSource(sc).getSource(self.cfg, True)
        self.validation_source = DataSource(sc).getSource(self.cfg, False)

    def testTrain(self):
        self.cos.train(self.train_source)
        self.assertTrue(
            os.path.isfile(self.args.get('-model').split(":")[1][3:]))
        result = self.cos.features(self.validation_source)
        self.assertTrue('accuracy' in result.columns)
        self.assertTrue('ip1' in result.columns)
        self.assertTrue('ip2' in result.columns)
        self.assertTrue(result.count() > 100)
        self.assertTrue(result.first()['SampleID'] == '00000000')
        result = self.cos.test(self.validation_source)
        self.assertTrue(result.get('accuracy') > 0.9)

    def testTrainWithValidation(self):
        result = self.cos.trainWithValidation(self.train_source,
                                              self.validation_source)
        self.assertEqual(len(result.columns), 2)
        self.assertEqual(result.columns[0], 'accuracy')
        self.assertEqual(result.columns[1], 'loss')
        result.show(2)

        row_count = result.count()
        last_row = result.rdd.zipWithIndex().filter(
            lambda (row, index): index == (row_count - 1)).collect()[0][0]
        finalAccuracy = last_row[0][0]
        self.assertTrue(finalAccuracy > 0.8)
        finalLoss = last_row[1][0]
        self.assertTrue(finalLoss < 0.5)
class PythonApiTest(unittest.TestCase):
    def grouper(self,iterable, n, fillvalue=None):
        args = [iter(iterable)] * n
        return izip_longest(fillvalue=fillvalue, *args)

    def setUp(self):
        #Initialize all objects
        self.cos=CaffeOnSpark(sc)
        cmdargs = conf.get('spark.pythonargs')
        self.args= dict(self.grouper(cmdargs.split(),2))
        self.cfg=Config(sc,self.args)
        self.train_source = DataSource(sc).getSource(self.cfg,True)
        self.validation_source = DataSource(sc).getSource(self.cfg,False)
        
    def testTrain(self):
        self.cos.train(self.train_source)
        self.assertTrue(os.path.isfile(self.args.get('-model').split(":")[1][3:]))
        result=self.cos.features(self.validation_source)
        self.assertTrue('accuracy' in result.columns)
        self.assertTrue('ip1' in result.columns)
        self.assertTrue('ip2' in result.columns)
        self.assertTrue(result.count() > 100)
        self.assertTrue(result.first()['SampleID'] == '00000000')
        result=self.cos.test(self.validation_source)
        self.assertTrue(result.get('accuracy') > 0.9)

    def testTrainWithValidation(self):
        result=self.cos.trainWithValidation(self.train_source, self.validation_source)
        self.assertEqual(len(result.columns), 2)
        self.assertEqual(result.columns[0], 'accuracy')
        self.assertEqual(result.columns[1], 'loss')
        result.show(2)

        row_count = result.count()
        last_row = result.rdd.zipWithIndex().filter(lambda (row,index): index==(row_count - 1)).collect()[0][0]
        finalAccuracy = last_row[0][0]
        self.assertTrue(finalAccuracy > 0.8)
        finalLoss = last_row[1][0]
        self.assertTrue(finalLoss < 0.5)
Example #3
class PythonApiTest(unittest.TestCase):
    def grouper(self, iterable, n, fillvalue=None):
        args = [iter(iterable)] * n
        return izip_longest(fillvalue=fillvalue, *args)

    def setUp(self):
        #Initialize all objects
        self.cos = CaffeOnSpark(sc, sqlContext)
        cmdargs = conf.get('spark.pythonargs')
        self.args = dict(self.grouper(cmdargs.split(), 2))
        self.cfg = Config(sc, self.args)
        self.train_source = DataSource(sc).getSource(self.cfg, True)
        self.validation_source = DataSource(sc).getSource(self.cfg, False)

    def testTrain(self):
        self.cos.train(self.train_source)
        self.assertTrue(
            os.path.isfile(self.args.get('-model').split(":")[1][3:]))
        result = self.cos.features(self.validation_source)
        self.assertTrue('accuracy' in result.columns)
        self.assertTrue('ip1' in result.columns)
        self.assertTrue('ip2' in result.columns)
        result = self.cos.test(self.validation_source)
        self.assertTrue(result.get('accuracy') > 0.9)

    def testTrainWithValidation(self):
        result = self.cos.trainWithValidation(self.train_source,
                                              self.validation_source)
        self.assertEqual(self.cfg.solverParameter.getTestIter(0), len(result))
        finalAccuracy = 0
        finalLoss = 0
        for i in range(self.cfg.solverParameter.getTestIter(0)):
            finalAccuracy += result[i][0]
            finalLoss += result[i][1]

        self.assertTrue(
            finalAccuracy / self.cfg.solverParameter.getTestIter(0) > 0.8)
        self.assertTrue(
            finalLoss / self.cfg.solverParameter.getTestIter(0) < 0.5)
    # --- Linear-regression evaluation fragment (Python 2 script) ---
    # NOTE(review): 'cfg', 'sc', 'cos', 'aerosolProtoFile' and
    # 'aerosolModelFile' are defined earlier in the enclosing script,
    # outside this excerpt.
    # Configure extraction of the 'ip1' layer as JSON on a single node.
    cfg.isFeature=True
    cfg.label='label'
    cfg.features=['ip1']
    cfg.outputFormat = 'json'
    cfg.clusterSize = 1
    cfg.lmdb_partitions=cfg.clusterSize

# Train the model on the training data source.
    dl_train_source = DataSource(sc).getSource(cfg,True)

    cos.train(dl_train_source)

# Extract features from the held-out (non-training) data source.

    lr_raw_source = DataSource(sc).getSource(cfg,False)
    extracted_df = cos.features(lr_raw_source)
    extracted_df.show(365)
    # Per row: (y, y^2, residual, residual^2, 1), residual = label - ip1 prediction.
    squaresdf = extracted_df.map(lambda p : (p.label[0] , p.label[0]*p.label[0] , (p.label[0] - p.ip1[0]) , (p.label[0] - p.ip1[0])*(p.label[0] - p.ip1[0]) , 1 ) )
    # Element-wise sums across all rows -> (sum_y, sum_y2, sum_res, sum_res2, n).
    squares = squaresdf.reduce(lambda a , b : (a[0]+b[0] , a[1]+b[1] , a[2]+b[2] , a[3]+b[3] , a[4]+b[4] ) )
    # Total sum of squares about the mean: sum_y2 - (sum_y)^2 / n.
    tss = float(squares[1]) - float(squares[0]*squares[0])/float(squares[4])
    # Residual sum of squares, centered by the mean residual.
    rss = float(squares[3]) - float(squares[2]*squares[2])/float(squares[4])
    r2 = 1-rss/tss
    print("Test set:")
    print("Total SS: " + str(tss))
    # NOTE(review): "Redidual" is a typo in the emitted string; left unchanged.
    print("Redidual SS: " + str(rss))
    print("R-Squared: " + str(r2))
    print str(squares)  # Python 2 print statement

    # Point the config at the next model to evaluate.
    cfg.protoFile = aerosolProtoFile
    cfg.modelPath = 'file:' + aerosolModelFile
Example #5
    # --- Linear-regression evaluation fragment (formatted variant) ---
    # NOTE(review): 'cfg', 'sc' and 'cos' are defined earlier in the
    # enclosing script, outside this excerpt.
    # Configure extraction of the 'ip1' layer as JSON on a single node.
    cfg.isFeature = True
    cfg.label = 'label'
    cfg.features = ['ip1']
    cfg.outputFormat = 'json'
    cfg.clusterSize = 1
    cfg.lmdb_partitions = cfg.clusterSize

    # Train the model on the training data source.
    dl_train_source = DataSource(sc).getSource(cfg, True)

    cos.train(dl_train_source)

    # Extract features from the held-out (non-training) data source.

    lr_raw_source = DataSource(sc).getSource(cfg, False)
    extracted_df = cos.features(lr_raw_source)
    extracted_df.show(365)
    # Per row: (y, y^2, residual, residual^2, 1), residual = label - ip1 prediction.
    squaresdf = extracted_df.map(lambda p: (p.label[0], p.label[0] * p.label[
        0], (p.label[0] - p.ip1[0]), (p.label[0] - p.ip1[0]) *
                                            (p.label[0] - p.ip1[0]), 1))
    # Element-wise sums across all rows -> (sum_y, sum_y2, sum_res, sum_res2, n).
    squares = squaresdf.reduce(lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] +
                                             b[2], a[3] + b[3], a[4] + b[4]))
    # Total sum of squares about the mean: sum_y2 - (sum_y)^2 / n.
    tss = float(
        squares[1]) - float(squares[0] * squares[0]) / float(squares[4])
    # Residual sum of squares, centered by the mean residual.
    rss = float(
        squares[3]) - float(squares[2] * squares[2]) / float(squares[4])
    r2 = 1 - rss / tss
    print("Test set:")
    print("Total SS: " + str(tss))
    # NOTE(review): "Redidual" is a typo in the emitted string; left unchanged.
    print("Redidual SS: " + str(rss))
    print("R-Squared: " + str(r2))