class PythonApiTest(unittest.TestCase):
    """End-to-end checks of the CaffeOnSpark Python API.

    The tests use the module-level ``sc`` and ``conf`` objects and take their
    options from the ``spark.pythonargs`` configuration entry.
    """

    def grouper(self, iterable, n, fillvalue=None):
        """Collect ``iterable`` into consecutive tuples of length ``n``.

        The same iterator object is repeated ``n`` times, so zipping the
        copies pulls ``n`` successive items into each tuple.  A short final
        tuple is padded with ``fillvalue``.
        """
        args = [iter(iterable)] * n
        return izip_longest(*args, fillvalue=fillvalue)

    def setUp(self):
        """Create the CaffeOnSpark handle, parsed options and data sources."""
        # Initialize all objects
        self.cos = CaffeOnSpark(sc)
        cmdargs = conf.get('spark.pythonargs')
        # Whitespace-separated tokens are paired up into a dict, so an option
        # such as '-model' maps to the token that follows it.
        self.args = dict(self.grouper(cmdargs.split(), 2))
        self.cfg = Config(sc, self.args)
        self.train_source = DataSource(sc).getSource(self.cfg, True)
        self.validation_source = DataSource(sc).getSource(self.cfg, False)

    def testTrain(self):
        """Train, then check the model file, extracted features and accuracy."""
        self.cos.train(self.train_source)
        # The text after the first ':' of the '-model' option, minus its first
        # three characters, is treated as a local file path.
        # NOTE(review): presumably this drops a leading '///' -- confirm
        # against the argument format used by the launcher.
        model_path = self.args.get('-model').split(":")[1][3:]
        self.assertTrue(os.path.isfile(model_path))

        result = self.cos.features(self.validation_source)
        # assertIn/assertGreater report the offending values on failure,
        # unlike assertTrue on a pre-evaluated boolean.
        for column in ('accuracy', 'ip1', 'ip2'):
            self.assertIn(column, result.columns)
        self.assertGreater(result.count(), 100)
        self.assertEqual(result.first()['SampleID'], '00000000')

        result = self.cos.test(self.validation_source)
        self.assertGreater(result.get('accuracy'), 0.9)

    def testTrainWithValidation(self):
        """Train with interleaved validation and check the final metrics."""
        result = self.cos.trainWithValidation(self.train_source,
                                              self.validation_source)
        self.assertEqual(len(result.columns), 2)
        self.assertEqual(result.columns[0], 'accuracy')
        self.assertEqual(result.columns[1], 'loss')
        result.show(2)

        row_count = result.count()
        # zipWithIndex yields (row, index) pairs.  The pair is indexed
        # explicitly because tuple parameters in a lambda signature are a
        # syntax error on Python 3.
        last_row = result.rdd.zipWithIndex().filter(
            lambda pair: pair[1] == row_count - 1).collect()[0][0]

        finalAccuracy = last_row[0][0]
        self.assertGreater(finalAccuracy, 0.8)
        finalLoss = last_row[1][0]
        self.assertLess(finalLoss, 0.5)
class PythonApiTest(unittest.TestCase):
    """Integration tests driving CaffeOnSpark through its Python wrapper.

    Relies on the module-level ``sc`` and ``conf`` objects; test options are
    read from the ``spark.pythonargs`` configuration entry.
    """

    def grouper(self, iterable, n, fillvalue=None):
        """Return an iterator of ``n``-tuples taken in order from ``iterable``.

        All ``n`` positions share one underlying iterator, so each tuple
        consumes ``n`` successive items.  A trailing incomplete tuple is
        padded with ``fillvalue``.
        """
        shared = [iter(iterable)] * n
        return izip_longest(*shared, fillvalue=fillvalue)

    def setUp(self):
        """Prepare the objects shared by every test."""
        # Initialize all objects
        self.cos = CaffeOnSpark(sc)
        cmdargs = conf.get('spark.pythonargs')
        # Pair consecutive tokens of the option string into dict entries.
        self.args = dict(self.grouper(cmdargs.split(), 2))
        self.cfg = Config(sc, self.args)
        self.train_source = DataSource(sc).getSource(self.cfg, True)
        self.validation_source = DataSource(sc).getSource(self.cfg, False)

    def testTrain(self):
        """Run training and verify the model file, features and test accuracy."""
        self.cos.train(self.train_source)
        # Path = text after the first ':' of the '-model' value with its first
        # three characters removed.
        # NOTE(review): looks like it strips '///' from a file URI -- confirm.
        self.assertTrue(
            os.path.isfile(self.args.get('-model').split(":")[1][3:]))

        result = self.cos.features(self.validation_source)
        # Dedicated assertions show the compared values when they fail.
        self.assertIn('accuracy', result.columns)
        self.assertIn('ip1', result.columns)
        self.assertIn('ip2', result.columns)
        self.assertGreater(result.count(), 100)
        self.assertEqual(result.first()['SampleID'], '00000000')

        result = self.cos.test(self.validation_source)
        self.assertGreater(result.get('accuracy'), 0.9)

    def testTrainWithValidation(self):
        """Check the schema and last-row metrics of trainWithValidation."""
        result = self.cos.trainWithValidation(self.train_source,
                                              self.validation_source)
        self.assertEqual(len(result.columns), 2)
        self.assertEqual(result.columns[0], 'accuracy')
        self.assertEqual(result.columns[1], 'loss')
        result.show(2)

        row_count = result.count()
        last_index = row_count - 1
        # Each element produced by zipWithIndex is a (row, index) pair.  A
        # single-argument lambda is used because unpacking a tuple in the
        # lambda signature does not parse on Python 3.
        indexed_rows = result.rdd.zipWithIndex()
        matches = indexed_rows.filter(
            lambda row_and_index: row_and_index[1] == last_index).collect()
        last_row = matches[0][0]

        finalAccuracy = last_row[0][0]
        self.assertGreater(finalAccuracy, 0.8)
        finalLoss = last_row[1][0]
        self.assertLess(finalLoss, 0.5)
class PythonApiTest(unittest.TestCase):
    """End-to-end checks of the CaffeOnSpark Python API.

    Uses the module-level ``sc``, ``sqlContext`` and ``conf`` objects; test
    options come from the ``spark.pythonargs`` configuration entry.
    """

    def grouper(self, iterable, n, fillvalue=None):
        """Collect ``iterable`` into consecutive tuples of length ``n``.

        One iterator is shared by all ``n`` positions, so each tuple takes
        ``n`` successive items.  The last tuple is padded with ``fillvalue``
        when the input runs out early.
        """
        args = [iter(iterable)] * n
        return izip_longest(*args, fillvalue=fillvalue)

    def setUp(self):
        """Create the CaffeOnSpark handle, parsed options and data sources."""
        # Initialize all objects
        self.cos = CaffeOnSpark(sc, sqlContext)
        cmdargs = conf.get('spark.pythonargs')
        # Consecutive whitespace-separated tokens become key/value entries.
        self.args = dict(self.grouper(cmdargs.split(), 2))
        self.cfg = Config(sc, self.args)
        self.train_source = DataSource(sc).getSource(self.cfg, True)
        self.validation_source = DataSource(sc).getSource(self.cfg, False)

    def testTrain(self):
        """Train, then check the model file, feature columns and accuracy."""
        self.cos.train(self.train_source)
        # The text after the first ':' of the '-model' option, minus its first
        # three characters, is treated as a local file path.
        # NOTE(review): presumably strips a leading '///' -- confirm.
        model_path = self.args.get('-model').split(":")[1][3:]
        self.assertTrue(os.path.isfile(model_path))

        result = self.cos.features(self.validation_source)
        # assertIn names the missing column on failure.
        for column in ('accuracy', 'ip1', 'ip2'):
            self.assertIn(column, result.columns)

        result = self.cos.test(self.validation_source)
        self.assertGreater(result.get('accuracy'), 0.9)

    def testTrainWithValidation(self):
        """Check the mean accuracy and loss returned by trainWithValidation.

        The result is indexed as ``result[i][0]`` (accuracy) and
        ``result[i][1]`` (loss) and must contain one entry per test iteration
        of the solver.
        """
        result = self.cos.trainWithValidation(self.train_source,
                                              self.validation_source)
        # Fetch the iteration count once instead of on every loop pass and in
        # each assertion.
        test_iter = self.cfg.solverParameter.getTestIter(0)
        self.assertEqual(test_iter, len(result))

        finalAccuracy = 0
        finalLoss = 0
        for i in range(test_iter):
            finalAccuracy += result[i][0]
            finalLoss += result[i][1]

        self.assertGreater(finalAccuracy / test_iter, 0.8)
        self.assertLess(finalLoss / test_iter, 0.5)
# Configure feature extraction: emit the 'ip1' blob next to the 'label' column.
cfg.isFeature = True
cfg.label = 'label'
cfg.features = ['ip1']
cfg.outputFormat = 'json'
cfg.clusterSize = 1
cfg.lmdb_partitions = cfg.clusterSize

# Train
dl_train_source = DataSource(sc).getSource(cfg, True)
cos.train(dl_train_source)

# Extract features
lr_raw_source = DataSource(sc).getSource(cfg, False)
extracted_df = cos.features(lr_raw_source)
extracted_df.show(365)


def _row_terms(p):
    """Return (y, y^2, e, e^2, 1) for one row, with e = label - ip1."""
    y = p.label[0]
    err = y - p.ip1[0]
    return (y, y * y, err, err * err, 1)


def _add_terms(a, b):
    """Add two 5-tuples of partial sums element by element."""
    return (a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3], a[4] + b[4])


# Go through .rdd explicitly: DataFrame.map only exists on Spark 1.x, where it
# is a shorthand for .rdd.map, so this form works on both 1.x and 2.x.
squaresdf = extracted_df.rdd.map(_row_terms)
squares = squaresdf.reduce(_add_terms)

# squares = (sum y, sum y^2, sum e, sum e^2, row count)
tss = float(squares[1]) - float(squares[0] * squares[0]) / float(squares[4])
# NOTE(review): this subtracts (sum e)^2 / n, i.e. it is the mean-centered
# residual sum of squares rather than the plain sum of squared residuals --
# confirm that this is the intended definition.
rss = float(squares[3]) - float(squares[2] * squares[2]) / float(squares[4])
r2 = 1 - rss / tss

print("Test set:")
print("Total SS: " + str(tss))
print("Redidual SS: " + str(rss))
print("R-Squared: " + str(r2))
# Function-call form prints the same text on Python 2 and also parses on
# Python 3.
print(str(squares))

cfg.protoFile = aerosolProtoFile
cfg.modelPath = 'file:' + aerosolModelFile
# Configure feature extraction: emit the 'ip1' blob next to the 'label' column.
cfg.isFeature = True
cfg.label = 'label'
cfg.features = ['ip1']
cfg.outputFormat = 'json'
cfg.clusterSize = 1
cfg.lmdb_partitions = cfg.clusterSize

# Train
dl_train_source = DataSource(sc).getSource(cfg, True)
cos.train(dl_train_source)

# Extract features
lr_raw_source = DataSource(sc).getSource(cfg, False)
extracted_df = cos.features(lr_raw_source)
extracted_df.show(365)


def _moment_terms(row):
    """Return (y, y^2, e, e^2, 1) for one row, with e = label - ip1."""
    label = row.label[0]
    residual = label - row.ip1[0]
    return (label, label * label, residual, residual * residual, 1)


def _sum_terms(left, right):
    """Combine two 5-tuples of partial sums position by position."""
    return tuple(x + y for x, y in zip(left, right))


# DataFrame.map is a Spark 1.x-only shorthand for .rdd.map; calling .rdd.map
# directly behaves the same there and keeps working on Spark 2.x.
squaresdf = extracted_df.rdd.map(_moment_terms)
squares = squaresdf.reduce(_sum_terms)

# Unpack the totals: sum y, sum y^2, sum e, sum e^2, row count.
sum_y, sum_y2, sum_e, sum_e2, count = squares
tss = float(sum_y2) - float(sum_y * sum_y) / float(count)
# NOTE(review): subtracting (sum e)^2 / n makes this the mean-centered
# residual sum of squares, not the plain sum of squared residuals -- confirm
# that this is intended.
rss = float(sum_e2) - float(sum_e * sum_e) / float(count)
r2 = 1 - rss / tss

print("Test set:")
print("Total SS: " + str(tss))
print("Redidual SS: " + str(rss))
print("R-Squared: " + str(r2))