# Imports for the test suite below. The com.yahoo.ml.caffe module paths are
# the CaffeOnSpark Python API; sc and conf are expected to be module-level
# globals supplied by the Spark driver that launches these tests.
import os
import unittest
from itertools import izip_longest

from com.yahoo.ml.caffe.CaffeOnSpark import CaffeOnSpark
from com.yahoo.ml.caffe.Config import Config
from com.yahoo.ml.caffe.DataSource import DataSource


class PythonApiTest(unittest.TestCase):
    def grouper(self, iterable, n, fillvalue=None):
        # Collect items into fixed-length chunks, e.g.
        # "-a 1 -b 2".split() with n=2 -> ('-a', '1'), ('-b', '2')
        args = [iter(iterable)] * n
        return izip_longest(fillvalue=fillvalue, *args)

    def setUp(self):
        # Initialize all objects
        self.cos = CaffeOnSpark(sc)
        cmdargs = conf.get('spark.pythonargs')
        self.args = dict(self.grouper(cmdargs.split(), 2))
        self.cfg = Config(sc, self.args)
        self.train_source = DataSource(sc).getSource(self.cfg, True)
        self.validation_source = DataSource(sc).getSource(self.cfg, False)

    def testTrain(self):
        self.cos.train(self.train_source)
        # '-model' is a URI such as file:///path/to/model; strip the
        # 'file:///' prefix to check the local file
        self.assertTrue(
            os.path.isfile(self.args.get('-model').split(":")[1][3:]))
        result = self.cos.features(self.validation_source)
        self.assertTrue('accuracy' in result.columns)
        self.assertTrue('ip1' in result.columns)
        self.assertTrue('ip2' in result.columns)
        self.assertTrue(result.count() > 100)
        self.assertTrue(result.first()['SampleID'] == '00000000')
        result = self.cos.test(self.validation_source)
        self.assertTrue(result.get('accuracy') > 0.9)

    def testTrainWithValidation(self):
        result = self.cos.trainWithValidation(self.train_source,
                                              self.validation_source)
        self.assertEqual(len(result.columns), 2)
        self.assertEqual(result.columns[0], 'accuracy')
        self.assertEqual(result.columns[1], 'loss')
        result.show(2)
        # Pull the last row of the DataFrame to check final accuracy/loss
        row_count = result.count()
        last_row = result.rdd.zipWithIndex().filter(
            lambda (row, index): index == (row_count - 1)).collect()[0][0]
        finalAccuracy = last_row[0][0]
        self.assertTrue(finalAccuracy > 0.8)
        finalLoss = last_row[1][0]
        self.assertTrue(finalLoss < 0.5)
# Variant of the same suite in which CaffeOnSpark is constructed with an
# explicit sqlContext and trainWithValidation returns one (accuracy, loss)
# row per test iteration rather than a DataFrame.
class PythonApiTest(unittest.TestCase):
    def grouper(self, iterable, n, fillvalue=None):
        args = [iter(iterable)] * n
        return izip_longest(fillvalue=fillvalue, *args)

    def setUp(self):
        # Initialize all objects
        self.cos = CaffeOnSpark(sc, sqlContext)
        cmdargs = conf.get('spark.pythonargs')
        self.args = dict(self.grouper(cmdargs.split(), 2))
        self.cfg = Config(sc, self.args)
        self.train_source = DataSource(sc).getSource(self.cfg, True)
        self.validation_source = DataSource(sc).getSource(self.cfg, False)

    def testTrain(self):
        self.cos.train(self.train_source)
        self.assertTrue(
            os.path.isfile(self.args.get('-model').split(":")[1][3:]))
        result = self.cos.features(self.validation_source)
        self.assertTrue('accuracy' in result.columns)
        self.assertTrue('ip1' in result.columns)
        self.assertTrue('ip2' in result.columns)
        result = self.cos.test(self.validation_source)
        self.assertTrue(result.get('accuracy') > 0.9)

    def testTrainWithValidation(self):
        result = self.cos.trainWithValidation(self.train_source,
                                              self.validation_source)
        self.assertEqual(self.cfg.solverParameter.getTestIter(0), len(result))
        # Average accuracy and loss over all test iterations
        finalAccuracy = 0
        finalLoss = 0
        for i in range(self.cfg.solverParameter.getTestIter(0)):
            finalAccuracy += result[i][0]
            finalLoss += result[i][1]
        self.assertTrue(
            finalAccuracy / self.cfg.solverParameter.getTestIter(0) > 0.8)
        self.assertTrue(
            finalLoss / self.cfg.solverParameter.getTestIter(0) < 0.5)
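# A minimal driver sketch for the suites above, assuming a local Spark
# master. The tests read 'spark.pythonargs' from the driver's SparkConf,
# which must be supplied at launch (e.g. via spark-submit --conf). This
# block is an illustrative assumption and is not part of the original file;
# registerContext/registerSQLContext mirror their use in the training
# script further below.
if __name__ == '__main__':
    from pyspark import SparkConf, SparkContext
    from pyspark.sql import SQLContext
    from com.yahoo.ml.caffe.RegisterContext import registerContext, registerSQLContext

    sparkConf = SparkConf().setAppName("PythonApiTest").setMaster("local")
    sc = SparkContext(conf=sparkConf)
    conf = sc.getConf()  # setUp() reads 'spark.pythonargs' from this object
    registerContext(sc)
    sqlContext = SQLContext(sc)
    registerSQLContext(sqlContext)
    unittest.main()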
""" This function calls CaffeOnSpark to train the model. It is similar in structure to the LeNext example, e.g., see https://github.com/yahoo/CaffeOnSpark/wiki/GetStarted_python In fact, the Python interface for CaffeOnSpark currently (July 2016) allows for very little deviation from this format. """ if __name__ == '__main__': sparkConf = SparkConf().setAppName("BeijingTomorrow").setMaster("local") sc=SparkContext(conf=sparkConf) registerContext(sc) sqlContext = SQLContext(sc) registerSQLContext(sqlContext) cos=CaffeOnSpark(sc,sqlContext) cfg=Config(sc) this_file = os.path.abspath(inspect.getfile(inspect.currentframe())) project_dir = os.path.dirname(os.path.dirname(os.path.dirname(this_file))) visualProtoFile= os.path.join(project_dir,"resources/caffe_prototxt/beijing_pollution_solver_visual.prototxt") visualModelFile= os.path.join(project_dir,"resources/caffe_models/beijing_pollution_model_visual.model") aerosolProtoFile= os.path.join(project_dir,"resources/caffe_prototxt/beijing_pollution_solver_aerosol.prototxt") aerosolModelFile= os.path.join(project_dir,"resources/caffe_models/beijing_pollution_model_aerosol.model") cfg.protoFile = visualProtoFile cfg.modelPath = 'file:' + visualModelFile cfg.devices = 1 cfg.isFeature=True cfg.label='label' cfg.features=['ip1'] cfg.outputFormat = 'json'
""" This function calls CaffeOnSpark to train the model. It is similar in structure to the LeNext example, e.g., see https://github.com/yahoo/CaffeOnSpark/wiki/GetStarted_python In fact, the Python interface for CaffeOnSpark currently (July 2016) allows for very little deviation from this format. """ if __name__ == '__main__': sparkConf = SparkConf().setAppName("BeijingTomorrow").setMaster("local") sc = SparkContext(conf=sparkConf) registerContext(sc) sqlContext = SQLContext(sc) registerSQLContext(sqlContext) cos = CaffeOnSpark(sc, sqlContext) cfg = Config(sc) this_file = os.path.abspath(inspect.getfile(inspect.currentframe())) project_dir = os.path.dirname(os.path.dirname(os.path.dirname(this_file))) visualProtoFile = os.path.join( project_dir, "resources/caffe_prototxt/beijing_pollution_solver_visual.prototxt") visualModelFile = os.path.join( project_dir, "resources/caffe_models/beijing_pollution_model_visual.model") aerosolProtoFile = os.path.join( project_dir, "resources/caffe_prototxt/beijing_pollution_solver_aerosol.prototxt") aerosolModelFile = os.path.join( project_dir, "resources/caffe_models/beijing_pollution_model_aerosol.model")