def setUp(self):
    # Initialize all objects
    self.cos = CaffeOnSpark(sc)
    cmdargs = conf.get('spark.pythonargs')
    self.args = dict(self.grouper(cmdargs.split(), 2))
    self.cfg = Config(sc, self.args)
    self.train_source = DataSource(sc).getSource(self.cfg, True)
    self.validation_source = DataSource(sc).getSource(self.cfg, False)
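# setUp above relies on a grouper helper (defined in the training script
# further below) to pair "-flag value" tokens from spark.pythonargs into a
# dict for Config. A minimal, self-contained sketch of that parsing; the flag
# names here are illustrative placeholders, not verified CaffeOnSpark options:
from itertools import izip_longest

def grouper(iterable, n, fillvalue=None):
    # Collect tokens into fixed-length chunks: "ABCDEF", 2 -> AB CD EF
    args = [iter(iterable)] * n
    return izip_longest(fillvalue=fillvalue, *args)

cmdargs = "-conf lenet_memory_solver.prototxt -devices 1"
print dict(grouper(cmdargs.split(), 2))
# {'-conf': 'lenet_memory_solver.prototxt', '-devices': '1'}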
""" This function calls CaffeOnSpark to train the model. It is similar in structure to the LeNext example, e.g., see https://github.com/yahoo/CaffeOnSpark/wiki/GetStarted_python In fact, the Python interface for CaffeOnSpark currently (July 2016) allows for very little deviation from this format. """ if __name__ == '__main__': sparkConf = SparkConf().setAppName("BeijingTomorrow").setMaster("local") sc = SparkContext(conf=sparkConf) registerContext(sc) sqlContext = SQLContext(sc) registerSQLContext(sqlContext) cos = CaffeOnSpark(sc, sqlContext) cfg = Config(sc) this_file = os.path.abspath(inspect.getfile(inspect.currentframe())) project_dir = os.path.dirname(os.path.dirname(os.path.dirname(this_file))) visualProtoFile = os.path.join( project_dir, "resources/caffe_prototxt/beijing_pollution_solver_visual.prototxt") visualModelFile = os.path.join( project_dir, "resources/caffe_models/beijing_pollution_model_visual.model") aerosolProtoFile = os.path.join( project_dir, "resources/caffe_prototxt/beijing_pollution_solver_aerosol.prototxt") aerosolModelFile = os.path.join( project_dir, "resources/caffe_models/beijing_pollution_model_aerosol.model")
from itertools import izip_longest
from pyspark import SparkConf, SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.sql import SQLContext
from com.yahoo.ml.caffe.CaffeOnSpark import CaffeOnSpark
from com.yahoo.ml.caffe.Config import Config
from com.yahoo.ml.caffe.DataSource import DataSource


def grouper(iterable, n, fillvalue=None):
    # Collect tokens into fixed-length chunks: "ABCDEF", 2 -> AB CD EF
    args = [iter(iterable)] * n
    return izip_longest(fillvalue=fillvalue, *args)


conf = SparkConf()
sc = SparkContext(conf=conf)

# Initialize all objects
cos = CaffeOnSpark(sc)
cmdargs = conf.get('spark.pythonargs')
args = dict(grouper(cmdargs.split(), 2))
cfg = Config(sc, args)
dl_train_source = DataSource(sc).getSource(cfg, True)

# Train
cos.train(dl_train_source)
lr_raw_source = DataSource(sc).getSource(cfg, False)

# Extract features
extracted_df = cos.features(lr_raw_source)

# Do multiclass LogisticRegression
data = extracted_df.map(
    lambda row: LabeledPoint(row.label[0], Vectors.dense(row.ip1)))
lr = LogisticRegressionWithLBFGS.train(data, numClasses=10, iterations=10)
predictions = lr.predict(data.map(lambda pt: pt.features))
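# The example above ends at predict(). A standard MLlib follow-up (not part of
# the original) to gauge training error by zipping predictions with labels:
labels_and_preds = data.map(lambda pt: pt.label).zip(predictions)
train_err = (labels_and_preds.filter(lambda (l, p): l != p).count()
             / float(data.count()))
print 'Training error: %g' % train_err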