def func(rd):
    # Parse each line of the form `label||text` into (int label, text).
    rd = rd.map(lambda l: l.strip('"')) \
           .map(lambda l: l.split("||")) \
           .map(lambda l: (int(l[0]), l[1]))
    sqlContext = SQLContext(sc)
    test_data = sqlContext.createDataFrame(rd, schema=["label", "text"])

    data = sc.textFile("/home/asdf/Documents/news", 1)
    data = data.map(lambda l: l.strip('"')) \
               .map(lambda l: l.split("||")) \
               .map(lambda l: (int(l[0]), l[1]))
    df = sqlContext.createDataFrame(data, schema=["label", "text"])

    # Load the persisted (unfitted) pipelines, fit them on the training data,
    # then score the held-out data.
    lr = Pipeline.load("/home/asdf/Documents/models/lr")
    nb = Pipeline.load("/home/asdf/Documents/models/nb")
    lr_pred = lr.fit(df).transform(test_data)
    nb_pred = nb.fit(df).transform(test_data)

    # Evaluators for accuracy, weighted recall, and weighted precision.
    accuracy = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                 metricName="accuracy")
    recall = MulticlassClassificationEvaluator(predictionCol="prediction",
                                               metricName="weightedRecall")
    precision = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  metricName="weightedPrecision")

    with open("/home/asdf/Documents/op1", 'a') as file:
        file.write("Logistic Regression:\n")
        file.write("Accuracy:" + str(accuracy.evaluate(lr_pred) * 100))
        file.write("\nRecall:" + str(recall.evaluate(lr_pred) * 100))
        file.write("\nPrecision:" + str(precision.evaluate(lr_pred) * 100))
        file.write('\n')
        file.write("\nNaive Bayes:\n")
        file.write("Accuracy:" + str(accuracy.evaluate(nb_pred) * 100))
        file.write("\nRecall:" + str(recall.evaluate(nb_pred) * 100))
        file.write("\nPrecision:" + str(precision.evaluate(nb_pred) * 100))
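# A minimal invocation sketch, assuming an active SparkContext `sc` and a
# held-out file in the same `label||text` format; the test-file path is
# hypothetical.
test_rdd = sc.textFile("/home/asdf/Documents/news_test", 1)
func(test_rdd)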
def test_h2o_mojo_model_serialization_in_pipeline(self):
    mojo = H2OMOJOModel.createFromMojo(
        "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))
    prostate_frame = self._spark.read.csv(
        "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"),
        header=True)

    pipeline = Pipeline(stages=[mojo])
    pipeline.write().overwrite().save(
        "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo"))
    loaded_pipeline = Pipeline.load(
        "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo"))
    model = loaded_pipeline.fit(prostate_frame)
    model.write().overwrite().save(
        "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo_model"))
    PipelineModel.load(
        "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo_model"))
def testPipelineWithTargetEncoderIsSerializable():
    targetEncoder = H2OTargetEncoder(foldCol="ID",
                                     labelCol="CAPSULE",
                                     inputCols=["RACE", "DPROS", "DCAPS"],
                                     outputCols=["RACE_out", "DPROS_out", "DCAPS_out"],
                                     holdoutStrategy="KFold",
                                     blendedAvgEnabled=True,
                                     blendedAvgInflectionPoint=15.0,
                                     blendedAvgSmoothing=25.0,
                                     noise=0.05,
                                     noiseSeed=123)
    gbm = H2OGBM() \
        .setLabelCol("CAPSULE") \
        .setFeaturesCols(targetEncoder.getOutputCols())
    pipeline = Pipeline(stages=[targetEncoder, gbm])

    path = "file://" + os.path.abspath("build/testPipelineWithTargetEncoderIsSerializable")
    pipeline.write().overwrite().save(path)
    loadedPipeline = Pipeline.load(path)
    [loadedTargetEncoder, loadedGbm] = loadedPipeline.getStages()

    assertTargetEncoderAndMOJOModelParamsAreEqual(targetEncoder, loadedTargetEncoder)
    assert gbm.getLabelCol() == loadedGbm.getLabelCol()
    assert gbm.getFeaturesCols() == loadedGbm.getFeaturesCols()
def testPipelineSerialization(craiglistDataset):
    [trainingDataset, testingDataset] = craiglistDataset.randomSplit([0.9, 0.1], 42)

    tokenizer = RegexTokenizer(inputCol="jobtitle",
                               minTokenLength=2,
                               outputCol="tokenized")
    stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                        outputCol="stopWordsRemoved")
    w2v = H2OWord2Vec(sentSampleRate=0,
                      epochs=10,
                      inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="w2v")
    gbm = H2OGBM(labelCol="category", featuresCols=[w2v.getOutputCol()])

    pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, w2v, gbm])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/w2v_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/w2v_pipeline"))
    model = loadedPipeline.fit(trainingDataset)
    expected = model.transform(testingDataset)

    model.write().overwrite().save("file://" + os.path.abspath("build/w2v_pipeline_model"))
    loadedModel = PipelineModel.load("file://" + os.path.abspath("build/w2v_pipeline_model"))
    result = loadedModel.transform(testingDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
def featureExtractLr(self, trainDataframe, predictionDataframe):
    pipeline = None
    try:
        # pipeline = PipelineModel.load(ROOT_PATH + '/logistic')
        pipeline = Pipeline.load(ROOT_PATH + '/logistic')
    except Exception as e:
        print(e)
        self.logger.error(e)
    if pipeline is None:
        # tokenizer = Tokenizer(inputCol="keywords", outputCol="words")
        remover = StopWordsRemover(inputCol="keywords", outputCol="filtered")
        # Set the Chinese stop words.
        remover.setStopWords(self.cuttingMachine.chineseStopwords())
        hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="features")
        lr = LogisticRegression(maxIter=10, regParam=0.001).setElasticNetParam(0.8)
        pipeline = Pipeline(stages=[remover, hashingTF, lr])
    model = pipeline.fit(trainDataframe)
    pipeline.write().overwrite().save(ROOT_PATH + '/logistic')
    # model.write().overwrite().save(ROOT_PATH + '/logistic')
    resultDataframe = model.transform(predictionDataframe)
    resultDataframe.show()
    selected = resultDataframe.select("id", "features", "probability", "prediction")
    for row in selected.collect():
        rid, features, prob, prediction = row
        self.logger.info("features: %s", features)
        self.logger.info("prob: %s", str(prob))
        self.logger.info("prediction: %s", str(prediction))
def featureExtract(self, trainDataframe, predictionDataframe):
    pipeline = None
    try:
        pipeline = Pipeline.load(ROOT_PATH + '/pipeline')
    except Exception as e:
        print(e)
        self.logger.error(e)
    if pipeline is None:
        # tokenizer = Tokenizer(inputCol="keywords", outputCol="words")
        remover = StopWordsRemover(inputCol="keywords", outputCol="filtered")
        # Set the Chinese stop words.
        remover.setStopWords(self.cuttingMachine.chineseStopwords())
        hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="features")
        idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idff")
        # lr = LogisticRegression(maxIter=10, regParam=0.001)
        pipeline = Pipeline(stages=[remover, hashingTF, idf])
    model = pipeline.fit(trainDataframe)
    pipeline.write().overwrite().save(ROOT_PATH + '/pipeline')
    resultDataframe = model.transform(predictionDataframe)
    resultDataframe.show()
    selected = resultDataframe.select("filtered", "features", "idff")
    for row in selected.collect():
        filtered, features, idff = row
        self.logger.info("features: %s", features)
        self.logger.info("idff: %s", idff)
        self.logger.info("filtered: %s", filtered)
    return selected
def trainPipelineModel(idf, hashingTF, stopWordsRemover, tokenizer, algoStage, data):
    ## Remove all helper columns
    colPruner = ColumnPruner(columns=[
        idf.getOutputCol(),
        hashingTF.getOutputCol(),
        stopWordsRemover.getOutputCol(),
        tokenizer.getOutputCol()
    ])

    ## Create the pipeline by defining all the stages
    pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner])

    ## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not
    ## available, this call stores the pipeline in a local file in the current directory.
    ## When HDFS & Hadoop are available, it stores the pipeline in the HDFS home directory
    ## of the current user. Absolute paths can be used as well. The same holds for the
    ## model import/export below.
    pipelinePath = "file://" + os.path.abspath("../build/pipeline")
    pipeline.write().overwrite().save(pipelinePath)
    loaded_pipeline = Pipeline.load(pipelinePath)

    ## Train the pipeline model
    modelPath = "file://" + os.path.abspath("../build/model")
    model = loaded_pipeline.fit(data)
    model.write().overwrite().save(modelPath)
    return PipelineModel.load(modelPath)
def test_grid_gbm_in_spark_pipeline(self):
    prostate_frame = self._spark.read.csv(
        "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"),
        header=True,
        inferSchema=True)

    algo = H2OGridSearch(predictionCol="AGE",
                         hyperParameters={"_seed": [1, 2, 3]},
                         ratio=0.8,
                         algo=H2OGBM())

    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/grid_gbm_pipeline"))
    loaded_pipeline = Pipeline.load("file://" + os.path.abspath("build/grid_gbm_pipeline"))
    model = loaded_pipeline.fit(prostate_frame)
    model.write().overwrite().save("file://" + os.path.abspath("build/grid_gbm_pipeline_model"))
    loaded_model = PipelineModel.load("file://" + os.path.abspath("build/grid_gbm_pipeline_model"))
    loaded_model.transform(prostate_frame).count()
def test_glm_in_spark_pipeline(self):
    prostate_frame = self._spark.read.csv(
        "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"),
        header=True,
        inferSchema=True)

    algo = H2OGLM(featuresCols=["CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"],
                  labelCol="AGE",
                  seed=1,
                  ratio=0.8)

    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/glm_pipeline"))
    loaded_pipeline = Pipeline.load("file://" + os.path.abspath("build/glm_pipeline"))
    model = loaded_pipeline.fit(prostate_frame)
    model.write().overwrite().save("file://" + os.path.abspath("build/glm_pipeline_model"))
    loaded_model = PipelineModel.load("file://" + os.path.abspath("build/glm_pipeline_model"))
    loaded_model.transform(prostate_frame).count()
def getIn(self):
    # Train Spark (and similar) models.
    from pyspark.ml import Pipeline
    self.originalDF = utils.dataUtil.SparkReadHive(self.inputUrl2, self.spark)
    self.model = Pipeline.load(self.inputUrl1).getStages()[0]
def getIn(self):
    self.logger.debug("using PySpark")
    from pyspark.ml import Pipeline
    self.originalDF = utils.dataUtil.SparkReadHive(self.inputUrl2, self.spark)
    self.model = Pipeline.load(self.inputUrl1).getStages()[0]
def test_grid_gbm_in_spark_pipeline(self):
    prostate_frame = self._spark.read.csv(
        "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"),
        header=True,
        inferSchema=True)

    algo = H2OGridSearch(labelCol="AGE",
                         hyperParameters={"_seed": [1, 2, 3]},
                         ratio=0.8,
                         algo=H2OGBM(),
                         strategy="RandomDiscrete",
                         maxModels=3,
                         maxRuntimeSecs=60,
                         selectBestModelBy="RMSE")

    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/grid_gbm_pipeline"))
    loaded_pipeline = Pipeline.load("file://" + os.path.abspath("build/grid_gbm_pipeline"))
    model = loaded_pipeline.fit(prostate_frame)
    model.write().overwrite().save("file://" + os.path.abspath("build/grid_gbm_pipeline_model"))
    loaded_model = PipelineModel.load("file://" + os.path.abspath("build/grid_gbm_pipeline_model"))
    loaded_model.transform(prostate_frame).count()
def test_nested_pipeline_persistence(self):
    """
    Pipeline[HashingTF, Pipeline[PCA]]
    """
    sqlContext = SQLContext(self.sc)
    temp_path = tempfile.mkdtemp()
    try:
        df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
        tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        p0 = Pipeline(stages=[pca])
        pl = Pipeline(stages=[tf, p0])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self._compare_pipelines(pl, loaded_pipeline)

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        self._compare_pipelines(model, loaded_model)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
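# A hedged follow-up sketch (would sit inside the try block above): the nested
# pipeline survives the round trip as a stage of the outer pipeline and can be
# inspected through getStages().
inner = loaded_pipeline.getStages()[1]
assert isinstance(inner, Pipeline)
assert isinstance(inner.getStages()[0], PCA)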
def test_mojo_dai_pipeline_serialize(self):
    mojo = H2OMOJOPipelineModel.createFromMojo(
        "file://" + os.path.abspath("../ml/src/test/resources/mojo2data/pipeline.mojo"))
    prostateFrame = self._spark.read.csv(
        "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"),
        header=True)

    # Create a Spark pipeline with a single step - the mojo pipeline
    pipeline = Pipeline(stages=[mojo])
    pipeline.write().overwrite().save(
        "file://" + os.path.abspath("build/test_dai_pipeline_as_spark_pipeline"))
    loadedPipeline = Pipeline.load(
        "file://" + os.path.abspath("build/test_dai_pipeline_as_spark_pipeline"))

    # Train the pipeline model
    model = loadedPipeline.fit(prostateFrame)
    model.write().overwrite().save(
        "file://" + os.path.abspath("build/test_dai_pipeline_as_spark_pipeline_model"))
    loadedModel = PipelineModel.load(
        "file://" + os.path.abspath("build/test_dai_pipeline_as_spark_pipeline_model"))

    preds = loadedModel.transform(prostateFrame).repartition(1) \
        .select(mojo.selectPredictionUDF("AGE")).take(5)

    assert preds[0][0] == 65.36320409515132
    assert preds[1][0] == 64.96902128114817
    assert preds[2][0] == 64.96721023747583
    assert preds[3][0] == 65.78772654671035
    assert preds[4][0] == 66.11327967814829
def __prediction(self, df_pred):
    logger.info("__prediction: LOADING PIPELINE ##################################### ")
    gbt_pipeline = Pipeline.load('budget_prediction_pipeline/')
    gbt_pipeline_loaded = gbt_pipeline.fit(df_pred)
    ddf_features_df = gbt_pipeline_loaded.transform(df_pred)

    logger.info("__prediction: FILTERING DATA ##################################### ")
    ddf_features_df = ddf_features_df.filter("idmovie in(99999)")

    logger.info("__prediction: LOADING MODEL ##################################### ")
    gbt_model_load = GBTRegressionModel.load('gbt_model_old/')
    gbt_model_pred = gbt_model_load.transform(ddf_features_df)
    gbt_model_pred.selectExpr(
        'idmovie', 'director', 'genres', 'runtime',
        'cast(prediction as Decimal(38,2)) as prediction').show(truncate=False)

    logger.info("__prediction: DATA PREDICTED ##################################### ")
    return gbt_model_pred.selectExpr('director', 'genres', 'runtime', 'prediction')
def test2():
    trA = MyTransformer()
    pipeA = Pipeline(stages=[trA])
    print(type(pipeA))
    pipeA.save('testA.pipe')
    pipeAA = PysparkPipelineWrapper.unwrap(Pipeline.load('testA.pipe'))
    stagesAA = pipeAA.getStages()
    trAA = stagesAA[0]
    print(trAA.dataset_count)
def __init__(self):
    print("Initializing SentAnalysisPrediction!!")
    self.appName = "Sentiment Analysis in Spark"
    # create Spark session
    self.spark = SparkSession.builder.appName(self.appName) \
        .config("spark.executor.heartbeatInterval", "200000") \
        .config("spark.network.timeout", "300000") \
        .getOrCreate()
    self.model_name = "prediction_pipeline"
    self.pipeline = Pipeline.load(self.model_name)
def create_cross_val(pipelineStr, regressor):
    print("Creating cross validator")
    # Pipeline.load returns an (unfitted) Pipeline, which serves as the estimator.
    pipeline = Pipeline.load(pipelineStr)
    paramGrid = ParamGridBuilder().addGrid(regressor.numTrees, [100, 500]).build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=RegressionEvaluator(labelCol="MSRP"),
                              numFolds=3)
    return crossval
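# Hypothetical usage sketch: the path and `rf` are assumptions. Note that for
# the numTrees grid to bind, `rf` must be the same stage instance that was
# saved inside the pipeline at `pipelineStr` (Spark resolves grid params by
# the stage's uid).
from pyspark.ml.regression import RandomForestRegressor
rf = RandomForestRegressor(labelCol="MSRP")
cv = create_cross_val("file:///tmp/msrp_pipeline", rf)
print(len(cv.getEstimatorParamMaps()))  # 2 candidates: numTrees in {100, 500}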
def gridSearchTester(algo, prostateDataset):
    grid = H2OGridSearch(labelCol="AGE",
                         hyperParameters={"seed": [1, 2, 3]},
                         splitRatio=0.8,
                         algo=algo,
                         strategy="RandomDiscrete",
                         maxModels=3,
                         maxRuntimeSecs=60,
                         selectBestModelBy="RMSE")

    pipeline = Pipeline(stages=[grid])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/grid_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/grid_pipeline"))
    model = loadedPipeline.fit(prostateDataset)
    model.write().overwrite().save("file://" + os.path.abspath("build/grid_pipeline_model"))
    loadedModel = PipelineModel.load("file://" + os.path.abspath("build/grid_pipeline_model"))
    loadedModel.transform(prostateDataset).count()
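# A hedged usage sketch, assuming `prostateDataset` is a Spark DataFrame with
# an AGE label column as in the surrounding tests:
gridSearchTester(H2OGBM(), prostateDataset)
gridSearchTester(H2ODRF(), prostateDataset)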
def testPipelineSerialization(prostateDataset):
    algo = H2OIsolationForest(seed=1)

    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/isolation_forest_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/isolation_forest_pipeline"))
    model = loadedPipeline.fit(prostateDataset)
    expected = model.transform(prostateDataset)

    model.write().overwrite().save("file://" + os.path.abspath("build/isolation_forest_pipeline_model"))
    loadedModel = PipelineModel.load("file://" + os.path.abspath("build/isolation_forest_pipeline_model"))
    result = loadedModel.transform(prostateDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
def testPipelineSerialization(prostateDataset):
    algo = H2ODRF(featuresCols=["CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"],
                  labelCol="AGE",
                  seed=1,
                  splitRatio=0.8)

    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/drf_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/drf_pipeline"))
    model = loadedPipeline.fit(prostateDataset)
    model.write().overwrite().save("file://" + os.path.abspath("build/drf_pipeline_model"))
    loadedModel = PipelineModel.load("file://" + os.path.abspath("build/drf_pipeline_model"))
    loadedModel.transform(prostateDataset).count()
def test_pipeline_persistence(self):
    sqlContext = SQLContext(self.sc)
    temp_path = tempfile.mkdtemp()
    try:
        df = sqlContext.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
        tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        pl = Pipeline(stages=[tf, pca])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self.assertEqual(loaded_pipeline.uid, pl.uid)
        self.assertEqual(len(loaded_pipeline.getStages()), 2)

        [loaded_tf, loaded_pca] = loaded_pipeline.getStages()
        self.assertIsInstance(loaded_tf, HashingTF)
        self.assertEqual(loaded_tf.uid, tf.uid)
        param = loaded_tf.getParam("numFeatures")
        self.assertEqual(loaded_tf.getOrDefault(param), tf.getOrDefault(param))
        self.assertIsInstance(loaded_pca, PCA)
        self.assertEqual(loaded_pca.uid, pca.uid)
        self.assertEqual(loaded_pca.getK(), pca.getK())

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        [model_tf, model_pca] = model.stages
        [loaded_model_tf, loaded_model_pca] = loaded_model.stages
        self.assertEqual(model_tf.uid, loaded_model_tf.uid)
        self.assertEqual(model_tf.getOrDefault(param), loaded_model_tf.getOrDefault(param))
        self.assertEqual(model_pca.uid, loaded_model_pca.uid)
        self.assertEqual(model_pca.pc, loaded_model_pca.pc)
        self.assertEqual(model_pca.explainedVariance, loaded_model_pca.explainedVariance)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
def kmeans_with_loading():
    df1 = sqlContext.read.format("csv").option("header", "true") \
        .option("mode", "DROPMALFORMED").load("canadatweets.csv")
    df2 = sqlContext.read.format("csv").option("header", "true") \
        .option("mode", "DROPMALFORMED").load("products.csv")
    df3 = sqlContext.read.format("csv").option("header", "true") \
        .option("mode", "DROPMALFORMED").load("products.csv")
    df4 = sqlContext.read.format("csv").option("header", "true") \
        .option("mode", "DROPMALFORMED").load("claritin.csv")
    df = df1.unionAll(df2)
    df = df.unionAll(df3)
    df = df.unionAll(df4)
    df.show()

    clusteringPipeline = Pipeline.load('KMeansPipeline')
    model = clusteringPipeline.fit(df)
    transf = model.transform(df)
    transf.show(200, False)
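# A hedged follow-up sketch: Spark's KMeans writes cluster assignments to a
# "prediction" column by default, so cluster sizes can be inspected with a
# simple aggregation (the column name is an assumption about the saved pipeline).
transf.groupBy("prediction").count().orderBy("prediction").show()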
def testPipelineSerialization(dataset):
    algo = H2OKMeans(splitRatio=0.8,
                     seed=1,
                     k=3,
                     featuresCols=["sepal_len", "sepal_wid", "petal_len", "petal_wid"])

    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/kmeans_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/kmeans_pipeline"))
    model = loadedPipeline.fit(dataset)
    model.write().overwrite().save("file://" + os.path.abspath("build/kmeans_pipeline_model"))
    loadedModel = PipelineModel.load("file://" + os.path.abspath("build/kmeans_pipeline_model"))
    loadedModel.transform(dataset).count()
def testMojoModelSerializationInPipeline(prostateDataset):
    mojo = H2OMOJOModel.createFromMojo(
        "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))

    pipeline = Pipeline(stages=[mojo])
    pipeline.write().overwrite().save(
        "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo"))
    loadedPipeline = Pipeline.load(
        "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo"))
    model = loadedPipeline.fit(prostateDataset)
    model.write().overwrite().save(
        "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo_model"))
    PipelineModel.load(
        "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo_model"))
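# A hedged scoring sketch: the reloaded PipelineModel can transform the same
# dataset; the output column name ("prediction") is an assumption about the
# MOJO wrapper's defaults.
loadedModel = PipelineModel.load(
    "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo_model"))
loadedModel.transform(prostateDataset).select("prediction").show(5)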
def testPipelineSerialization(heartDataset):
    features = ['age', 'year', 'surgery', 'transplant', 'start', 'stop']
    algo = H2OCoxPH(labelCol="event",
                    featuresCols=features,
                    startCol='start',
                    stopCol='stop')

    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/cox_ph_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/cox_ph_pipeline"))
    model = loadedPipeline.fit(heartDataset)
    expected = model.transform(heartDataset)

    # Save the fitted model under its own path so it does not clobber the pipeline.
    model.write().overwrite().save("file://" + os.path.abspath("build/cox_ph_pipeline_model"))
    loadedModel = PipelineModel.load("file://" + os.path.abspath("build/cox_ph_pipeline_model"))
    result = loadedModel.transform(heartDataset)

    unit_test_utils.assert_data_frames_are_identical(expected, result)
colPruner = ColumnPruner(columns=[
    idf.getOutputCol(),
    hashingTF.getOutputCol(),
    stopWordsRemover.getOutputCol(),
    tokenizer.getOutputCol()
])

## Create the pipeline by defining all the stages
pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner])

## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not
## available, this call stores the pipeline in a local file in the current directory.
## When HDFS & Hadoop are available, it stores the pipeline in the HDFS home directory
## of the current user. Absolute paths can be used as well. The same holds for the
## model import/export below.
pipeline.write().overwrite().save("examples/build/pipeline")
loaded_pipeline = Pipeline.load("examples/build/pipeline")

## Train the pipeline model
data = load()
model = loaded_pipeline.fit(data)
model.write().overwrite().save("examples/build/model")
loaded_model = PipelineModel.load("examples/build/model")

##
## Make predictions on unlabeled data
## Spam detector
##
def isSpam(smsText, model, hamThreshold=0.5):
    smsTextDF = spark.createDataFrame([(smsText, )],
elif algo == "xgboost": ## Create H2OXGBoost model algoStage = H2OXGBoost(convertUnknownCategoricalLevelsToNa=True, featuresCols=[idf.getOutputCol()], predictionCol="label") ## Remove all helper columns colPruner = ColumnPruner(columns=[idf.getOutputCol(), hashingTF.getOutputCol(), stopWordsRemover.getOutputCol(), tokenizer.getOutputCol()]) ## Create the pipeline by defining all the stages pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner]) ## Test exporting and importing the pipeline. On Systems where HDFS & Hadoop is not available, this call store the pipeline ## to local file in the current directory. In case HDFS & Hadoop is available, this call stores the pipeline to HDFS home ## directory for the current user. Absolute paths can be used as wells. The same holds for the model import/export bellow. pipeline.write().overwrite().save("examples/build/pipeline") loaded_pipeline = Pipeline.load("examples/build/pipeline") ## Train the pipeline model data = load() model = loaded_pipeline.fit(data) model.write().overwrite().save("examples/build/model") loaded_model = PipelineModel.load("examples/build/model") ## ## Make predictions on unlabeled data ## Spam detector ##
import ast

from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel

# start a kafka consumer session
from kafka.consumer import KafkaConsumer
consumer = KafkaConsumer(
    "titanic",
    bootstrap_servers=['ip-172-31-12-218.us-east-2.compute.internal:6667'])
print('consumer launched')

testSchema = [
    "PassengerId", "Pclass", "Name", "Sex", "Age", "SibSp", "Parch",
    "Ticket", "Fare", "Cabin", "Embarked"
]

pipeline = Pipeline.load("/home/ubuntu/titanic/pipeline")
model = PipelineModel.load("/home/ubuntu/titanic/model")


def getTrain(msg):
    # put passenger info into dataframe
    # print(msg)
    # combine two lists into a list of tuples
    # combined = map(lambda x, y: (x, y), trainSchema, msg)
    msg = [ast.literal_eval(msg)]
    # cast the numeric fields to float
    msg[0][0] = float(msg[0][0])
    msg[0][1] = float(msg[0][1])
    msg[0][4] = float(msg[0][4])
    msg[0][5] = float(msg[0][5])
    msg[0][6] = float(msg[0][6])
    msg[0][8] = float(msg[0][8])
.config("spark.hadoop.yarn.resourcemanager.principal",os.environ["HADOOP_USER_NAME"])\ .config("spark.executor.instances", 2)\ .config("spark.executor.cores", 2)\ .getOrCreate() ### Reading latest data and recreating pipeline from model development notebook #Notice we are only picking the most recent 1000 rows reflecting latest customer interactions df = spark.sql("SELECT RECENCY, HISTORY, USED_DISCOUNT, USED_BOGO, ZIP_CODE, IS_REFERRAL, CHANNEL, OFFER, SCORE, CONVERSION FROM DEFAULT.CUSTOMER_INTERACTIONS_CICD ORDER BY BATCH_TMS DESC LIMIT 20000") #Renaming target feature as "LABEL": df = df.withColumnRenamed("CONVERSION","label") latest_url_formatted = find_latest("spark_pipeline") pm = Pipeline.load(latest_url_formatted) pm = pm.fit(df) ## Saving the newly trained model run_time_suffix = datetime.now() run_time_suffix_string = run_time_suffix.strftime("%d%m%Y%H%M%S") pm.write().overwrite().save(os.environ["STORAGE"]+"/testpysparkmodels/"+"{}".format(run_time_suffix_string)) ## Saving the newly trained model metadata conn = sqlite3.connect('models.db') c = conn.cursor() import re
# In[13]:

import pyspark.ml.evaluation as ev

evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol='probability',
                                             labelCol='INFANT_ALIVE_AT_REPORT')
print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))

# In[14]:

pipelinePath = './model/infant_oneHotEncoder_Logistic_Pipeline'
pipeline.write().overwrite().save(pipelinePath)

# In[15]:

loadedPipeline = Pipeline.load(pipelinePath)
loadedPipeline.fit(births_train).transform(births_test).take(1)

# In[16]:

from pyspark.ml import PipelineModel

modelPath = './model/infant_oneHotEncoder_Logistic_PipelineModel'
model.write().overwrite().save(modelPath)
loadedPipelineModel = PipelineModel.load(modelPath)
test_loadedModel = loadedPipelineModel.transform(births_test)
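# In[17]:

# A hedged follow-up sketch: score the output of the reloaded model with the
# evaluator from In[13] to confirm the round trip preserved the model.
print(evaluator.evaluate(test_loadedModel, {evaluator.metricName: 'areaUnderROC'}))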