def featureExtractLr(self, trainDataframe, predictionDataframe):
    pipeline = None
    try:
        # pipeline = PipelineModel.load(ROOT_PATH+'/logistic')
        pipeline = Pipeline.load(ROOT_PATH + '/logistic')
    except Exception as e:
        print(e)
        self.logger.error(e)
    if pipeline is None:
        # tokenizer = Tokenizer(inputCol="keywords", outputCol="words")
        remover = StopWordsRemover(inputCol="keywords", outputCol="filtered")
        # set the stop words
        remover.setStopWords(self.cuttingMachine.chineseStopwords())
        hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="features")
        lr = LogisticRegression(maxIter=10, regParam=0.001).setElasticNetParam(0.8)
        pipeline = Pipeline(stages=[remover, hashingTF, lr])
    model = pipeline.fit(trainDataframe)
    pipeline.write().overwrite().save(ROOT_PATH + '/logistic')
    # model.write().overwrite().save(ROOT_PATH+'/logistic')
    resultDataframe = model.transform(predictionDataframe)
    resultDataframe.show()
    selected = resultDataframe.select("id", "features", "probability", "prediction")
    for row in selected.collect():
        rid, features, prob, prediction = row
        self.logger.info("features: %s", features)
        self.logger.info("prob: %s", str(prob))
        self.logger.info("prediction: %s", str(prediction))
def testPipelineWithTargetEncoderIsSerializable():
    targetEncoder = H2OTargetEncoder(
        foldCol="ID",
        labelCol="CAPSULE",
        inputCols=["RACE", "DPROS", "DCAPS"],
        outputCols=["RACE_out", "DPROS_out", "DCAPS_out"],
        holdoutStrategy="KFold",
        blendedAvgEnabled=True,
        blendedAvgInflectionPoint=15.0,
        blendedAvgSmoothing=25.0,
        noise=0.05,
        noiseSeed=123)
    gbm = H2OGBM() \
        .setLabelCol("CAPSULE") \
        .setFeaturesCols(targetEncoder.getOutputCols())
    pipeline = Pipeline(stages=[targetEncoder, gbm])
    path = "file://" + os.path.abspath("build/testPipelineWithTargetEncoderIsSerializable")
    pipeline.write().overwrite().save(path)
    loadedPipeline = Pipeline.load(path)
    [loadedTargetEncoder, loadedGbm] = loadedPipeline.getStages()
    assertTargetEncoderAndMOJOModelParamsAreEqual(targetEncoder, loadedTargetEncoder)
    assert gbm.getLabelCol() == loadedGbm.getLabelCol()
    assert gbm.getFeaturesCols() == loadedGbm.getFeaturesCols()
def testPipelineSerialization(craiglistDataset):
    [trainingDataset, testingDataset] = craiglistDataset.randomSplit([0.9, 0.1], 42)
    tokenizer = RegexTokenizer(inputCol="jobtitle", minTokenLength=2, outputCol="tokenized")
    stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="stopWordsRemoved")
    w2v = H2OWord2Vec(sentSampleRate=0, epochs=10, inputCol=stopWordsRemover.getOutputCol(), outputCol="w2v")
    gbm = H2OGBM(labelCol="category", featuresCols=[w2v.getOutputCol()])
    pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, w2v, gbm])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/w2v_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/w2v_pipeline"))
    model = loadedPipeline.fit(trainingDataset)
    expected = model.transform(testingDataset)
    model.write().overwrite().save("file://" + os.path.abspath("build/w2v_pipeline_model"))
    loadedModel = PipelineModel.load("file://" + os.path.abspath("build/w2v_pipeline_model"))
    result = loadedModel.transform(testingDataset)
    unit_test_utils.assert_data_frames_are_identical(expected, result)
def runTest(self):
    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")
    tokenizer = RegexTokenizer() \
        .setOutputCol("token")
    lemmatizer = Lemmatizer() \
        .setInputCols(["token"]) \
        .setOutputCol("lemma") \
        .setDictionary({"sad": "unsad"})
    finisher = Finisher() \
        .setInputCols(["token", "lemma"]) \
        .setOutputCols(["token_views", "lemma_views"])
    pipeline = Pipeline(stages=[document_assembler, tokenizer, lemmatizer, finisher])
    model = pipeline.fit(self.data)
    token_before_save = model.transform(self.data).select(
        "token_views").take(1)[0].token_views.split("@")[2]
    lemma_before_save = model.transform(self.data).select(
        "lemma_views").take(1)[0].lemma_views.split("@")[2]
    pipe_path = "./tmp_pipeline"
    pipeline.write().overwrite().save(pipe_path)
    loaded_pipeline = Pipeline.read().load(pipe_path)
    token_after_save = model.transform(self.data).select(
        "token_views").take(1)[0].token_views.split("@")[2]
    lemma_after_save = model.transform(self.data).select(
        "lemma_views").take(1)[0].lemma_views.split("@")[2]
    print(token_before_save)
    assert token_before_save == "sad"
    assert lemma_before_save == "unsad"
    assert token_after_save == token_before_save
    assert lemma_after_save == lemma_before_save
    loaded_pipeline.fit(self.data).transform(self.data).show()
def featureExtract(self, trainDataframe, predictionDataframe):
    pipeline = None
    try:
        pipeline = Pipeline.load(ROOT_PATH + '/pipeline')
    except Exception as e:
        print(e)
        self.logger.error(e)
    if pipeline is None:
        # tokenizer = Tokenizer(inputCol="keywords", outputCol="words")
        remover = StopWordsRemover(inputCol="keywords", outputCol="filtered")
        # set the stop words
        remover.setStopWords(self.cuttingMachine.chineseStopwords())
        hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol="features")
        idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="idff")
        # lr = LogisticRegression(maxIter=10, regParam=0.001)
        pipeline = Pipeline(stages=[remover, hashingTF, idf])
    model = pipeline.fit(trainDataframe)
    pipeline.write().overwrite().save(ROOT_PATH + '/pipeline')
    resultDataframe = model.transform(predictionDataframe)
    resultDataframe.show()
    selected = resultDataframe.select("filtered", "features", "idff")
    for row in selected.collect():
        filtered, features, idff = row
        self.logger.info("features: %s", features)
        self.logger.info("idff: %s", idff)
        self.logger.info("filtered: %s",
                         str(filtered).decode("unicode_escape").encode("utf-8"))
    return selected
def test_grid_gbm_in_spark_pipeline(self):
    prostate_frame = self._spark.read.csv(
        "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"),
        header=True, inferSchema=True)
    algo = H2OGridSearch(predictionCol="AGE",
                         hyperParameters={"_seed": [1, 2, 3]},
                         ratio=0.8,
                         algo=H2OGBM())
    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save(
        "file://" + os.path.abspath("build/grid_gbm_pipeline"))
    loaded_pipeline = Pipeline.load(
        "file://" + os.path.abspath("build/grid_gbm_pipeline"))
    model = loaded_pipeline.fit(prostate_frame)
    model.write().overwrite().save(
        "file://" + os.path.abspath("build/grid_gbm_pipeline_model"))
    loaded_model = PipelineModel.load(
        "file://" + os.path.abspath("build/grid_gbm_pipeline_model"))
    loaded_model.transform(prostate_frame).count()
def trainPipelineModel(idf, hashingTF, stopWordsRemover, tokenizer, algoStage, data):
    ## Remove all helper columns
    colPruner = ColumnPruner(columns=[
        idf.getOutputCol(),
        hashingTF.getOutputCol(),
        stopWordsRemover.getOutputCol(),
        tokenizer.getOutputCol()
    ])

    ## Create the pipeline by defining all the stages
    pipeline = Pipeline(stages=[
        tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner
    ])

    ## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not available, this call stores
    ## the pipeline in a local file in the current directory. If HDFS & Hadoop are available, it stores the pipeline
    ## in the HDFS home directory of the current user. Absolute paths can be used as well. The same holds for the
    ## model import/export below.
    pipelinePath = "file://" + os.path.abspath("../build/pipeline")
    pipeline.write().overwrite().save(pipelinePath)
    loaded_pipeline = Pipeline.load(pipelinePath)

    ## Train the pipeline model
    modelPath = "file://" + os.path.abspath("../build/model")
    model = loaded_pipeline.fit(data)
    model.write().overwrite().save(modelPath)
    return PipelineModel.load(modelPath)
def test_glm_in_spark_pipeline(self):
    prostate_frame = self._spark.read.csv(
        "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"),
        header=True, inferSchema=True)
    algo = H2OGLM(featuresCols=["CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"],
                  labelCol="AGE",
                  seed=1,
                  ratio=0.8)
    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save(
        "file://" + os.path.abspath("build/glm_pipeline"))
    loaded_pipeline = Pipeline.load("file://" + os.path.abspath("build/glm_pipeline"))
    model = loaded_pipeline.fit(prostate_frame)
    model.write().overwrite().save(
        "file://" + os.path.abspath("build/glm_pipeline_model"))
    loaded_model = PipelineModel.load(
        "file://" + os.path.abspath("build/glm_pipeline_model"))
    loaded_model.transform(prostate_frame).count()
def test_grid_gbm_in_spark_pipeline(self):
    prostate_frame = self._spark.read.csv(
        "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"),
        header=True, inferSchema=True)
    algo = H2OGridSearch(labelCol="AGE",
                         hyperParameters={"_seed": [1, 2, 3]},
                         ratio=0.8,
                         algo=H2OGBM(),
                         strategy="RandomDiscrete",
                         maxModels=3,
                         maxRuntimeSecs=60,
                         selectBestModelBy="RMSE")
    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save(
        "file://" + os.path.abspath("build/grid_gbm_pipeline"))
    loaded_pipeline = Pipeline.load(
        "file://" + os.path.abspath("build/grid_gbm_pipeline"))
    model = loaded_pipeline.fit(prostate_frame)
    model.write().overwrite().save(
        "file://" + os.path.abspath("build/grid_gbm_pipeline_model"))
    loaded_model = PipelineModel.load(
        "file://" + os.path.abspath("build/grid_gbm_pipeline_model"))
    loaded_model.transform(prostate_frame).count()
def test_mojo_dai_pipeline_serialize(self):
    mojo = H2OMOJOPipelineModel.createFromMojo(
        "file://" + os.path.abspath("../ml/src/test/resources/mojo2data/pipeline.mojo"))
    prostateFrame = self._spark.read.csv(
        "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"),
        header=True)
    # Create Spark pipeline of a single step - the mojo pipeline
    pipeline = Pipeline(stages=[mojo])
    pipeline.write().overwrite().save(
        "file://" + os.path.abspath("build/test_dai_pipeline_as_spark_pipeline"))
    loadedPipeline = Pipeline.load(
        "file://" + os.path.abspath("build/test_dai_pipeline_as_spark_pipeline"))

    # Train the pipeline model
    model = loadedPipeline.fit(prostateFrame)
    model.write().overwrite().save(
        "file://" + os.path.abspath("build/test_dai_pipeline_as_spark_pipeline_model"))
    loadedModel = PipelineModel.load(
        "file://" + os.path.abspath("build/test_dai_pipeline_as_spark_pipeline_model"))

    preds = loadedModel.transform(prostateFrame).repartition(1).select(
        mojo.selectPredictionUDF("AGE")).take(5)
    assert preds[0][0] == 65.36320409515132
    assert preds[1][0] == 64.96902128114817
    assert preds[2][0] == 64.96721023747583
    assert preds[3][0] == 65.78772654671035
    assert preds[4][0] == 66.11327967814829
def test_h2o_mojo_model_serialization_in_pipeline(self):
    mojo = H2OMOJOModel.createFromMojo(
        "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))
    prostate_frame = self._spark.read.csv(
        "file://" + unit_test_utils.locate("smalldata/prostate/prostate.csv"),
        header=True)
    pipeline = Pipeline(stages=[mojo])
    pipeline.write().overwrite().save(
        "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo"))
    loaded_pipeline = Pipeline.load(
        "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo"))
    model = loaded_pipeline.fit(prostate_frame)
    model.write().overwrite().save(
        "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo_model"))
    PipelineModel.load(
        "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo_model"))
def process(spark, train_data, test_data):
    df_train = spark.read.parquet(train_data)
    df_test = spark.read.parquet(test_data)

    features = VectorAssembler(inputCols=df_train.columns[1:-1], outputCol='features')
    evaluator = RegressionEvaluator(labelCol='ctr', predictionCol='prediction', metricName='rmse')

    lr_model_base = LinearRegression(labelCol='ctr', **LR_PARAMS_BASE)
    lr_model_to_tune = LinearRegression(labelCol='ctr')
    lr_param_grid = ParamGridBuilder() \
        .addGrid(lr_model_to_tune.maxIter, [5, 10, 20, 40, 50]) \
        .addGrid(lr_model_to_tune.regParam, [0.4, 0.1, 0.01, 0.001]) \
        .addGrid(lr_model_to_tune.fitIntercept, [False, True]) \
        .addGrid(lr_model_to_tune.elasticNetParam, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]) \
        .build()
    tvs = TrainValidationSplit(estimator=lr_model_to_tune,
                               estimatorParamMaps=lr_param_grid,
                               evaluator=evaluator,
                               trainRatio=0.8)

    pipeline_model_base = Pipeline(stages=[features, lr_model_base]).fit(df_train)
    prediction_base = pipeline_model_base.transform(df_test)
    rmse_base = evaluator.evaluate(prediction_base)
    print(f'Base lr model params: {LR_PARAMS_BASE}')
    print(f'RMSE at base lr model = {rmse_base}')

    print('Tuning lr model...')
    pipeline_model_tuned = Pipeline(stages=[features, tvs]).fit(df_train)
    prediction_tuned = pipeline_model_tuned.transform(df_test)
    rmse_tuned = evaluator.evaluate(prediction_tuned)
    model_java_obj = pipeline_model_tuned.stages[-1].bestModel._java_obj
    lr_params_tuned = {
        'maxIter': model_java_obj.getMaxIter(),
        'regParam': model_java_obj.getRegParam(),
        'elasticNetParam': model_java_obj.getElasticNetParam(),
        'fitIntercept': model_java_obj.getFitIntercept()
    }
    print(f'Tuned lr model params: {lr_params_tuned}')
    print(f'RMSE at tuned lr model = {rmse_tuned}')

    if rmse_tuned < rmse_base:
        pipeline_model_tuned.write().overwrite().save(MODEL_PATH)
        print('Tuned model has better RMSE value')
    else:
        pipeline_model_base.write().overwrite().save(MODEL_PATH)
        print('Base model has better RMSE value')
    print(f'Model saved at "{MODEL_PATH}"')

    spark.stop()
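# Hedged usage sketch (not part of the original source): the winning pipeline written to
# MODEL_PATH above is a fitted PipelineModel, so a scoring job could reload it like this;
# `new_data` is a hypothetical parquet path with the same feature columns as the training set.
from pyspark.ml import PipelineModel

def score(spark, new_data):
    df_new = spark.read.parquet(new_data)
    model = PipelineModel.load(MODEL_PATH)
    return model.transform(df_new)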
class KMeans:
    def __init__(self, args, args2):
        """
        Standalone version for initializing KMeans clustering
        @param args: dict
            K: int
            init: string, one of "k-means++" and "random"
            n_init: int
            max_iter: int
            tol: float
        """
        # init logging
        self.logger = logging.getLogger(self.__class__.__name__)

        # init parameters
        self.outputUrl1 = args["output"][0]["value"]
        self.param = args["param"]
        self.model = None

        self.logger.info("initializing SparkSession")
        # init SparkSession
        self.spark = utils.init_spark()

    def getIn(self):
        return

    def execute(self):
        from pyspark.ml.clustering import KMeans
        from pyspark.ml import Pipeline

        # number of cluster centers
        k = int(self.param["K"])
        # initialization method; map the sklearn-style name to Spark's initMode
        init = self.param["init"]
        if init == "k-means++":
            init = "k-means||"
        # number of initialization runs
        n_init = int(self.param["n_init"])
        # maximum number of iterations per run
        max_iter = int(self.param["max_iter"])
        # convergence tolerance
        tol = float(self.param["tol"])

        # initialize the model as a Pipeline so that models can be loaded through a uniform interface
        self.logger.info("initializing model")
        self.model = Pipeline(stages=[
            KMeans(k=k, initMode=init, initSteps=n_init, maxIter=max_iter, tol=tol)
        ])

    def setOut(self):
        self.logger.info("saving model to %s" % self.outputUrl1)
        self.model.write().overwrite().save(self.outputUrl1)
class RandomForest:
    def __init__(self, args, args2):
        """
        Spark version for initializing RandomForest multi-class classifier
        @param args: dict
            n_estimators: int
            criterion: string, one of "gini" and "entropy"
            max_depth: int
            min_samples_split: int
            min_samples_leaf: int
        """
        self.logger = logging.getLogger(self.__class__.__name__)

        self.outputUrl1 = args["output"][0]["value"]
        self.param = args["param"]
        self.model = None
        self.dataUtil = utils.dataUtil(args2)

        self.logger.info("initializing SparkSession")
        self.spark = utils.init_spark()

    def getIn(self):
        return

    def execute(self):
        from pyspark.ml.classification import RandomForestClassifier
        from pyspark.ml import Pipeline

        # number of trees
        n_estimators = int(self.param["treeNum"])
        # split criterion
        criterion = self.param["criterion"]
        # maximum tree depth
        max_depth = int(self.param["maxDepth"])
        # minimum number of samples required to split a node
        min_samples_split = int(self.param["minSamplesSplit"])
        # minimum number of samples per leaf node
        min_samples_leaf = int(self.param["minSamplesLeaf"])

        # initialize the model as a Pipeline so that models can be loaded through a uniform interface
        self.logger.info("initializing model")
        self.model = Pipeline(stages=[
            RandomForestClassifier(numTrees=n_estimators,
                                   impurity=criterion,
                                   maxDepth=max_depth,
                                   minInstancesPerNode=min_samples_leaf)
        ])

    def setOut(self):
        self.logger.info("Writing model to %s" % self.outputUrl1)
        self.model.write().overwrite().save(self.outputUrl1)
def gridSearchTester(algo, prostateDataset):
    grid = H2OGridSearch(labelCol="AGE",
                         hyperParameters={"seed": [1, 2, 3]},
                         splitRatio=0.8,
                         algo=algo,
                         strategy="RandomDiscrete",
                         maxModels=3,
                         maxRuntimeSecs=60,
                         selectBestModelBy="RMSE")
    pipeline = Pipeline(stages=[grid])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/grid_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/grid_pipeline"))
    model = loadedPipeline.fit(prostateDataset)
    model.write().overwrite().save("file://" + os.path.abspath("build/grid_pipeline_model"))
    loadedModel = PipelineModel.load("file://" + os.path.abspath("build/grid_pipeline_model"))
    loadedModel.transform(prostateDataset).count()
def testPipelineSerialization(prostateDataset):
    algo = H2OIsolationForest(seed=1)
    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/isolation_forest_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/isolation_forest_pipeline"))
    model = loadedPipeline.fit(prostateDataset)
    expected = model.transform(prostateDataset)
    model.write().overwrite().save("file://" + os.path.abspath("build/isolation_forest_pipeline_model"))
    loadedModel = PipelineModel.load("file://" + os.path.abspath("build/isolation_forest_pipeline_model"))
    result = loadedModel.transform(prostateDataset)
    unit_test_utils.assert_data_frames_are_identical(expected, result)
def testPipelineSerialization(prostateDataset):
    algo = H2ODRF(featuresCols=["CAPSULE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"],
                  labelCol="AGE",
                  seed=1,
                  splitRatio=0.8)
    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/drf_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/drf_pipeline"))
    model = loadedPipeline.fit(prostateDataset)
    model.write().overwrite().save("file://" + os.path.abspath("build/drf_pipeline_model"))
    loadedModel = PipelineModel.load("file://" + os.path.abspath("build/drf_pipeline_model"))
    loadedModel.transform(prostateDataset).count()
def create_random_pipeline():
    print("Creating Data pipeline for regressor")
    assembler = VectorAssembler(inputCols=[
        "Year", "Engine HP", "Engine Cylinders", "Number of Doors",
        "highway MPG", "city mpg", "Popularity"
    ], outputCol="Attributes")
    regressor = RandomForestRegressor(featuresCol="Attributes", labelCol="MSRP")
    pipeline = Pipeline(stages=[assembler, regressor])
    pipelineStr = "pipeline"
    pipeline.write().overwrite().save(pipelineStr)  # Save pipeline
    return (pipelineStr, regressor)
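# Hedged usage sketch (not part of the original source): create_random_pipeline() saves an
# untrained Pipeline under the relative path "pipeline" and returns that path, so a caller
# could reload and fit it like this; `train_df` is a hypothetical DataFrame containing the
# assembler input columns plus "MSRP".
from pyspark.ml import Pipeline

pipeline_path, _ = create_random_pipeline()
loaded_pipeline = Pipeline.load(pipeline_path)
# model = loaded_pipeline.fit(train_df)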
def train_model(dataFrame, k_value, w2v_value, seed=2137):
    """Train and save model"""
    tokenizer = Tokenizer(inputCol="text", outputCol="words_raw")
    remover = StopWordsRemover(inputCol="words_raw", outputCol="words")
    word2Vec = Word2Vec(vectorSize=w2v_value, seed=seed,
                        inputCol="words", outputCol="features_unnormalized")
    scaler = StandardScaler(inputCol="features_unnormalized", outputCol="features",
                            withStd=True, withMean=True)
    kmeans = KMeans(k=k_value, seed=seed)
    pipeline = Pipeline(stages=[tokenizer, remover, word2Vec, scaler, kmeans])
    pipeline = pipeline.fit(dataFrame)
    pipeline.write().overwrite().save("hdfs:///models/model")
    return pipeline
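# Hedged usage sketch (not part of the original source): train_model() returns a fitted
# PipelineModel, so the artifact saved at hdfs:///models/model can later be reloaded and
# applied to a DataFrame with a "text" column (new_documents_df below is hypothetical).
from pyspark.ml import PipelineModel

def load_trained_model():
    return PipelineModel.load("hdfs:///models/model")

# clusters_df = load_trained_model().transform(new_documents_df)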
def main(argv):
    spark = SparkSession.builder \
        .master("local[*]") \
        .config("spark.driver.memory", "4g") \
        .config("spark.executor.memory", "1g") \
        .getOrCreate()

    features_df = ParquetDataFrame(f'data/processed/{Phase.train.name}/features', spark)

    test_data_frac = 0.1
    test_features_df, train_features_df = features_df.randomSplit(
        [test_data_frac, 1 - test_data_frac])

    label_col = 'duration_min'
    model = Pipeline(stages=[
        StringIndexer(inputCol='pickup_cell_6', handleInvalid='keep',
                      outputCol='pickup_cell_6_idx'),
        StringIndexer(inputCol='dropoff_cell_6', handleInvalid='keep',
                      outputCol='dropoff_cell_6_idx'),
        VectorAssembler(inputCols=[
            'pickup_cell_6_idx', 'dropoff_cell_6_idx', 'distance', 'month',
            'day_of_month', 'day_of_week', 'hour', 'requests_pickup_cell',
            'requests_dropoff_cell'
        ], outputCol="features"),
        DecisionTreeRegressor(maxDepth=7, featuresCol='features', labelCol=label_col)
    ]).fit(train_features_df)

    model_path = 'model/trip_duration_min'
    print(f'Saving model to {model_path}')
    model.write().overwrite().save(model_path)
    print('Model saved...')

    model = PipelineModel.load(model_path)
    predictions_df = model.transform(test_features_df)
    mae_cv = RegressionEvaluator(labelCol=label_col,
                                 metricName='mae').evaluate(predictions_df)
    print(f'Mean absolute error: {mae_cv}')

    spark.stop()
def testPipelineSerialization(dataset):
    algo = H2OKMeans(splitRatio=0.8,
                     seed=1,
                     k=3,
                     featuresCols=["sepal_len", "sepal_wid", "petal_len", "petal_wid"])
    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/kmeans_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/kmeans_pipeline"))
    model = loadedPipeline.fit(dataset)
    model.write().overwrite().save(
        "file://" + os.path.abspath("build/kmeans_pipeline_model"))
    loadedModel = PipelineModel.load(
        "file://" + os.path.abspath("build/kmeans_pipeline_model"))
    loadedModel.transform(dataset).count()
def testMojoModelSerializationInPipeline(prostateDataset):
    mojo = H2OMOJOModel.createFromMojo(
        "file://" + os.path.abspath("../ml/src/test/resources/binom_model_prostate.mojo"))
    pipeline = Pipeline(stages=[mojo])
    pipeline.write().overwrite().save(
        "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo"))
    loadedPipeline = Pipeline.load(
        "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo"))
    model = loadedPipeline.fit(prostateDataset)
    model.write().overwrite().save(
        "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo_model"))
    PipelineModel.load(
        "file://" + os.path.abspath("build/test_spark_pipeline_model_mojo_model"))
def testPipelineSerialization(heartDataset):
    features = ['age', 'year', 'surgery', 'transplant', 'start', 'stop']
    algo = H2OCoxPH(labelCol="event",
                    featuresCols=features,
                    startCol='start',
                    stopCol='stop')
    pipeline = Pipeline(stages=[algo])
    pipeline.write().overwrite().save("file://" + os.path.abspath("build/cox_ph_pipeline"))
    loadedPipeline = Pipeline.load("file://" + os.path.abspath("build/cox_ph_pipeline"))
    model = loadedPipeline.fit(heartDataset)
    expected = model.transform(heartDataset)
    model.write().overwrite().save("file://" + os.path.abspath("build/cox_ph_pipeline"))
    loadedModel = PipelineModel.load("file://" + os.path.abspath("build/cox_ph_pipeline"))
    result = loadedModel.transform(heartDataset)
    unit_test_utils.assert_data_frames_are_identical(expected, result)
mlSourceDF.printSchema()
mlSourceDF = mlSourceDF.fillna(0, subset=[x for x in mlSourceDF.columns if 'Lag' in x])

# After creating all lag features, we can drop rows with NAs in the key columns
# (drop NAs to avoid errors in StringIndexer)
mlSourceDF = mlSourceDF.na.drop(subset=["ServerIP", "SessionStartHourTime"])

# Indexing
columnsForIndex = ['dayofweek', 'ServerIP', 'year', 'month', 'weekofyear', 'dayofmonth',
                   'hourofday', 'Holiday', 'BusinessHour', 'Morning']
mlSourceDF = mlSourceDF.fillna(0, subset=columnsForIndex)

sIndexers = [StringIndexer(inputCol=x, outputCol=x + '_indexed').setHandleInvalid("skip")
             for x in columnsForIndex]
indexModel = Pipeline(stages=sIndexers).fit(mlSourceDF)
mlSourceDF = indexModel.transform(mlSourceDF)

# Save the model for operationalization
indexModel.write().overwrite().save(stringIndexModelFile)

# Encoding for categorical features
catVarNames = [x + '_indexed' for x in columnsForIndex]

columnOnlyIndexed = [catVarNames[i] for i in range(len(catVarNames))
                     if len(indexModel.stages[i].labels) < 2]
columnForEncode = [catVarNames[i] for i in range(len(catVarNames))
                   if len(indexModel.stages[i].labels) >= 2]

info['columnOnlyIndexed'] = columnOnlyIndexed
info['columnForEncode'] = columnForEncode

# Save info to blob storage
write_blob(info, infoFile, storageContainer, storageAccount, storageKey)

ohEncoders = [OneHotEncoder(inputCol=x, outputCol=x + '_encoded')
              for x in columnForEncode]
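# Hedged continuation sketch (not part of the original excerpt): one way the one-hot encoders
# defined above could be applied, mirroring how the string indexers were fitted and used earlier.
ohPipelineModel = Pipeline(stages=ohEncoders).fit(mlSourceDF)
mlSourceDF = ohPipelineModel.transform(mlSourceDF)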
# Print out the predicted Play Type, Actual Play Type, and the vector of indexed features
predictions.select("predictedLabel", "play_type", "indexedFeatures").show(5)

# Determine the accuracy of the model
# Can specify other evaluation metrics
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                              predictionCol="prediction",
                                              metricName="accuracy")

# Calculate the test error
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

rfModel = model.stages[2]
print(rfModel)

# COMMAND ----------

predictions.select("indexedLabel", "prediction", "predictedLabel").show(5)

# COMMAND ----------

rfPipeline.write().overwrite().save("nfl-data/pipelines")

# COMMAND ----------

rfModel.write().overwrite().save("/nfl-data/models")

# COMMAND ----------

# MAGIC %fs
# MAGIC ls /nfl-data/pipelines/stages
## Remove all helper columns
colPruner = ColumnPruner(columns=[
    idf.getOutputCol(),
    hashingTF.getOutputCol(),
    stopWordsRemover.getOutputCol(),
    tokenizer.getOutputCol()
])

## Create the pipeline by defining all the stages
pipeline = Pipeline(
    stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner])

## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not available, this call stores
## the pipeline in a local file in the current directory. If HDFS & Hadoop are available, it stores the pipeline
## in the HDFS home directory of the current user. Absolute paths can be used as well. The same holds for the
## model import/export below.
pipeline.write().overwrite().save("examples/build/pipeline")
loaded_pipeline = Pipeline.load("examples/build/pipeline")

## Train the pipeline model
data = load()
model = loaded_pipeline.fit(data)

model.write().overwrite().save("examples/build/model")
loaded_model = PipelineModel.load("examples/build/model")

##
## Make predictions on unlabeled data
## Spam detector
##
def isSpam(smsText, model, hamThreshold=0.5):
pipeline = Pipeline(stages=[stringIndexer, vecAssembler])

# COMMAND ----------

# MAGIC %md
# MAGIC ## Scala
# MAGIC
# MAGIC Distributed XGBoost with Spark only has a Scala API, so we are going to create views of our DataFrames to use in Scala, as well as save our (untrained) pipeline to load into Scala.

# COMMAND ----------

trainDF.createOrReplaceTempView("trainDF")
testDF.createOrReplaceTempView("testDF")

fileName = "/tmp/xgboost_feature_pipeline"
pipeline.write().overwrite().save(fileName)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Load Data/Pipeline in Scala
# MAGIC
# MAGIC This section is only available in Scala because there is no distributed Python API for XGBoost in Spark yet.
# MAGIC
# MAGIC Let's load in our data/pipeline that we defined in Python.

# COMMAND ----------

# MAGIC %scala
# MAGIC import org.apache.spark.ml.Pipeline
# MAGIC
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.ml.feature import Binarizer

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("BinarizerExample")\
        .getOrCreate()

    # Binarizer expects a DoubleType input column, so build the single-row frame from a float
    continuousDataFrame = spark.createDataFrame([(4.0,)], ["feature"])

    binarizer = Binarizer(threshold=5, inputCol="feature", outputCol="binarized_feature")
    pipeline = Pipeline(stages=[binarizer])
    pipeline = pipeline.fit(continuousDataFrame)
    pipeline.write().overwrite().save("binarizer")
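    # Hedged follow-up sketch (not part of the original example): the object saved above is a
    # fitted PipelineModel, so it can be reloaded from the relative path "binarizer" and
    # applied to the same DataFrame.
    from pyspark.ml import PipelineModel
    loaded_binarizer = PipelineModel.load("binarizer")
    loaded_binarizer.transform(continuousDataFrame).show()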
test_model.take(1)


# In[13]:

import pyspark.ml.evaluation as ev

evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol='probability',
                                             labelCol='INFANT_ALIVE_AT_REPORT')

print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(test_model, {evaluator.metricName: 'areaUnderPR'}))


# In[14]:

pipelinePath = './model/infant_oneHotEncoder_Logistic_Pipeline'
pipeline.write().overwrite().save(pipelinePath)


# In[15]:

loadedPipeline = Pipeline.load(pipelinePath)
loadedPipeline.fit(births_train).transform(births_test).take(1)


# In[16]:

from pyspark.ml import PipelineModel

modelPath = './model/infant_oneHotEncoder_Logistic_PipelineModel'
model.write().overwrite().save(modelPath)

loadedPipelineModel = PipelineModel.load(modelPath)
test_loadedModel = loadedPipelineModel.transform(births_test)
                           predictionCol="label")
elif algo == "xgboost":
    ## Create H2OXGBoost model
    algoStage = H2OXGBoost(convertUnknownCategoricalLevelsToNa=True,
                           featuresCols=[idf.getOutputCol()],
                           predictionCol="label")

## Remove all helper columns
colPruner = ColumnPruner(columns=[idf.getOutputCol(), hashingTF.getOutputCol(),
                                  stopWordsRemover.getOutputCol(), tokenizer.getOutputCol()])

## Create the pipeline by defining all the stages
pipeline = Pipeline(stages=[tokenizer, stopWordsRemover, hashingTF, idf, algoStage, colPruner])

## Test exporting and importing the pipeline. On systems where HDFS & Hadoop are not available, this call stores
## the pipeline in a local file in the current directory. If HDFS & Hadoop are available, it stores the pipeline
## in the HDFS home directory of the current user. Absolute paths can be used as well. The same holds for the
## model import/export below.
pipeline.write().overwrite().save("examples/build/pipeline")
loaded_pipeline = Pipeline.load("examples/build/pipeline")

## Train the pipeline model
data = load()
model = loaded_pipeline.fit(data)

model.write().overwrite().save("examples/build/model")
loaded_model = PipelineModel.load("examples/build/model")

##
## Make predictions on unlabeled data
## Spam detector
assembler = VectorAssembler(inputCols=selected, outputCol="features")

# Specify the model:
from pyspark.ml.classification import DecisionTreeClassifier
classifier = DecisionTreeClassifier(featuresCol="features", labelCol="five_star_rating")

# Specify the pipeline:
from pyspark.ml import Pipeline
stages = [filterer, converter, binarizer, extractor, assembler, classifier]
pipeline = Pipeline(stages=stages)


# ## Save and load the machine learning pipeline

# Save the `Pipeline` instance to HDFS:
pipeline.write().overwrite().save("models/pipeline")

# If we do not want to overwrite it:
#```python
#pipeline.save("models/pipeline")
#```

# Read the pipeline back from HDFS:
pipeline_loaded = Pipeline.read().load("models/pipeline")

# This other method can also be used:
#```python
#pipeline_loaded = Pipeline.load("models/pipeline")
#```


# ## Train the model