from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import RegexTokenizer, Lemmatizer


def runTest(self):
    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")
    tokenizer = RegexTokenizer() \
        .setInputCols(["document"]) \
        .setOutputCol("token")
    lemmatizer = Lemmatizer() \
        .setInputCols(["token"]) \
        .setOutputCol("lemma") \
        .setDictionary({"sad": "unsad"})
    finisher = Finisher() \
        .setInputCols(["token", "lemma"]) \
        .setOutputCols(["token_views", "lemma_views"])
    pipeline = Pipeline(
        stages=[document_assembler, tokenizer, lemmatizer, finisher])
    model = pipeline.fit(self.data)
    # Finisher output is an '@'-separated annotation string; take the third item.
    token_before_save = model.transform(self.data).select(
        "token_views").take(1)[0].token_views.split("@")[2]
    lemma_before_save = model.transform(self.data).select(
        "lemma_views").take(1)[0].lemma_views.split("@")[2]
    # Round-trip the (unfitted) pipeline through disk.
    pipe_path = "./tmp_pipeline"
    pipeline.write().overwrite().save(pipe_path)
    loaded_pipeline = Pipeline.read().load(pipe_path)
    # The already-fitted model must keep producing the same output after the save.
    token_after_save = model.transform(self.data).select(
        "token_views").take(1)[0].token_views.split("@")[2]
    lemma_after_save = model.transform(self.data).select(
        "lemma_views").take(1)[0].lemma_views.split("@")[2]
    print(token_before_save)
    assert token_before_save == "sad"
    assert lemma_before_save == "unsad"
    assert token_after_save == token_before_save
    assert lemma_after_save == lemma_before_save
    loaded_pipeline.fit(self.data).transform(self.data).show()
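    # A follow-on sketch (not part of the original test): the *fitted*
    # PipelineModel round-trips through disk the same way, via the standard
    # Spark ML writer/reader API. The path below is a hypothetical choice.
    from pyspark.ml import PipelineModel
    model_path = "./tmp_pipeline_model"
    model.write().overwrite().save(model_path)
    loaded_model = PipelineModel.read().load(model_path)
    # The loaded model transforms directly; no refitting is needed.
    loaded_model.transform(self.data).show()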
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.types import DoubleType
from pyspark import SparkFiles

url = "https://s3-us-west-2.amazonaws.com/mlapi-samples/demo/data/input/iris.csv"
spark.sparkContext.addFile(url)

# Load and parse the data file, converting it to a DataFrame.
data = spark.read.csv(SparkFiles.get("iris.csv"), header=True)
data = data.withColumn("sepal_length", data["sepal_length"].cast(DoubleType()))
data = data.withColumn("sepal_width", data["sepal_width"].cast(DoubleType()))
data = data.withColumn("petal_width", data["petal_width"].cast(DoubleType()))
data = data.withColumn("petal_length", data["petal_length"].cast(DoubleType()))

# Load the saved (unfitted) pipeline and the fitted model.
pipeline = Pipeline.read().load("classification-pipeline")
model = PipelineModel.read().load("classification-model")

# Make predictions.
predictions = model.transform(data)

# Select example rows to display.
predictions.select("predictedLabel", "species", "features").show(5)

# Select (prediction, true label) and compute test error.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
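# A small extension sketch (an assumption, not in the original script): the
# same evaluator supports other metrics by changing metricName, e.g. weighted F1.
evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="f1")
print("F1 = %g" % evaluator_f1.evaluate(predictions))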
from pyspark.ml import Pipeline

stages = [filterer, converter, binarizer, extractor, assembler, classifier]
pipeline = Pipeline(stages=stages)


# ## Save and load the machine learning pipeline

# Save the `Pipeline` instance to HDFS:
pipeline.write().overwrite().save("models/pipeline")

# If we do not want to overwrite it:
#```python
#pipeline.save("models/pipeline")
#```

# Read the pipeline back from HDFS:
pipeline_loaded = Pipeline.read().load("models/pipeline")

# This other method can also be used:
#```python
#pipeline_loaded = Pipeline.load("models/pipeline")
#```


# ## Train the model

pipeline_model = pipeline.fit(rides)


# ## Save the model

# Save the pipeline model to HDFS:
pipeline_model.write().overwrite().save("models/pipeline_model")
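
# A hedged sketch of the matching load step (assumes the same HDFS path and
# that `rides` is still in scope): a fitted model is read back with
# `PipelineModel`, not `Pipeline`, and can score data without refitting.
from pyspark.ml import PipelineModel

pipeline_model_loaded = PipelineModel.read().load("models/pipeline_model")
predictions = pipeline_model_loaded.transform(rides)
predictions.show(5)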