Example #1
 def runTest(self):
     document_assembler = DocumentAssembler() \
         .setInputCol("text") \
         .setOutputCol("document")
     tokenizer = RegexTokenizer() \
         .setInputCols(["document"]) \
         .setOutputCol("token")
     lemmatizer = Lemmatizer() \
         .setInputCols(["token"]) \
         .setOutputCol("lemma") \
         .setDictionary({"sad": "unsad"})
     finisher = Finisher() \
         .setInputCols(["token", "lemma"]) \
         .setOutputCols(["token_views", "lemma_views"])
     pipeline = Pipeline(
         stages=[document_assembler, tokenizer, lemmatizer, finisher])
     model = pipeline.fit(self.data)
     token_before_save = model.transform(self.data).select(
         "token_views").take(1)[0].token_views.split("@")[2]
     lemma_before_save = model.transform(self.data).select(
         "lemma_views").take(1)[0].lemma_views.split("@")[2]
     pipe_path = "./tmp_pipeline"
     pipeline.write().overwrite().save(pipe_path)
     loaded_pipeline = Pipeline.read().load(pipe_path)
     token_after_save = model.transform(self.data).select(
         "token_views").take(1)[0].token_views.split("@")[2]
     lemma_after_save = model.transform(self.data).select(
         "lemma_views").take(1)[0].lemma_views.split("@")[2]
     print(token_before_save)
     assert token_before_save == "sad"
     assert lemma_before_save == "unsad"
     assert token_after_save == token_before_save
     assert lemma_after_save == lemma_before_save
     loaded_pipeline.fit(self.data).transform(self.data).show()
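The test fixture behind `self.data` is not shown above. Below is a minimal sketch of what it might look like, assuming a local SparkSession (with the spark-nlp jar available) and a one-row DataFrame whose third token is "sad", so the assertions can pass; the class name and sample sentence are illustrative only.

import unittest
from pyspark.sql import SparkSession

class PipelineTestFixture(unittest.TestCase):
    def setUp(self):
        # Illustrative local session; in practice the spark-nlp jar must be on the
        # classpath (e.g. started via sparknlp.start() or spark-submit --packages).
        self.spark = SparkSession.builder \
            .master("local[*]") \
            .appName("pipeline-save-load-test") \
            .getOrCreate()
        # One row whose third token is "sad", matching the assertions in runTest.
        self.data = self.spark.createDataFrame([("I am sad today",)], ["text"])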
Example #2
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType

from pyspark import SparkFiles

# Assumes an active SparkSession; getOrCreate() reuses one if it already exists.
spark = SparkSession.builder.getOrCreate()

url = "https://s3-us-west-2.amazonaws.com/mlapi-samples/demo/data/input/iris.csv"
spark.sparkContext.addFile(url)

# Load and parse the data file, converting it to a DataFrame.
data = spark.read.csv(SparkFiles.get("iris.csv"), header=True)

data = data.withColumn("sepal_length", data["sepal_length"].cast(DoubleType()))
data = data.withColumn("sepal_width", data["sepal_width"].cast(DoubleType()))
data = data.withColumn("petal_width", data["petal_width"].cast(DoubleType()))
data = data.withColumn("petal_length", data["petal_length"].cast(DoubleType()))

# Load the unfitted Pipeline (not used below) and the fitted PipelineModel.
pipeline = Pipeline.read().load("classification-pipeline")
model = PipelineModel.read().load("classification-model")

# Make predictions.
predictions = model.transform(data)

# Select example rows to display.
predictions.select("predictedLabel", "species", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
Example #3
from pyspark.ml import Pipeline

# The stages (filterer, converter, binarizer, extractor, assembler, classifier)
# are assumed to have been defined earlier in the script.
stages = [filterer, converter, binarizer, extractor, assembler, classifier]
pipeline = Pipeline(stages=stages)

# ## Save and load the machine learning pipeline

# Save the `Pipeline` instance to HDFS:
pipeline.write().overwrite().save("models/pipeline")

# If we do not want to overwrite it:
#```python
#pipeline.save("models/pipeline")
#```

# Read the pipeline back from HDFS:
pipeline_loaded = Pipeline.read().load("models/pipeline")

# This alternative method can also be used:
#```python
#pipeline_loaded = Pipeline.load("models/pipeline")
#```

# ## Train the model

pipeline_model = pipeline.fit(rides)

# ## Save the model

# Save the pipeline model to HDFS:
pipeline_model.write().overwrite().save("models/pipeline_model")
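
# ## Load the model and score new data

# To close the loop, the saved model can later be restored and applied to new
# data. This is an illustrative sketch; `rides_new` stands for a new DataFrame
# with the same schema as `rides` and is not defined in the original script.
from pyspark.ml import PipelineModel

pipeline_model_loaded = PipelineModel.read().load("models/pipeline_model")

# Equivalent shorthand:
#```python
#pipeline_model_loaded = PipelineModel.load("models/pipeline_model")
#```

predictions = pipeline_model_loaded.transform(rides_new)
predictions.show(5)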