def main(business_id_arg):
    concat_list = udf(lambda lst: ", ".join(lst), types.StringType())
    reviews_df = spark.read.format("org.apache.spark.sql.cassandra") \
        .options(table=TABLE_REVIEW, keyspace=KEY_SPACE) \
        .load()
    review_filter = reviews_df.filter(reviews_df.business_id == business_id_arg)
    review_concatenate = review_filter.groupby('business_id').agg(collect_list('review').alias("review"))
    review_concatenate.show()
    train_fin = review_concatenate.withColumn("review", concat_list("review"))
    train_fin = train_fin.withColumn("review", functions.regexp_replace(train_fin.review, "[^0-9A-Za-z ,]", ""))

    # Create a pipeline with a Tokenizer and a Lemmatizer
    documentAssembler = DocumentAssembler().setInputCol("review").setOutputCol("document")
    tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
    lemmatizer = Lemmatizer().setInputCols(["token"]).setOutputCol("lemma") \
        .setDictionary("lemmas001.txt", key_delimiter=" ", value_delimiter="\t")
    pipeline = Pipeline(stages=[documentAssembler, tokenizer, lemmatizer])
    pipelineFit = pipeline.fit(train_fin)
    train_df = pipelineFit.transform(train_fin)
    train_df.select('lemma').show(truncate=False)

    price_range_udf = functions.UserDefinedFunction(lambda attributes: get_attributes(attributes), types.StringType())
    train_df = train_df.withColumn('lemma', price_range_udf(train_df['lemma']))
    train_df = train_df.withColumn('lemma', functions.split(train_df['lemma'], ",").cast('array<string>'))

    # Create a second pipeline to remove the stop words
    test_review = train_df.select("lemma")
    stop_words_remover = StopWordsRemover(inputCol="lemma", outputCol="filtered")
    hash_tf = HashingTF(numFeatures=2 ** 16, inputCol="lemma", outputCol='tf')
    pipeline_to_remove_stop_words = Pipeline(stages=[hash_tf, stop_words_remover])
    pipeline_fit = pipeline_to_remove_stop_words.fit(train_df)
    test_df = pipeline_fit.transform(test_review)
    test_df.show()

    # Build a word cloud from the filtered tokens
    token_array = test_df.select('filtered').rdd.flatMap(lambda row: row).collect()
    counts = Counter(token_array[0])
    word_cloud = WordCloud(
        background_color='white',
        max_words=100,
        max_font_size=50,
        min_font_size=10,
        random_state=40
    ).fit_words(counts)
    plt.imshow(word_cloud)
    plt.axis('off')  # remove axis
    plt.show()
def main():
    data = spark.range(100000)
    data = data.select(
        (functions.rand() * 100).alias('length'),
        (functions.rand() * 100).alias('width'),
        (functions.rand() * 100).alias('height'),
    )
    data = data.withColumn('volume', data['length'] * data['width'] * data['height'])

    training, validation = data.randomSplit([0.75, 0.25], seed=42)

    assemble_features = VectorAssembler(
        inputCols=['length', 'width', 'height'],
        outputCol='features')
    classifier = GBTRegressor(
        featuresCol='features', labelCol='volume')
    pipeline = Pipeline(stages=[assemble_features, classifier])

    model = pipeline.fit(training)
    predictions = model.transform(validation)
    predictions.show()

    r2_evaluator = RegressionEvaluator(
        predictionCol='prediction', labelCol='volume',
        metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    print(r2)
def test_pipeline(self):
    dataset = MockDataset()
    estimator0 = MockEstimator()
    transformer1 = MockTransformer()
    estimator2 = MockEstimator()
    transformer3 = MockTransformer()
    pipeline = Pipeline() \
        .setStages([estimator0, transformer1, estimator2, transformer3])
    pipeline_model = pipeline.fit(dataset, {
        estimator0.fake: 0,
        transformer1.fake: 1
    })
    self.assertEqual(0, estimator0.dataset_index)
    self.assertEqual(0, estimator0.fake_param_value)
    model0 = estimator0.model
    self.assertEqual(0, model0.dataset_index)
    self.assertEqual(1, transformer1.dataset_index)
    self.assertEqual(1, transformer1.fake_param_value)
    self.assertEqual(2, estimator2.dataset_index)
    model2 = estimator2.model
    self.assertIsNone(
        model2.dataset_index,
        "The model produced by the last estimator should "
        "not be called during fit.")
    dataset = pipeline_model.transform(dataset)
    self.assertEqual(2, model0.dataset_index)
    self.assertEqual(3, transformer1.dataset_index)
    self.assertEqual(4, model2.dataset_index)
    self.assertEqual(5, transformer3.dataset_index)
    self.assertEqual(6, dataset.index)
def pipeline_dataframe(self, stages, dataframe):
    print(stages)
    dataframe.printSchema()
    pipeline = Pipeline(stages=stages)
    pipelineModel = pipeline.fit(dataframe)
    model = pipelineModel.transform(dataframe)
    return model
def fit(self, sdf):
    """Fits the weighting pipeline to the input Spark DataFrame.

    :param sdf: Spark DataFrame containing the input columns
    :return: self, with the fitted PipelineModel stored in ``self.model``
    """
    if self.weighter is None:
        raise NotImplementedError(
            "The weighter parameter has not been defined.")
    weights_arr = self.weighter.get_feature_importances(sdf)
    pipeline_lst = [
        VectorAssembler(inputCols=self.input_cols, outputCol="vec"),
        StandardScaler(inputCol="vec", outputCol="standard_vec"),
        ElementwiseProduct(scalingVec=weights_arr,
                           inputCol='standard_vec', outputCol='scaled_vec')
    ]
    _model = Pipeline(stages=pipeline_lst)
    model = _model.fit(sdf)
    self.model = model
    return self
def model_train(input, model_path):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    data = spark.read.csv(input, schema=tmax_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    sql_query = """SELECT today.latitude, today.longitude, today.elevation,
                          dayofyear(today.date) AS dy,
                          yesterday.tmax AS yesterday_tmax, today.tmax
                   FROM __THIS__ AS today
                   INNER JOIN __THIS__ AS yesterday
                     ON date_sub(today.date, 1) = yesterday.date
                    AND today.station = yesterday.station"""
    transformer = SQLTransformer(statement=sql_query)
    assemble_features = VectorAssembler(
        inputCols=['latitude', 'longitude', 'elevation', 'dy', 'yesterday_tmax'],
        outputCol='features')
    regressor = DecisionTreeRegressor(featuresCol='features', labelCol='tmax')
    weather_pipeline = Pipeline(stages=[transformer, assemble_features, regressor])

    model = weather_pipeline.fit(train)
    model.write().overwrite().save(model_path)
    prediction = model.transform(validation)

    # Scoring the model
    evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
                                    metricName='rmse')
    score = evaluator.evaluate(prediction)
    print("Score of the weather model is", score)
def test_nested_pipeline_persistence(self):
    """
    Pipeline[HashingTF, Pipeline[PCA]]
    """
    temp_path = tempfile.mkdtemp()
    try:
        df = self.spark.createDataFrame([(["a", "b", "c"],), (["c", "d", "e"],)], ["words"])
        tf = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
        pca = PCA(k=2, inputCol="features", outputCol="pca_features")
        p0 = Pipeline(stages=[pca])
        pl = Pipeline(stages=[tf, p0])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self._compare_pipelines(pl, loaded_pipeline)

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        self._compare_pipelines(model, loaded_model)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
def test_python_transformer_pipeline_persistence(self):
    """
    Pipeline[MockUnaryTransformer, Binarizer]
    """
    temp_path = tempfile.mkdtemp()
    try:
        df = self.spark.range(0, 10).toDF('input')
        tf = MockUnaryTransformer(shiftVal=2) \
            .setInputCol("input").setOutputCol("shiftedInput")
        tf2 = Binarizer(threshold=6, inputCol="shiftedInput", outputCol="binarized")
        pl = Pipeline(stages=[tf, tf2])
        model = pl.fit(df)

        pipeline_path = temp_path + "/pipeline"
        pl.save(pipeline_path)
        loaded_pipeline = Pipeline.load(pipeline_path)
        self._compare_pipelines(pl, loaded_pipeline)

        model_path = temp_path + "/pipeline-model"
        model.save(model_path)
        loaded_model = PipelineModel.load(model_path)
        self._compare_pipelines(model, loaded_model)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            pass
def main(spark, logger, **kwargs):
    logger.info("Creating a simple DataFrame ...")
    schema_names = ["id", "german_text"]
    fields = [
        T.StructField(field_name, T.StringType(), True)
        for field_name in schema_names
    ]
    schema = T.StructType(fields)
    data = [
        ("abc", "Hallo Herr Mustermann"),
        ("xyz", "Deutsch ist das Ding!"),
    ]
    df = spark.createDataFrame(data, schema)
    df.show()

    logger.info("Building the ML pipeline ...")
    tokenizer = RegexTokenizer(
        inputCol="german_text", outputCol="tokens", pattern="\\s+"
    )
    stemmer = SnowballStemmer(
        inputCol="tokens", outputCol="stemmed_tokens", language="German"
    )
    stemming_pipeline = Pipeline(
        stages=[
            tokenizer,
            stemmer,
        ]
    )

    logger.info("Running the stemming ML pipeline ...")
    stemmed_df = stemming_pipeline.fit(df).transform(df)
    stemmed_df.show()
def test_pipeline(self):
    dataset = MockDataset()
    estimator0 = MockEstimator()
    transformer1 = MockTransformer()
    estimator2 = MockEstimator()
    transformer3 = MockTransformer()
    pipeline = Pipeline(
        stages=[estimator0, transformer1, estimator2, transformer3])
    pipeline_model = pipeline.fit(dataset, {
        estimator0.fake: 0,
        transformer1.fake: 1
    })
    model0, transformer1, model2, transformer3 = pipeline_model.stages
    self.assertEqual(0, model0.dataset_index)
    self.assertEqual(0, model0.getFake())
    self.assertEqual(1, transformer1.dataset_index)
    self.assertEqual(1, transformer1.getFake())
    self.assertEqual(2, dataset.index)
    self.assertIsNone(model2.dataset_index,
                      "The last model shouldn't be called in fit.")
    self.assertIsNone(transformer3.dataset_index,
                      "The last transformer shouldn't be called in fit.")
    dataset = pipeline_model.transform(dataset)
    self.assertEqual(2, model0.dataset_index)
    self.assertEqual(3, transformer1.dataset_index)
    self.assertEqual(4, model2.dataset_index)
    self.assertEqual(5, transformer3.dataset_index)
    self.assertEqual(6, dataset.index)
def test_confusion_matrix(sdf):
    assem = VectorAssembler(inputCols=['Fare', 'Pclass', 'Age'], outputCol='features')
    rf = RandomForestClassifier(featuresCol='features', labelCol='Survived', numTrees=20)
    pipeline = Pipeline(stages=[assem, rf])
    model = pipeline.fit(sdf.fillna(0.0))
    predictions = model.transform(sdf.fillna(0.0)).select('probability', 'Survived')

    bcm = BinaryClassificationMetrics(predictions, scoreCol='probability', labelCol='Survived')

    predictions = predictions.toHandy().to_metrics_RDD('probability', 'Survived')
    predictions = np.array(predictions.collect())

    scm = bcm.confusionMatrix().toArray()
    pcm = confusion_matrix(predictions[:, 1], predictions[:, 0] > .5)
    npt.assert_array_almost_equal(scm, pcm)

    scm = bcm.confusionMatrix(.3).toArray()
    pcm = confusion_matrix(predictions[:, 1], predictions[:, 0] > .3)
    npt.assert_array_almost_equal(scm, pcm)
def test_model_log(tmpdir):
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])
    iris = datasets.load_iris()
    feature_names = ["0", "1", "2", "3"]
    pandas_df = pd.DataFrame(iris.data, columns=feature_names)  # to make spark_udf work
    pandas_df['label'] = pd.Series(iris.target)
    spark_session = pyspark.sql.SparkSession.builder \
        .config(key="spark_session.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(spark_df)
    # Collect the predictions of the fitted logistic regression model
    preds_df = model.transform(spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    old_tracking_uri = mlflow.get_tracking_uri()
    cnt = 0
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        for dfs_tmp_dir in [None, os.path.join(str(tmpdir), "test")]:
            print("should_start_run =", should_start_run, "dfs_tmp_dir =", dfs_tmp_dir)
            try:
                tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
                mlflow.set_tracking_uri("file://%s" % tracking_dir)
                if should_start_run:
                    mlflow.start_run()
                artifact_path = "model%d" % cnt
                cnt += 1
                sparkm.log_model(artifact_path=artifact_path, spark_model=model,
                                 dfs_tmpdir=dfs_tmp_dir)
                run_id = active_run().info.run_uuid
                # test pyfunc
                x = pyfunc.load_pyfunc(artifact_path, run_id=run_id)
                preds2 = x.predict(pandas_df)
                assert preds1 == preds2
                # test load model
                reloaded_model = sparkm.load_model(artifact_path, run_id=run_id,
                                                   dfs_tmpdir=dfs_tmp_dir)
                preds_df_1 = reloaded_model.transform(spark_df)
                preds3 = [x.prediction for x in preds_df_1.select("prediction").collect()]
                assert preds1 == preds3
                # test spark_udf
                preds4 = score_model_as_udf(artifact_path, run_id, pandas_df)
                assert preds1 == preds4
                # We expect not to delete the DFS tempdir.
                x = dfs_tmp_dir or sparkm.DFS_TMP
                assert os.path.exists(x)
                assert os.listdir(x)
                shutil.rmtree(x)
            finally:
                mlflow.end_run()
                mlflow.set_tracking_uri(old_tracking_uri)
                shutil.rmtree(tracking_dir)
def train_model(training_size, mode):
    print('Training model with records: ' + str(training_size))
    spark = pyspark.sql.SparkSession.builder.appName('Model Prep').getOrCreate()
    data_df = model_utils.get_player_df(spark, training_size, mode)
    pipeline = Pipeline().setStages(transform_stages())
    model = pipeline.fit(data_df)
    model.write().overwrite().save(model_constants.MODEL_LOCATION)
def spark_gbdt(train_file, test_file, features_columns='userID'):
    from pyspark.ml.classification import GBTClassifier
    from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler
    from pyspark.ml.pipeline import Pipeline

    sess = get_spark_sesssion()
    string_indexer = StringIndexer(inputCol="label", outputCol="idx_label")
    v_c = VectorAssembler(inputCols=['userID'], outputCol='v_userID')
    trans = Pipeline(stages=[string_indexer, v_c])
    gbdt = GBTClassifier(maxDepth=5, labelCol="idx_label", predictionCol="pred",
                         featuresCol='v_userID', seed=42,
                         maxMemoryInMB=1024 * 10, maxIter=4)

    train = sess.read.load(
        train_file,
        format='csv',
        header=True,
        inferSchema=True,
    )
    train_data = trans.fit(train).transform(train)
    model = gbdt.fit(train_data)
    model.write().overwrite().save('gbtc.model')
    # model = GBTClassifier.load('gbtc.model')
    print(model.featureImportances)

    test = sess.read.load(test_file, format='csv', header=True, inferSchema=True)
    test_data = trans.fit(test).transform(test)
    predict = model.transform(test_data)
    predict.show()
    save_pandas(predict.select('instanceID', 'pred').toPandas(),
                'submission.gbdt.csv', index=False)
def main(inputs):
    data = spark.read.csv(inputs, schema=colour_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # To convert R, G, B to LabCIE
    rgb_to_lab_query = rgb2lab_query(passthrough_columns=['word'])
    sql_transformed = SQLTransformer(statement=rgb_to_lab_query)

    rgb_assembler = VectorAssembler(inputCols=['R', 'G', 'B'], outputCol='features')
    lab_assembler = VectorAssembler(inputCols=['labL', 'labA', 'labB'], outputCol='features')
    word_indexer = StringIndexer(inputCol='word', outputCol='indexed')
    classifier = MultilayerPerceptronClassifier(labelCol='indexed', layers=[3, 30, 11])

    rgb_pipeline = Pipeline(stages=[rgb_assembler, word_indexer, classifier])
    lab_pipeline = Pipeline(
        stages=[sql_transformed, lab_assembler, word_indexer, classifier])

    rgb_model = rgb_pipeline.fit(train)
    lab_model = lab_pipeline.fit(train)

    prediction = rgb_model.transform(validation)
    prediction_lab = lab_model.transform(validation)
    prediction.show()
    prediction_lab.show()

    # Testing the models
    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  labelCol='indexed', metricName='f1')
    lab_evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                      labelCol='indexed', metricName='f1')
    score = evaluator.evaluate(prediction)
    lab_score = lab_evaluator.evaluate(prediction_lab)

    plot_predictions(rgb_model, 'RGB', labelCol='word')
    plot_predictions(lab_model, 'LAB', labelCol='word')
    print('Validation score for RGB model: %g' % (score, ))
    print('Validation score for LAB model:', lab_score)
def test_save_with_sample_input_containing_unsupported_data_type_raises_serialization_exception(
        spark_context, model_path):
    sql_context = SQLContext(spark_context)
    unsupported_df = sql_context.createDataFrame([(1, "2016-09-30"), (2, "2017-02-27")])
    unsupported_df = unsupported_df.withColumn("_2", unsupported_df._2.cast(DateType()))
    pipeline = Pipeline(stages=[])
    model = pipeline.fit(unsupported_df)
    # The Spark `DateType` is not supported by MLeap, so we expect serialization to fail.
    with pytest.raises(mleap.MLeapSerializationException):
        sparkm.save_model(spark_model=model, path=model_path, sample_input=unsupported_df)
def MachineLearning(df):
    file_dataSVM = "G:/Projects/Spark-Machine-Learning/Spark Machine Learning/Spark Machine Learning/svm/"
    data = df.select(['Summary', 'Sentiment']).withColumnRenamed('Sentiment', 'label')
    data = data.withColumn('length', length(data['Summary']))

    # Basic sentence tokenizer
    tokenizer = Tokenizer(inputCol="Summary", outputCol="words")
    # Remove stop words
    remover = StopWordsRemover(inputCol="words", outputCol="filtered_features")
    # Transform the dataset to vectors
    cv = HashingTF(inputCol="filtered_features", outputCol="features1", numFeatures=1000)
    # Calculate IDF over the whole dataset
    idf = IDF(inputCol='features1', outputCol='tf_idf')
    normalizer = StandardScaler(inputCol="tf_idf", outputCol="normFeatures",
                                withStd=True, withMean=False)
    selector = ChiSqSelector(numTopFeatures=150, featuresCol="normFeatures",
                             outputCol="selectedFeatures", labelCol="label")
    # Prepare data for the Spark ML library
    cleanUp = VectorAssembler(inputCols=['selectedFeatures'], outputCol='features')

    pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, normalizer, selector, cleanUp])
    pipelineModel = pipeline.fit(data)
    data = pipelineModel.transform(data)
    data.printSchema()

    train_data, test_data = data.randomSplit([0.7, 0.3], seed=2018)

    lr = LogisticRegression(featuresCol="features", labelCol='label')
    lrModel = lr.fit(train_data)

    beta = np.sort(lrModel.coefficients)
    plt.plot(beta)
    plt.ylabel('Beta Coefficients')
    plt.show()

    trainingSummary = lrModel.summary
    roc = trainingSummary.roc.toPandas()
    plt.plot(roc['FPR'], roc['TPR'])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()
    print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))

    pr = trainingSummary.pr.toPandas()
    plt.plot(pr['recall'], pr['precision'])
    plt.ylabel('Precision')
    plt.xlabel('Recall')
    plt.show()

    predictions = lrModel.transform(test_data)
    evaluator = BinaryClassificationEvaluator()
    print('Test Area Under ROC', evaluator.evaluate(predictions))
def strat_scatterplot(sdf, col1, col2, n=30):
    stages = []
    for col in [col1, col2]:
        splits = get_buckets(sdf.select(col).rdd.map(itemgetter(0)), n)
        stages.append(Bucketizer(splits=splits,
                                 inputCol=col,
                                 outputCol="__{}_bucket".format(col),
                                 handleInvalid="skip"))
    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(sdf)
    return model, sdf.count()
def spark_model_iris(iris_df):
    feature_names, iris_pandas_df, iris_spark_df = iris_df
    assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(iris_spark_df)
    preds_df = model.transform(iris_spark_df)
    preds = [x.prediction for x in preds_df.select("prediction").collect()]
    return SparkModelWithData(model=model,
                              spark_df=iris_spark_df,
                              pandas_df=iris_pandas_df,
                              predictions=preds)
def test_pipeline(self, bag):
    from pyspark.ml.pipeline import Pipeline

    # create, save and load
    pth = "/tmp/spatial-join"
    new_p = Pipeline().setStages([bag["transformer"]])
    new_p.write().overwrite().save(pth)
    saved_p = Pipeline.load(pth)

    # check transformations
    inp = bag["input"]
    exp = bag["expected"]
    check(new_p.fit(inp), inp, exp)
    check(saved_p.fit(inp), inp, exp)
def test_mleap_module_model_save_with_unsupported_transformer_raises_serialization_exception(
        spark_model_iris, model_path):
    class CustomTransformer(JavaModel):
        def _transform(self, dataset):
            return dataset

    unsupported_pipeline = Pipeline(stages=[CustomTransformer()])
    unsupported_model = unsupported_pipeline.fit(spark_model_iris.spark_df)

    with pytest.raises(mlflow.mleap.MLeapSerializationException):
        mlflow.mleap.save_model(spark_model=unsupported_model,
                                path=model_path,
                                sample_input=spark_model_iris.spark_df)
def test_spark_module_model_save_with_mleap_and_unsupported_transformer_raises_exception(
        spark_model_iris, model_path):
    class CustomTransformer(JavaModel):
        def _transform(self, dataset):
            return dataset

    unsupported_pipeline = Pipeline(stages=[CustomTransformer()])
    unsupported_model = unsupported_pipeline.fit(spark_model_iris.spark_df)

    with pytest.raises(ValueError):
        sparkm.save_model(spark_model=unsupported_model,
                          path=model_path,
                          sample_input=spark_model_iris.spark_df)
def oneHotEncoding(clickDF, columns):
    """One-hot encodes the given categorical columns with a StringIndexer
    followed by a OneHotEncoderEstimator."""
    allStages = [StringIndexer(inputCol=column,
                               outputCol=column + STRING_INDEXER_OUT_SUFFIX).setHandleInvalid("skip")
                 for column in columns]
    oneHotEncodeInputOutputNames = [(column + STRING_INDEXER_OUT_SUFFIX,
                                     column + ONE_HOT_ENCODER_OUT_SUFFIX)
                                    for column in columns]
    oneHotEncodeInputOutputNames = list(zip(*oneHotEncodeInputOutputNames))
    ohe = OneHotEncoderEstimator(inputCols=oneHotEncodeInputOutputNames[0],
                                 outputCols=oneHotEncodeInputOutputNames[1])
    allStages.append(ohe)

    pipeline = Pipeline(stages=allStages)
    clickDF = pipeline.fit(clickDF).transform(clickDF)
    deletedColumns = list(oneHotEncodeInputOutputNames[0]) + columns
    return clickDF
def dataToVectorForTree(clickDF, categoricalColumnsNames, numericColumnNames):
    print("===== Imputing =======")
    clickDF, imputedColumnNames = impute(clickDF, numericColumnNames)

    print("===== String Indexer =======")
    allStages = [StringIndexer(inputCol=column,
                               outputCol=column + STRING_INDEXER_OUT_SUFFIX).setHandleInvalid("skip")
                 for column in categoricalColumnsNames]
    stringIndexderColumnsNames = [(column + STRING_INDEXER_OUT_SUFFIX,
                                   column + ONE_HOT_ENCODER_OUT_SUFFIX)
                                  for column in categoricalColumnsNames]
    stringIndexderColumnsNames = list(zip(*stringIndexderColumnsNames))
    pipeline = Pipeline(stages=allStages)
    clickDF = pipeline.fit(clickDF).transform(clickDF)

    all_feature_columns = imputedColumnNames + list(stringIndexderColumnsNames[0])

    print("===== Assembler =======")
    feature_assembler = VectorAssembler(inputCols=all_feature_columns, outputCol="features")
    return feature_assembler.transform(clickDF)
def test_model_log(tmpdir):
    conda_env = os.path.join(str(tmpdir), "conda_env.yml")
    _mlflow_conda_env(
        conda_env, additional_pip_deps=["pyspark=={}".format(pyspark_version)])
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    pandas_df = pd.DataFrame(X, columns=iris.feature_names)
    pandas_df['label'] = pd.Series(y)
    spark_session = pyspark.sql.SparkSession.builder \
        .config(key="spark_session.python.worker.reuse", value=True) \
        .master("local-cluster[2, 1, 1024]") \
        .getOrCreate()
    spark_df = spark_session.createDataFrame(pandas_df)
    model_path = tmpdir.mkdir("model")
    assembler = VectorAssembler(inputCols=iris.feature_names, outputCol="features")
    lr = LogisticRegression(maxIter=50, regParam=0.1, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[assembler, lr])
    # Fit the model
    model = pipeline.fit(spark_df)
    # Collect the predictions of the fitted logistic regression model
    preds_df = model.transform(spark_df)
    preds1 = [x.prediction for x in preds_df.select("prediction").collect()]
    old_tracking_uri = tracking.get_tracking_uri()
    # should_start_run tests whether or not calling log_model() automatically starts a run.
    for should_start_run in [False, True]:
        try:
            tracking_dir = os.path.abspath(str(tmpdir.mkdir("mlruns")))
            tracking.set_tracking_uri("file://%s" % tracking_dir)
            if should_start_run:
                tracking.start_run()
            sparkm.log_model(artifact_path="model", spark_model=model)
            run_id = tracking.active_run().info.run_uuid
            x = pyfunc.load_pyfunc("model", run_id=run_id)
            preds2 = x.predict(pandas_df)
            assert preds1 == preds2
            reloaded_model = sparkm.load_model("model", run_id=run_id)
            preds_df_1 = reloaded_model.transform(spark_df)
            preds3 = [x.prediction for x in preds_df_1.select("prediction").collect()]
            assert preds1 == preds3
        finally:
            tracking.end_run()
            tracking.set_tracking_uri(old_tracking_uri)
            shutil.rmtree(tracking_dir)
def strat_scatterplot(sdf, col1, col2, n=30):
    stages = []
    for col in [col1, col2]:
        splits = np.linspace(
            *sdf.agg(F.min(col), F.max(col)).rdd.map(tuple).collect()[0], n + 1)
        bucket_name = '__{}_bucket'.format(col)
        stages.append(
            Bucketizer(splits=splits,
                       inputCol=col,
                       outputCol=bucket_name,
                       handleInvalid="skip"))
    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(sdf)
    return model, sdf.count()
def main(spark, df, model_file):
    # Import metadata
    df = spark.read.parquet(df)
    print("imported meta data")

    # Build an indexer over all the tracks in the metadata
    indexer1 = StringIndexer(inputCol="track_id", outputCol="track_index",
                             handleInvalid="skip")
    pipeline = Pipeline(stages=[indexer1])
    model = pipeline.fit(df)
    print("mapped track_index")

    # Save the fitted indexer model
    model.write().overwrite().save(model_file)
def test_get_metrics_by_threshold(sdf):
    assem = VectorAssembler(inputCols=['Fare', 'Pclass', 'Age'], outputCol='features')
    rf = RandomForestClassifier(featuresCol='features', labelCol='Survived',
                                numTrees=20, seed=13)
    pipeline = Pipeline(stages=[assem, rf])
    model = pipeline.fit(sdf.fillna(0.0))
    predictions = model.transform(sdf.fillna(0.0)).select('probability', 'Survived')

    bcm = BinaryClassificationMetrics(predictions, scoreCol='probability', labelCol='Survived')
    metrics = bcm.getMetricsByThreshold()

    predictions = predictions.toHandy().to_metrics_RDD('probability', 'Survived')
    predictions = np.array(predictions.collect())

    pr = np.array(bcm.pr().collect())
    idx = pr[:, 0].argmax()
    pr = pr[:idx + 1, :]
    precision, recall, thresholds = precision_recall_curve(predictions[:, 1],
                                                           predictions[:, 0])
    npt.assert_array_almost_equal(precision, pr[:, 1][::-1])
    npt.assert_array_almost_equal(recall, pr[:, 0][::-1])

    roc = np.array(bcm.roc().collect())
    idx = roc[:, 1].argmax()
    roc = roc[:idx + 1, :]
    sroc = pd.DataFrame(np.round(roc, 6), columns=['fpr', 'tpr'])
    sroc = sroc.groupby('fpr').agg({'tpr': [np.min, np.max]})

    fpr, tpr, thresholds = roc_curve(predictions[:, 1], predictions[:, 0])
    idx = tpr.argmax()
    proc = pd.DataFrame({
        'fpr': np.round(fpr[:idx + 1], 6),
        'tpr': np.round(tpr[:idx + 1], 6)
    })
    proc = proc.groupby('fpr').agg({'tpr': [np.min, np.max]})

    sroc = sroc.join(proc, how='inner', rsuffix='sk')
    npt.assert_array_almost_equal(sroc.iloc[:, 0], proc.iloc[:, 0])
    npt.assert_array_almost_equal(sroc.iloc[:, 1], proc.iloc[:, 1])
def main(input, model_file):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    data = spark.read.csv(input, schema=tmax_schema)
    train, validation = data.randomSplit([0.75, 0.25], seed=123)
    train = train.cache()
    validation = validation.cache()

    y_tmax = SQLTransformer(statement=(
        "SELECT today.station, today.latitude, today.longitude, today.elevation, "
        "today.date, today.tmax, yesterday.tmax AS yesterday_tmax "
        "FROM __THIS__ AS today "
        "INNER JOIN __THIS__ AS yesterday "
        "ON date_sub(today.date, 1) = yesterday.date "
        "AND today.station = yesterday.station"))
    getvalues = SQLTransformer(statement=(
        "SELECT station, latitude, longitude, elevation, "
        "dayofyear(date) AS dayofyear, tmax, yesterday_tmax FROM __THIS__"))
    assemble_features = VectorAssembler(
        inputCols=['latitude', 'longitude', 'elevation', 'dayofyear', 'yesterday_tmax'],
        outputCol='features')
    classifier = GBTRegressor(featuresCol='features', labelCol='tmax')
    pipeline = Pipeline(stages=[y_tmax, getvalues, assemble_features, classifier])

    model = pipeline.fit(train)
    predictions = model.transform(validation)

    r2_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    print('-----------------------------------')
    print('r2: %g' % (r2, ))
    print('-----------------------------------')

    rmse_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax',
                                         metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)
    print('rmse: %g' % (rmse, ))

    model.write().overwrite().save(model_file)
def mahalanobis(sdf, colnames):
    """Computes the Mahalanobis distance from the origin and compares it to
    critical values from the Chi-Squared distribution to identify possible outliers.
    """
    check_columns(sdf, colnames)
    # Builds pipeline to assemble feature columns and scale them
    assembler = VectorAssembler(inputCols=colnames, outputCol='__features')
    scaler = StandardScaler(inputCol='__features', outputCol='__scaled', withMean=True)
    pipeline = Pipeline(stages=[assembler, scaler])
    features = pipeline.fit(sdf).transform(sdf)

    # Computes correlation between features and inverts it
    # Since we scaled the features, we can assume they have unit variance
    # and therefore, correlation and covariance matrices are the same!
    mat = Correlation.corr(features, '__scaled').head()[0].toArray()
    inv_mat = inv(mat)

    # Computes critical value
    critical_value = chi2.ppf(0.999, len(colnames))

    # Builds Pandas UDF to compute Mahalanobis distance from origin
    # sqrt((V - 0) * inv_M * (V - 0))
    try:
        import pyarrow

        @F.pandas_udf('double')
        def pudf_mult(v):
            return v.apply(lambda v: np.sqrt(np.dot(np.dot(v, inv_mat), v)))
    except ImportError:
        # Falls back to a plain Python UDF (one value per row) when pyarrow
        # is not available
        @F.udf('double')
        def pudf_mult(v):
            return float(np.sqrt(np.dot(np.dot(v, inv_mat), v)))

    # Convert feature vector into array
    features = dense_to_array(features, '__scaled', '__array_scaled')

    # Computes Mahalanobis distance and flags as outliers all elements above critical value
    distance = (features
                .withColumn('__mahalanobis', pudf_mult('__array_scaled'))
                .withColumn('__outlier', F.col('__mahalanobis') > critical_value)
                .drop('__features', '__scaled', '__array_scaled'))
    return distance
def simple_train_model(input_df):
    xgboost_params = {
        "eta": 0.023,
        "max_depth": 10,
        "min_child_weight": 0.3,
        "subsample": 0.7,
        "colsample_bytree": 0.82,
        "colsample_bylevel": 0.9,
        "eval_metric": "auc",
        "seed": 49,
        "silent": 1,
        "objective": "binary:logistic",
        "round": 10,
        "nWorkers": 2
    }
    xgb_model = XGBoostClassifier(xgboost_params)
    pipeline = Pipeline(stages=[xgb_model])
    return pipeline.fit(input_df)
def simple_train_model(input_df):
    lr_model = LogisticRegression(regParam=0.01)
    pipeline = Pipeline(stages=[lr_model])
    return pipeline.fit(input_df)
d2 = d1.toDF("number", "name", "SI", "GOO", "DONG", "x", "y",
             "b_code", "h_code", "utmk_x", "utmk_y", "wtm_x", "wtm_y")
d3 = d2.select(d2.GOO.alias("loc"), d2.x, d2.y)
d3.show(5, False)

indexer = StringIndexer(inputCol="loc", outputCol="loccode")
assembler = VectorAssembler(inputCols=["loccode", "x", "y"], outputCol="features")
kmeans = KMeans(k=5, seed=1, featuresCol="features")
pipeline = Pipeline(stages=[indexer, assembler, kmeans])

model = pipeline.fit(d3)
d4 = model.transform(d3)
d4.groupBy("prediction") \
    .agg(functions.collect_set("loc").alias("loc")) \
    .orderBy("prediction").show(100, False)

WSSSE = model.stages[2].computeCost(d4)
print("Within Set Sum of Squared Errors = %d" % WSSSE)

print("Cluster Centers: ")
for v in model.stages[2].clusterCenters():
    print(v)

spark.stop()
irisBucketizedWidth = widthBucketizer.transform(irisBucketizedLength)
display(irisBucketizedWidth)

# COMMAND ----------

# MAGIC %md
# MAGIC Let's combine the two bucketizers into a [Pipeline](http://spark.apache.org/docs/latest/ml-guide.html#pipeline-components) that performs both bucketizations. A `Pipeline` is made up of stages which can be set using `setStages` and passing in a `list` of stages in Python or an `Array` of stages in Scala. `Pipeline` is an estimator, which means it implements a `fit` method which returns a `PipelineModel`. A `PipelineModel` is a transformer, which means that it implements a `transform` method which can be used to run the stages.

# COMMAND ----------

from pyspark.ml.pipeline import Pipeline

pipelineBucketizer = Pipeline().setStages([lengthBucketizer, widthBucketizer])

pipelineModelBucketizer = pipelineBucketizer.fit(irisSeparateFeatures)
irisBucketized = pipelineModelBucketizer.transform(irisSeparateFeatures)

display(irisBucketized)

# COMMAND ----------

# MAGIC %md
# MAGIC Now that we have created two new features through bucketing, let's combine those two features into a `Vector` with `VectorAssembler`. VectorAssembler can be found in [pyspark.ml.feature](https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.VectorAssembler) for Python and the [org.apache.spark.ml.feature](http://spark.apache.org/docs/latest/api/scala/#org.apache.spark.ml.feature.VectorAssembler) package for Scala.
# MAGIC
# MAGIC Set the params of `assembler` so that both "lengthFeatures" and "widthFeatures" are assembled into a column called "featuresBucketized".
# MAGIC
# MAGIC Then, set the stages of `pipeline` to include both bucketizers and the assembler as the last stage.
# MAGIC
# MAGIC Finally, use `pipeline` to generate a new `DataFrame` called `irisAssembled`.
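# COMMAND ----------

# MAGIC %md
# MAGIC A minimal sketch of one way to complete the exercise described above. It assumes the notebook's earlier variables (`lengthBucketizer`, `widthBucketizer`, `irisSeparateFeatures`) are already defined and that the bucketizers write to the "lengthFeatures" and "widthFeatures" columns named in the instructions; this is an illustration, not the notebook's official solution.

# COMMAND ----------

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.pipeline import Pipeline

# Assemble the two bucketized columns into a single feature vector
assembler = VectorAssembler(inputCols=["lengthFeatures", "widthFeatures"],
                            outputCol="featuresBucketized")

# Both bucketizers run first, with the assembler as the last stage
pipeline = Pipeline(stages=[lengthBucketizer, widthBucketizer, assembler])

irisAssembled = pipeline.fit(irisSeparateFeatures).transform(irisSeparateFeatures)
display(irisAssembled)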
(rf
 .setMaxBins(10)
 .setMaxDepth(2)
 .setNumTrees(20)
 .setSeed(0))

# COMMAND ----------

# MAGIC %md
# MAGIC Next, we'll build a pipeline that includes the `StringIndexer`, `PolynomialExpansion`, and `RandomForestClassifier`.

# COMMAND ----------

rfPipeline = Pipeline().setStages([stringIndexer, px, rf])
rfModelPipeline = rfPipeline.fit(irisTrain)
rfPredictions = rfModelPipeline.transform(irisTest)

print(multiEval.evaluate(rfPredictions))

# COMMAND ----------

display(rfPredictions)

# COMMAND ----------

# MAGIC %md
# MAGIC So what exactly did `PolynomialExpansion` do?

# COMMAND ----------
evaluator = RegressionEvaluator(labelCol="weight", predictionCol="predic_weight")

# root mean squared error
rmse = evaluator.evaluate(d13)

# mean squared error
mse = evaluator.setMetricName("mse").evaluate(d13)

# R2 metric
r2 = evaluator.setMetricName("r2").evaluate(d13)

# mean absolute error
mae = evaluator.setMetricName("mae").evaluate(d13)

print("rmse:%f, mse:%f, r2:%f, mae:%f" % (rmse, mse, r2, mae))

# Pipeline
pipeline = Pipeline(stages=[gradeIndexer, genderIndexer, assembler, lr])

samples2 = df9.randomSplit([0.7, 0.3])
training2 = samples2[0]
test2 = samples2[1]

# Fit the pipeline model
pipelineModel = pipeline.fit(training2)

# Generate predictions with the pipeline model
pipelineModel.transform(test2).show(5, False)

spark.stop()
lr = (LinearRegression()
      .setLabelCol('sepalWidth')
      .setMaxIter(1000))
print(lr.explainParams())

# COMMAND ----------

# MAGIC %md
# MAGIC Next, we'll create a `Pipeline` that only contains one stage for the linear regression.

# COMMAND ----------

from pyspark.ml.pipeline import Pipeline

pipeline = Pipeline().setStages([lr])
pipelineModel = pipeline.fit(irisSepalSample)
sepalPredictions = pipelineModel.transform(irisSepalSample)

display(sepalPredictions)

# COMMAND ----------

# MAGIC %md
# MAGIC What does our resulting model look like?

# COMMAND ----------

lrModel = pipelineModel.stages[-1]
print(type(lrModel))
print('\n', lrModel.intercept, lrModel.weights)
d6.groupBy("label").count().show(truncate=False)

dataArr = d6.randomSplit([0.7, 0.3])
train = dataArr[0]
test = dataArr[1]

indexer = StringIndexer(inputCol="road", outputCol="roadcode")
assembler = VectorAssembler(inputCols=["roadcode", "mon", "tue", "wed", "thu",
                                       "fri", "sat", "sun"],
                            outputCol="features")
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
pipeline = Pipeline(stages=[indexer, assembler, dt])

model = pipeline.fit(train)
predict = model.transform(test)
predict.select("label", "probability", "prediction").show(3, False)

# areaUnderROC, areaUnderPR
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
print(evaluator.evaluate(predict))

treeModel = model.stages[2]
print("Learned classification tree model:%s" % treeModel.toDebugString)

spark.stop()
firstMlModel.clusterCenters()

# We compose a `Pipeline` sequence of steps from the `vecAssembler` and `kmeans` components.

# In[66]:

from pyspark.ml.pipeline import Pipeline

firstPipeline = Pipeline(stages=[vecAssembler, firstMlKMeans])

# In[67]:

firstPipelineModel = firstPipeline.fit(ca1MlDF)

# In[73]:

firstPipelineModel.transform(ca1MlDF).show(5)

# In[74]:

MlKmeansWSSSEResults = collections.namedtuple("MlKmeansWSSSEResults",
                                              ["ks", "WSSSEs", "pipelineModels"])


def mlKmeansWSSSEsByK(initialDF, kValues):
    vecAssembler = VectorAssembler(inputCols=["x", "y"], outputCol="features")
    pipelineModels = [Pipeline(stages=[vecAssembler, MlKMeans(k=k)]).fit(initialDF)
                      for k in kValues]
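    # NOTE: the original snippet is truncated at this point. Below is a minimal,
    # hypothetical sketch (not from the source) of how the body might finish,
    # assuming the WSSSE of each fitted pipeline is read from its KMeansModel stage
    # via computeCost (the same call used in the earlier KMeans snippet) and packed
    # into the MlKmeansWSSSEResults tuple defined above.
    WSSSEs = [model.stages[-1].computeCost(model.transform(initialDF))
              for model in pipelineModels]
    return MlKmeansWSSSEResults(ks=list(kValues), WSSSEs=WSSSEs,
                                pipelineModels=pipelineModels)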