def _fit(self, dataset):
    """
    Fit every estimator stage in order and return the resulting PipelineModel.

    Each stage must be an ``Estimator`` or a ``Transformer``; anything else
    raises ``TypeError`` before any fitting starts. Stages up to and including
    the last estimator are applied to ``dataset`` as they are processed, except
    that the model produced by the last estimator is not used to transform.
    Stages after the last estimator are appended untouched.

    :param dataset: input dataset handed to the first stage
    :return: ``PipelineModel`` built from the fitted/transformer stages
    """
    stages = self.getStages()
    for candidate in stages:
        if not isinstance(candidate, (Estimator, Transformer)):
            raise TypeError(
                "Cannot recognize a pipeline stage of type %s." % type(candidate))

    # Position of the final estimator; -1 when the pipeline has none.
    last_estimator_pos = -1
    for position, candidate in enumerate(stages):
        if isinstance(candidate, Estimator):
            last_estimator_pos = position

    fitted_stages = []
    for position, candidate in enumerate(stages):
        if position > last_estimator_pos:
            # Nothing downstream needs fitting, so no transform is required.
            fitted_stages.append(candidate)
            continue

        if isinstance(candidate, Transformer):
            fitted_stages.append(candidate)
            dataset = candidate.transform(dataset)
            continue

        if isinstance(candidate, RecursiveEstimator):
            # Recursive estimators also receive the stages collected so far.
            fitted = candidate.fit(dataset, pipeline=PipelineModel(fitted_stages))
        else:
            fitted = candidate.fit(dataset)
        fitted_stages.append(fitted)

        # The output of the last estimator's model is never consumed here.
        if position < last_estimator_pos:
            dataset = fitted.transform(dataset)

    return PipelineModel(fitted_stages)
def _load_or_train(model_path, train_fn, trainingData, testData, labelIndexer):
    """Return ``(predictions, evaluator)`` from a persisted model or a fresh training run.

    If ``model_path`` exists on the local filesystem, the persisted
    ``PipelineModel`` is loaded and applied to ``testData`` with an F1
    evaluator. Otherwise ``train_fn(trainingData, testData, labelIndexer)`` is
    called; it must return ``(model, predictions, evaluator)``, of which the
    model is discarded.
    """
    if os.path.exists(model_path):
        persistedModel = PipelineModel.load(model_path)
        evaluator = MulticlassClassificationEvaluator(metricName='f1')
        return persistedModel.transform(testData), evaluator
    _, predictions, evaluator = train_fn(trainingData, testData, labelIndexer)
    return predictions, evaluator


def main():
    """Evaluate RandomForest, LogisticRegression and DecisionTree models.

    Expects exactly four command-line arguments: the features data path
    followed by one model path each for the rf, lr and dt models. For each
    model, an existing path is loaded; otherwise the model is trained. The
    scores of the predictions on the 30% test split are printed.
    With any other argument count a usage message is printed instead.
    """
    if len(sys.argv) != 5:
        print('Please provide the filepath of the features data '
              'as the first argument and the filepath of the pickle file to '
              'save the model to as the second to fourth argument. \n\nExample: python '
              'train_classifier.py ../data/user_item.csv rf lr dt')
        return

    print("load data and transform features...")
    df = load_data(sys.argv[1])
    df, labelIndexer = features_transform(df)
    trainingData, testData = df.randomSplit([0.7, 0.3], seed=6)
    trainingData.cache()

    print(
        "train or load RandomForest models with tuning parameters, then make a prediction on testData..."
    )
    predictions, evaluator = _load_or_train(
        sys.argv[2], rf_model, trainingData, testData, labelIndexer)
    print("rf model evaluation...")
    print_scores(predictions, evaluator)

    print(
        "train or load LogisticRegression models with tuning parameters, then make a prediction on testData"
    )
    predictions, evaluator = _load_or_train(
        sys.argv[3], lr_model, trainingData, testData, labelIndexer)
    print("lr model evaluation...")
    print_scores(predictions, evaluator)

    print(
        "train or load DecisionTree models with tuning parameters, then make a prediction on testData"
    )
    predictions, evaluator = _load_or_train(
        sys.argv[4], dt_model, trainingData, testData, labelIndexer)
    print("dt model evaluation...")
    print_scores(predictions, evaluator)
def test_nested_pipeline_persistence(self):
    """
    Save/load round trip for a nested pipeline: Pipeline[HashingTF, Pipeline[PCA]].

    Both the unfitted pipeline and the fitted model are saved under a
    temporary directory, reloaded, and compared with ``_compare_pipelines``.
    """
    temp_path = tempfile.mkdtemp()
    try:
        rows = [(["a", "b", "c"],), (["c", "d", "e"],)]
        df = self.spark.createDataFrame(rows, ["words"])

        hashing = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
        reducer = PCA(k=2, inputCol="features", outputCol="pca_features")
        inner = Pipeline(stages=[reducer])
        outer = Pipeline(stages=[hashing, inner])
        fitted = outer.fit(df)

        # Unfitted pipeline round trip.
        pipeline_path = temp_path + "/pipeline"
        outer.save(pipeline_path)
        self._compare_pipelines(outer, Pipeline.load(pipeline_path))

        # Fitted model round trip.
        model_path = temp_path + "/pipeline-model"
        fitted.save(model_path)
        self._compare_pipelines(fitted, PipelineModel.load(model_path))
    finally:
        # Best-effort cleanup; a failed removal must not mask the test result.
        try:
            rmtree(temp_path)
        except OSError:
            pass
def load_xgb_model(path, m_type):
    """
    Load a single stage of a persisted pipeline.

    The pipeline metadata under ``path`` lists the stage uids; each stage path
    is resolved below ``path + "/stages"`` and indexed by the uid prefix before
    the first underscore. If two uids share a prefix, the later one wins.

    :param path: directory of the saved pipeline
    :param m_type: ``'PipelineModel'`` loads the stage keyed ``PipelineModel``;
        any other value loads the stage keyed ``xgbc`` as an
        ``XGBoostClassificationModel``
    :return: the loaded model
    """
    # ``sc`` is resolved from the enclosing module scope.
    metadata = DefaultParamsReader.loadMetadata(path, sc)
    stages_dir = path + "/stages"
    stage_uids = metadata['paramMap']['stageUids']
    stage_count = len(stage_uids)

    # uid prefix -> on-disk location of that stage
    stage_paths = {
        uid.split('_')[0]: str(
            PipelineSharedReadWrite.getStagePath(uid, position, stage_count, stages_dir))
        for position, uid in enumerate(stage_uids)
    }

    if m_type == 'PipelineModel':
        return PipelineModel.load(stage_paths['PipelineModel'])
    return XGBoostClassificationModel.load(stage_paths['xgbc'])
def load_model(path, run_id=None, dfs_tmpdir=DFS_TMP):
    """
    Load a Spark MLlib model saved with the Spark flavor.

    :param path: Local filesystem path or run-relative artifact path to the model.
    :param run_id: Run ID. If provided, combined with ``path`` to identify the model.
    :param dfs_tmpdir: Directory passed to ``_tmp_path`` to derive the temporary
                       location the model is copied to before loading.
    :return: SparkML model.
    :rtype: pyspark.ml.pipeline.PipelineModel
    :raises Exception: if the MLmodel file does not declare the Spark flavor.
    """
    model_root = path
    if run_id is not None:
        model_root = mlflow.tracking._get_model_log_dir(model_name=path, run_id=run_id)

    mlmodel = Model.load(os.path.join(model_root, 'MLmodel'))
    if FLAVOR_NAME not in mlmodel.flavors:
        raise Exception("Model does not have {} flavor".format(FLAVOR_NAME))

    flavor_conf = mlmodel.flavors[FLAVOR_NAME]
    local_model_dir = os.path.join(model_root, flavor_conf['model_data'])
    dfs_copy = _tmp_path(dfs_tmpdir)
    try:
        # Spark ML expects the model on DFS, so stage a copy there first.
        _HadoopFileSystem.copy_from_local_file(local_model_dir, dfs_copy, removeSrc=False)
        return PipelineModel.load(dfs_copy)
    finally:
        # NOTE(review): the copy is removed as soon as load() returns; confirm
        # the loaded model never needs to read from this location afterwards.
        _HadoopFileSystem.delete(dfs_copy)
def test_nested_pipeline_persistence(self):
    """
    Persistence check for Pipeline[HashingTF, Pipeline[PCA]].

    Saves and reloads the unfitted pipeline and its fitted model, comparing
    each reloaded object with its source via ``_compare_pipelines``.
    """
    temp_path = tempfile.mkdtemp()
    try:
        df = self.spark.createDataFrame(
            [(["a", "b", "c"], ), (["c", "d", "e"], )],
            ["words"])
        tf_stage = HashingTF(numFeatures=10, inputCol="words", outputCol="features")
        pca_stage = PCA(k=2, inputCol="features", outputCol="pca_features")
        nested = Pipeline(stages=[pca_stage])
        pipeline = Pipeline(stages=[tf_stage, nested])
        pipeline_model = pipeline.fit(df)

        targets = (
            (pipeline, Pipeline, temp_path + "/pipeline"),
            (pipeline_model, PipelineModel, temp_path + "/pipeline-model"),
        )
        # Pipeline first, then the fitted model: save, reload, compare.
        for saved, loader, location in targets:
            saved.save(location)
            reloaded = loader.load(location)
            self._compare_pipelines(saved, reloaded)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            # Cleanup is best effort only.
            pass
def test_python_transformer_pipeline_persistence(self):
    """
    Save/load round trip for Pipeline[MockUnaryTransformer, Binarizer].

    Both the unfitted pipeline and the fitted model are written below a
    temporary directory, reloaded, and compared with ``_compare_pipelines``.
    """
    temp_path = tempfile.mkdtemp()
    try:
        df = self.spark.range(0, 10).toDF("input")

        shifter = MockUnaryTransformer(shiftVal=2)
        shifter = shifter.setInputCol("input").setOutputCol("shiftedInput")
        binarizer = Binarizer(threshold=6, inputCol="shiftedInput", outputCol="binarized")
        pipeline = Pipeline(stages=[shifter, binarizer])
        fitted = pipeline.fit(df)

        # Unfitted pipeline round trip.
        pipeline_path = temp_path + "/pipeline"
        pipeline.save(pipeline_path)
        self._compare_pipelines(pipeline, Pipeline.load(pipeline_path))

        # Fitted model round trip.
        model_path = temp_path + "/pipeline-model"
        fitted.save(model_path)
        self._compare_pipelines(fitted, PipelineModel.load(model_path))
    finally:
        # Best-effort cleanup of the scratch directory.
        try:
            rmtree(temp_path)
        except OSError:
            pass
def return_prediction(sample_id):
    '''
    INPUT:
    sample_id - (list) list of sampled userId in display order

    OUTPUT:
    actuals - (list) list of actual churn status (0 or 1)
    probas - (list) list of churn probabilities
    preds - (list) list of churn prediction (0 or 1)

    DESCRIPTION:
    Make prediction on sampled data and return labels, probabilities
    and predictions. The Spark session obtained here is always stopped
    before returning, including when loading or classification fails.
    '''
    # Create a Spark session (local workspace); modify for the actual
    # Spark environment.
    spark = SparkSession \
        .builder \
        .appName("Sparkify") \
        .master("local") \
        .getOrCreate()

    try:
        # load processed data
        feature_data_path = './data/micro_sparkify_features.parquet'
        print('Loading data...\n DATASET: {}'.format(feature_data_path))
        feature_data = spark.read.load(feature_data_path)

        # load trained Gradient Boosted-Tree Classifier model from folder
        model_path = './models/webGbtModel'
        print('Loading model...\n MODEL: {}'.format(model_path))
        classifierModel = PipelineModel.load(model_path)

        # extract subset of data for sample id
        print('Extracting samples...')
        sample_data = feature_data.filter(col('userId').isin(sample_id))

        # transform with classification pipeline (churn prediction)
        print('Classifying data...')
        classifiedData = classifierModel.transform(sample_data)
        pd_classified = classifiedData.select(
            'userId', 'label', 'probability', 'prediction').toPandas()
        print('Data classified!')

        # sort in display order
        # NOTE(review): the result of astype() is discarded, so the column
        # dtype is not changed; the call only raises if the values cannot be
        # cast. Assigning it back would change how .loc matches sample_id -
        # confirm the intended id type before changing this.
        pd_classified['userId'].astype(int, copy=False)
        pd_classified.set_index('userId', inplace=True)
        pd_classified = pd_classified.loc[sample_id, :]

        actuals = list(pd_classified['label'])
        # element 1 of each probability vector, rounded to 3 decimals
        probas = [round(x[1], 3) for x in pd_classified['probability']]
        preds = list(pd_classified['prediction'])

        # clear pyspark dataframe cache
        feature_data.unpersist()
        sample_data.unpersist()
        classifiedData.unpersist()

        return actuals, probas, preds
    finally:
        # Release the session even if any step above raised.
        spark.stop()
def logistic_regression(tr_data=None, t_data=None, proc_type='train', example=None):
    """Train or load a logistic-regression pipeline, then predict or evaluate.

    :param tr_data: training data; used when ``proc_type`` is neither
        ``'load'`` nor ``'test'``
    :param t_data: data evaluated when ``proc_type`` is not ``'test'``
    :param proc_type: ``'load'`` or ``'test'`` loads the model from ``LR_PATH``;
        any other value fits a new model and saves it to ``LR_PATH``,
        replacing an existing local directory there
    :param example: data transformed and collected when ``proc_type`` is ``'test'``
    :return: ``(collected rows, 0.)`` for ``'test'``; otherwise ``(None, rmse)``
        where ``rmse`` comes from a ``RegressionEvaluator`` on ``label`` vs
        ``prediction``
    """
    classifier = LogisticRegression(regParam=0.001, family='multinomial')
    pipeline = Pipeline(stages=[classifier])

    if proc_type in ('load', 'test'):
        model = PipelineModel.load(LR_PATH)
    else:
        model = pipeline.fit(tr_data)
        # Clear any previous save before persisting the new model.
        if os.path.exists(LR_PATH):
            shutil.rmtree(LR_PATH)
        model.save(LR_PATH)

    if proc_type == 'test':
        return model.transform(example).collect(), 0.

    scored = model.transform(t_data)
    rmse_evaluator = RegressionEvaluator(metricName="rmse",
                                         labelCol="label",
                                         predictionCol="prediction")
    return None, rmse_evaluator.evaluate(scored)
def binary_classification(tr_data=None, t_data=None, proc_type='train', example=None):
    """Train or load a OneVsRest(LogisticRegression) pipeline, then predict or evaluate.

    :param tr_data: training data; used when ``proc_type`` is neither
        ``'load'`` nor ``'test'``
    :param t_data: data evaluated when ``proc_type`` is not ``'test'``
    :param proc_type: ``'load'`` or ``'test'`` loads the model from ``BC_PATH``;
        any other value fits a new model and saves it to ``BC_PATH``,
        replacing an existing local directory there
    :param example: data transformed and collected when ``proc_type`` is ``'test'``
    :return: ``(collected rows, 0.)`` for ``'test'``; otherwise ``(None, f1)``
    """
    base = LogisticRegression(tol=1e-6, fitIntercept=True)
    one_vs_rest = OneVsRest(classifier=base)
    pipeline = Pipeline(stages=[one_vs_rest])

    if proc_type in ('load', 'test'):
        model = PipelineModel.load(BC_PATH)
    else:
        model = pipeline.fit(tr_data)
        # Clear any previous save before persisting the new model.
        if os.path.exists(BC_PATH):
            shutil.rmtree(BC_PATH)
        model.save(BC_PATH)

    if proc_type == 'test':
        return model.transform(example).collect(), 0.

    scored = model.transform(t_data)
    f1_evaluator = MulticlassClassificationEvaluator(
        labelCol='label', predictionCol='prediction', metricName='f1')
    return None, f1_evaluator.evaluate(scored)
def multi_class_classification(tr_data=None, t_data=None, proc_type='train', example=None):
    """Train or load a random-forest pipeline, then predict or evaluate.

    :param tr_data: training data; used when ``proc_type`` is neither
        ``'load'`` nor ``'test'``
    :param t_data: data evaluated when ``proc_type`` is not ``'test'``
    :param proc_type: ``'load'`` or ``'test'`` loads the model from ``MCC_PATH``;
        any other value fits a new model and saves it to ``MCC_PATH``,
        replacing an existing local directory there
    :param example: data transformed and collected when ``proc_type`` is ``'test'``
    :return: ``(collected rows, 0.)`` for ``'test'``; otherwise ``(None, accuracy)``
    """
    forest = RandomForestClassifier(labelCol='label',
                                    featuresCol='features',
                                    numTrees=10)
    pipeline = Pipeline(stages=[forest])

    if proc_type in ('load', 'test'):
        model = PipelineModel.load(MCC_PATH)
    else:
        model = pipeline.fit(tr_data)
        # Clear any previous save before persisting the new model.
        if os.path.exists(MCC_PATH):
            shutil.rmtree(MCC_PATH)
        model.save(MCC_PATH)

    if proc_type == 'test':
        return model.transform(example).collect(), 0.

    scored = model.transform(t_data)
    accuracy_evaluator = MulticlassClassificationEvaluator(
        labelCol='label', predictionCol='prediction', metricName='accuracy')
    return None, accuracy_evaluator.evaluate(scored)
def test_python_transformer_pipeline_persistence(self):
    """
    Persistence check for Pipeline[MockUnaryTransformer, Binarizer].

    Saves and reloads the unfitted pipeline and its fitted model, comparing
    each reloaded object with its source via ``_compare_pipelines``.
    """
    temp_path = tempfile.mkdtemp()
    try:
        df = self.spark.range(0, 10).toDF('input')
        tf_stage = MockUnaryTransformer(shiftVal=2) \
            .setInputCol("input") \
            .setOutputCol("shiftedInput")
        binarize_stage = Binarizer(threshold=6, inputCol="shiftedInput", outputCol="binarized")
        pipeline = Pipeline(stages=[tf_stage, binarize_stage])
        pipeline_model = pipeline.fit(df)

        targets = (
            (pipeline, Pipeline, temp_path + "/pipeline"),
            (pipeline_model, PipelineModel, temp_path + "/pipeline-model"),
        )
        # Pipeline first, then the fitted model: save, reload, compare.
        for saved, loader, location in targets:
            saved.save(location)
            reloaded = loader.load(location)
            self._compare_pipelines(saved, reloaded)
    finally:
        try:
            rmtree(temp_path)
        except OSError:
            # Cleanup is best effort only.
            pass
def main(account_name, account_key):
    """Score the most recent day's patient data and upload the predictions as CSV.

    Reads the input frame through ``get_df_from_blob``, applies the two
    persisted pipeline models and the random-forest model loaded from the
    ``model`` container of the given storage account, and writes
    ``patient_nbr``, ``discharge_date`` and element 1 of each ``probability``
    vector to the ``predictions`` container.

    :param account_name: storage account name, also used in the ``wasb://`` model URIs
    :param account_key: storage account key
    """
    sc = SparkContext()
    sqlContext = SQLContext(sc)

    patient_records_container = 'patientrecords'
    glucose_levels_container = 'glucoselevelsaggs'
    preds_container = 'predictions'

    blob_service = BlobService(account_name=account_name, account_key=account_key)
    blob_service.create_container(preds_container)

    day_to_predict = get_most_recent_date(blob_service, glucose_levels_container)
    source_df = get_df_from_blob(blob_service, glucose_levels_container,
                                 patient_records_container, day_to_predict)

    def model_uri(name):
        # Location of a persisted artifact inside the account's model container.
        return 'wasb://model@{}.blob.core.windows.net/{}'.format(account_name, name)

    si_pipe_model = PipelineModel.read().load(path=model_uri('si_pipe_model'))
    oh_pipe_model = PipelineModel.read().load(path=model_uri('oh_pipe_model'))
    classifier = RandomForestClassificationModel.read().load(path=model_uri('model'))

    scored = si_pipe_model.transform(sqlContext.createDataFrame(source_df))
    scored = oh_pipe_model.transform(scored)

    num_var_names = [
        'time_in_hospital', 'num_lab_procedures', 'num_procedures',
        'num_medications', 'number_outpatient', 'number_emergency',
        'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
        'glucose_min', 'glucose_max', 'glucose_mean', 'glucose_var',
    ]
    cat_var_names = [
        'race', 'gender', 'age', 'weight', 'admission_type_id',
        'discharge_disposition_id', 'admission_source_id', 'payer_code',
        'medical_specialty', 'max_glu_serum', 'A1Cresult', 'metformin',
        'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
        'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
        'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol',
        'troglitazone', 'tolazamide', 'insulin', 'glyburide-metformin',
        'glipizide-metformin', 'glimepiride-pioglitazone',
        'metformin-rosiglitazone', 'metformin-pioglitazone', 'change',
        'diabetesMed', 'diag_1_missing', 'diag_2_missing', 'diag_3_missing',
        'race_missing', 'weight_missing', 'payer_code_missing',
        'medical_specialty_missing',
    ]

    # Categorical inputs are read from their "<name>__encoded__" columns.
    encoded_cat_cols = [name + "__encoded__" for name in cat_var_names]
    assembler = VectorAssembler(inputCols=(num_var_names + encoded_cat_cols),
                                outputCol='features')
    scored = assembler.transform(scored).select('features')
    scored = classifier.transform(scored)

    scored_pandas = scored.toPandas()
    probability = scored_pandas['probability'].map(lambda vec: vec[1])
    output = pd.concat([source_df[['patient_nbr', 'discharge_date']], probability], axis=1)

    # Save the predictions; slashes in the date become dashes in the blob name.
    blob_service.put_block_blob_from_text(
        blob_name=str(day_to_predict).replace('/', '-') + '.csv',
        container_name=preds_container,
        text=output.to_csv(index=False))
def _load_model(model_path, dfs_tmpdir=None):
    """Load a ``PipelineModel``, staging it under a temporary DFS path if needed.

    :param model_path: location of the saved Spark model
    :param dfs_tmpdir: base directory for the temporary copy; ``DFS_TMP`` when ``None``
    :return: the loaded ``PipelineModel``
    """
    tmp_root = DFS_TMP if dfs_tmpdir is None else dfs_tmpdir
    staging_path = _tmp_path(tmp_root)
    # Spark ML expects the model to be stored on DFS, so copy it to a temp DFS
    # location first. The copy is deliberately left in place because Spark may
    # read from it at any point.
    load_from = _HadoopFileSystem.maybe_copy_from_local_file(model_path, staging_path)
    return PipelineModel.load(load_from)
def load_pyfunc(path):
    """
    Load the model as PyFunc.

    A ``local[1]`` Spark session is obtained (or reused) with
    ``spark.python.worker.reuse`` enabled, and the ``PipelineModel`` at
    ``path`` is wrapped together with that session.

    :param path: Local path
    :return: The model as PyFunc.
    """
    builder = pyspark.sql.SparkSession.builder
    builder = builder.config("spark.python.worker.reuse", True).master("local[1]")
    spark = builder.getOrCreate()
    pipeline_model = PipelineModel.load(path)
    return _PyFuncModelWrapper(spark, pipeline_model)
def pipeline_model_load(self, path):
    """
    Load a pipeline model.

    :param path: path combined with ``self.model_key`` by ``self.concat_path``
    :return: the loaded ``PipelineModel``
    """
    return PipelineModel.load(self.concat_path(path, self.model_key))
def get_stages(pipeline: PipelineModel):
    """
    Return the stage list of a fit or unfit Spark pipeline.

    Objects exposing ``getStages`` (the unfit case) are asked through that
    method; everything else (the fit case) is read from its ``stages``
    attribute.

    :param pipeline: a fit or unfit Spark pipeline
    :return: stages list
    """
    return pipeline.getStages() if hasattr(pipeline, 'getStages') else pipeline.stages
def _transform(self, dataset):
    """Run ``dataset`` through every stage in order and return the result.

    Stages implementing ``HasRecursiveTransform`` are called through
    ``transform_recursive`` with a ``PipelineModel`` built from all stages but
    the last. ``AnnotatorProperties`` stages whose ``getLazyAnnotator()`` is
    truthy are skipped. Every other stage uses plain ``transform``.
    """
    for stage in self.stages:
        if isinstance(stage, HasRecursiveTransform):
            # NOTE(review): stages[:-1] removes the final stage, which equals
            # "the current stage" only when the recursive stage is last - confirm.
            dataset = stage.transform_recursive(dataset, PipelineModel(self.stages[:-1]))
            continue

        # Lazy annotators are not run as part of the pipeline.
        if isinstance(stage, AnnotatorProperties) and stage.getLazyAnnotator():
            continue

        dataset = stage.transform(dataset)
    return dataset
def load_pyfunc(path):
    """
    Load a Python Function model from a local file.

    :param path: Local path.
    :return: The model as PyFunc.
    """
    builder = pyspark.sql.SparkSession.builder
    builder = builder.config("spark.python.worker.reuse", True).master("local[1]")
    spark = builder.getOrCreate()
    # No DFS is needed here: pyfunc should create its own SparkContext with no
    # executors, so the model is addressed through an absolute "file:" URI.
    local_uri = "file:" + os.path.abspath(path)
    return _PyFuncModelWrapper(spark, PipelineModel.load(local_uri))
def attach_tensorflow_model_to_pipeline(path,
                                        pipelineModel,
                                        inputCol,
                                        tfInput,
                                        tfOutput,
                                        predictionCol='predicted',
                                        tfDropout=None,
                                        toKeepDropout=False):
    """Append a loaded TensorFlow model to an existing pipeline model.

    All arguments except ``pipelineModel`` are forwarded positionally to
    ``load_tensorflow_model``.

    :param pipelineModel: pipeline model placed before the TensorFlow stage
    :return: ``PipelineModel`` with stages ``[pipelineModel, <loaded model>]``
    """
    tensorflow_stage = load_tensorflow_model(
        path, inputCol, tfInput, tfOutput, predictionCol, tfDropout, toKeepDropout)
    combined_stages = [pipelineModel, tensorflow_stage]
    return PipelineModel(stages=combined_stages)
def load_pyfunc(path):
    """
    Load a persisted Spark MLlib PipelineModel as a ``python_function`` model.

    >>> pyfunc_model = load_pyfunc("/tmp/pyfunc-spark-model")
    >>> predictions = pyfunc_model.predict(test_pandas_df)

    :param path: Local filesystem path to the model saved by :py:func:`mlflow.spark.log_model`.
    :rtype: Pyfunc format model with function
            ``model.predict(pandas DataFrame) -> pandas DataFrame``.
    """
    session_builder = pyspark.sql.SparkSession.builder \
        .config("spark.python.worker.reuse", True) \
        .master("local[1]")
    spark = session_builder.getOrCreate()
    # No DFS is needed here: pyfunc should create its own SparkContext with no
    # executors, so the model is read from an absolute local "file:" URI.
    pipeline_model = PipelineModel.load("file:" + os.path.abspath(path))
    return _PyFuncModelWrapper(spark, pipeline_model)
def load_model(path, run_id=None):
    """
    Load the Spark MLlib model from the given path.

    :param path: Local filesystem path or Run-relative artifact path to the model.
    :param run_id: Run ID. If provided it is combined with path to identify the model.
    :return: SparkML model.
    :rtype: pyspark.ml.pipeline.PipelineModel
    :raises Exception: if the MLmodel file does not declare the Spark flavor.
    """
    model_root = path
    if run_id is not None:
        model_root = mlflow.tracking._get_model_log_dir(model_name=path, run_id=run_id)

    mlmodel = Model.load(os.path.join(model_root, 'MLmodel'))
    if FLAVOR_NAME not in mlmodel.flavors:
        raise Exception("Model does not have {} flavor".format(FLAVOR_NAME))

    # The flavor configuration names the sub-path that holds the Spark model.
    model_data = mlmodel.flavors[FLAVOR_NAME]['model_data']
    return PipelineModel.load(os.path.join(model_root, model_data))
def main(spark, df_user, model_file, output_file):
    """Apply the first stage of a saved pipeline model to user data.

    :param spark: Spark session used to read the parquet input
    :param df_user: path of the parquet file with the user data
    :param model_file: path of the persisted ``PipelineModel``
    :param output_file: path the transformed data is written to as parquet
    """
    pipeline_model = PipelineModel.load(model_file)
    print("imported model")

    user_data = spark.read.parquet(df_user)
    print("imported user data")

    # Only stage 0 of the pipeline is applied, not the whole model.
    first_stage = pipeline_model.stages[0]
    mapped = first_stage.transform(user_data)
    print("mapped user_index")

    mapped.write.parquet(output_file)
def attach_pytorch_model_to_pipeline(
        network: nn.Module,
        pipeline_model: PipelineModel,
        inputCol: str = 'features',
        predictionCol: str = 'predicted',
        useVectorOut: bool = False) -> PipelineModel:
    """
    Append a pytorch model to an existing, fitted pyspark pipeline.

    :param network: Pytorch Network
    :param pipeline_model: An existing spark pipeline model (a fitted pipeline)
    :param inputCol: The input column of the dataframe for the pytorch network
    :param predictionCol: The prediction column.
    :param useVectorOut: option to use a vector output.
    :return: a spark PipelineModel with stages ``[pipeline_model, <torch model>]``
    """
    torch_stage = create_spark_torch_model(network, inputCol, predictionCol, useVectorOut)
    combined_stages = [pipeline_model, torch_stage]
    return PipelineModel(stages=combined_stages)
def test_save_pipeline(self):
    """Fit a one-stage SparkAsyncDL pipeline, save it, reload it and predict.

    The reloaded pipeline must produce fewer errors than rows on a 10-row
    sample of the generated data.
    """
    processed = self.generate_random_data()
    graph = build_graph(SparkFlowTests.create_random_model)
    estimator = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=graph,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='outer/Sigmoid:0',
        tfOptimizer='adam',
        tfLearningRate=.1,
        iters=20,
        partitions=2,
        predictionCol='predicted',
        labelCol='label')

    fitted = Pipeline(stages=[estimator]).fit(processed)
    fitted.write().overwrite().save('example_pipeline')

    # Reload from disk and unwrap before transforming.
    reloaded = PipelineModel.load('example_pipeline')
    restored = PysparkPipelineWrapper.unwrap(reloaded)

    sample = restored.transform(processed).take(10)
    nb_errors = SparkFlowTests.calculate_errors(sample)
    self.assertTrue(nb_errors < len(sample))
def load_model(path, run_id=None, dfs_tmpdir=None):
    """
    Load the Spark MLlib model from the path.

    :param path: Local filesystem path or run-relative artifact path to the model.
    :param run_id: Run ID. If provided, combined with ``path`` to identify the model.
    :param dfs_tmpdir: Base directory for the temporary DFS copy of the model;
                       ``DFS_TMP`` is used when ``None``.
    :return: SparkML model.
    :rtype: pyspark.ml.pipeline.PipelineModel
    :raises Exception: if the MLmodel file does not declare the Spark flavor.

    >>> from mlflow import spark
    >>> model = mlflow.spark.load_model("spark-model")
    >>> # Prepare test documents, which are unlabeled (id, text) tuples.
    >>> test = spark.createDataFrame([
    ...   (4, "spark i j k"),
    ...   (5, "l m n"),
    ...   (6, "spark hadoop spark"),
    ...   (7, "apache hadoop")], ["id", "text"])
    >>> # Make predictions on test documents.
    >>> prediction = model.transform(test)
    """
    if dfs_tmpdir is None:
        dfs_tmpdir = DFS_TMP

    model_root = path
    if run_id is not None:
        model_root = mlflow.tracking.utils._get_model_log_dir(model_name=path, run_id=run_id)

    mlmodel = Model.load(os.path.join(model_root, 'MLmodel'))
    if FLAVOR_NAME not in mlmodel.flavors:
        raise Exception("Model does not have {} flavor".format(FLAVOR_NAME))

    flavor_conf = mlmodel.flavors[FLAVOR_NAME]
    local_model_dir = os.path.join(model_root, flavor_conf['model_data'])
    dfs_copy = _tmp_path(dfs_tmpdir)

    # Spark ML expects the model to be stored on DFS, so copy it to a temp DFS
    # location first. The copy is deliberately not deleted, as Spark may read
    # from it at any point.
    _HadoopFileSystem.copy_from_local_file(local_model_dir, dfs_copy, removeSrc=False)
    loaded = PipelineModel.load(dfs_copy)
    eprint("Copied SparkML model to %s" % dfs_copy)
    return loaded
input_df = spark.read.option("header", True).csv(input_data) # transform the train data tmp_df = input_df.rdd \ .map(single_line_transform) # build the single user features,we should explict name the label,features. # features is the column with all features in a `Vectors.dense` train_df = tmp_df.reduceByKey(single_instance_hanler) \ .map(lambda r: Row(label=int(r[1].label), features=Vectors.dense(r[1].feats))) \ .toDF() # simple version, save and load # https://spark.apache.org/docs/latest/ml-tuning.html pipe_lr_model = simple_train_model(train_df) print(pipe_lr_model) pipe_lr_model.write().overwrite().save(model_save_path) xgb_model = pipe_lr_model.stages[0] xgb_model.saveBooster(model_save_path + "_booster") result = pipe_lr_model.transform(train_df) print(result.take(3)) # reload the model with pipeline loaded_model = PipelineModel.load(model_save_path) result = loaded_model.transform(train_df) print(result.take(3))
# Model-building algorithm (logistic regression estimator)
lr = LogisticRegression(maxIter=10, regParam=0.01, labelCol="gender")

# Fit the model
model = lr.fit(assembled_training)

# Generate predictions
model.transform(assembled_training).show()

# Pipeline
pipeline = Pipeline(stages=[assembler, lr])

# Fit the pipeline model
pipelineModel = pipeline.fit(training)

# Generate predictions with the pipeline model
pipelineModel.transform(training).show()

path1 = "/Users/beginspark/Temp/regression-model"
path2 = "/Users/beginspark/Temp/pipelinemodel"

# Save the models
model.write().overwrite().save(path1)
pipelineModel.write().overwrite().save(path2)

# Load the saved models
loadedModel = LogisticRegressionModel.load(path1)
loadedPipelineModel = PipelineModel.load(path2)

# stop must be called; the bare attribute reference `spark.stop` was a no-op
# that left the session running.
spark.stop()
def words_to_vector(tweets):
    """Transform ``tweets`` with the pipeline model stored at a fixed HDFS path.

    :param tweets: dataset passed to the loaded model's ``transform``
    :return: the transformed dataset
    """
    model_path = ('hdfs://spark01.ctweb.inweb.org.br:9000/limonero/models/'
                  'word_vector.0000')
    return PipelineModel.load(model_path).transform(tweets)
outputCol='features') # Demonstration of some options. Not all are required # Note: This uses the barrier execution mode, which is sensitive to the number of partitions spark_model = SparkTorch(inputCol='features', labelCol='_c0', predictionCol='predictions', torchObj=torch_obj, iters=50, verbose=1, validationPct=0.2, miniBatch=128) # Create and save the Pipeline p = Pipeline(stages=[vector_assembler, spark_model]).fit(df) p.write().overwrite().save('simple_cnn') # Example of loading the pipeline loaded_pipeline = PysparkPipelineWrapper.unwrap( PipelineModel.load('simple_cnn')) # Run predictions and evaluation predictions = loaded_pipeline.transform(df).persist() evaluator = MulticlassClassificationEvaluator(labelCol="_c0", predictionCol="predictions", metricName="accuracy") accuracy = evaluator.evaluate(predictions) print("Train accuracy = %g" % accuracy)
# NOTE(review): `f` is defined outside this view; presumably this statement is
# the body of a loop over the feature columns - confirm before applying.
df = df.withColumn(f, df[f].cast('string'))
df = df.dropna()
train, test = df.randomSplit([0.8, 0.2], seed=0)

# Index the class column into `label`, assemble the features, fit a linear SVC.
class_index = StringIndexer(inputCol='class', outputCol='label')
vector = VectorAssembler(inputCols=feature_cols, outputCol='feature')
model = LinearSVC(featuresCol='feature', labelCol='label')
pipeline = Pipeline(stages=[class_index, vector, model])
# From here on `pipeline` is the fitted PipelineModel.
pipeline = pipeline.fit(train)

if os.path.exists(MODEL_SAVE_PATH):
    shutil.rmtree(MODEL_SAVE_PATH)
# save() requires a path string; the original passed the fitted model object
# itself. Save to, and reload from, the path that was just cleared.
pipeline.write().overwrite().save(MODEL_SAVE_PATH)
load_pipeline = PipelineModel.load(MODEL_SAVE_PATH)

test_predict = load_pipeline.transform(test)
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='label')
print(evaluator.evaluate(test_predict, {evaluator.metricName: 'areaUnderROC'}))
print(evaluator.evaluate(test_predict, {evaluator.metricName: 'areaUnderPR'}))

# Score the feature columns alone (no class column) with the reloaded model.
origin_test_df = df.select(feature_cols)
predict_df = load_pipeline.transform(origin_test_df)
# show() prints the rows itself and returns None, so it is not wrapped in print().
predict_df.show(20)

from pyspark2pmml import PMMLBuilder
.map(single_line_transform) # 合并单个用户的训练数据,构建训练特征,需要注意的是需要明确指定label,features. # features是所有特征, 用Vectors.dense来构建特征的数据结构 train_df = tmp_df.reduceByKey(single_instance_hanler) \ .map(lambda r: Row(label=int(r[1].label), features=Vectors.dense(r[1].feats))) \ .toDF() # 简单版本的训练模型, 保存模型 # 关于模型超参调整和cv请参照,https://spark.apache.org/docs/latest/ml-tuning.html pipe_lr_model = simple_train_model(train_df) print(pipe_lr_model) pipe_lr_model.write().overwrite().save(model_save_path) # 重新加载模型 loaded_model = PipelineModel.load(model_save_path) result = loaded_model.transform(train_df) print(result.take(3)) # 带有超参调整的cv模型 # train,test数据切分 bst_model_path = model_save_path + "_bst_model" train_df, test_df = train_df.randomSplit([0.8, 0.2], seed=12345) bst_model = train_with_tune(train_df) bst_model.write().overwrite().save(bst_model_path) # 用训练得到最佳模型来对测试数据进行预测 # 预测结果的数据结构是类似下面的结构: # features = Vectors.dense(...) # label=0,
secure=True) cos.fget_object(cos_bucket_name, model_filepath, model_filepath) cos.fget_object(cos_bucket_name, train_data_filepath, train_data_filepath) cos.fget_object(cos_bucket_name, 'evaluation.json', 'evaluation.json') if aios_manifest_path: cos.fget_object(cos_bucket_name, aios_manifest_path, aios_manifest_path) os.system('unzip %s' % model_filepath) print('model ' + model_filepath + ' is downloaded') os.system('unzip %s' % train_data_filepath) print('train_data ' + train_data_filepath + ' is downloaded') sc = SparkContext() model = PipelineModel.load(model_filepath.split('.')[0]) pipeline = Pipeline(stages=model.stages) spark = SparkSession.builder.getOrCreate() train_data = spark.read.csv(path=train_data_filepath.split('.')[0], sep=",", header=True, inferSchema=True) ''' Remove previous deployed model ''' wml_client = WatsonMachineLearningAPIClient(WML_CREDENTIALS) model_deployment_ids = wml_client.deployments.get_uids() deleted_model_id = None for deployment_id in model_deployment_ids: deployment = wml_client.deployments.get_details(deployment_id) model_id = deployment['entity']['deployable_asset']['guid'] if deployment['entity']['name'] == DEPLOYMENT_NAME: print('Deleting deployment id', deployment_id)
from flask import Flask from flask import render_template, request, jsonify from plotly.graph_objs import Bar from pyspark.sql import SparkSession from pyspark.sql.functions import avg, col, concat, desc, explode, lit, min, max, split, udf, isnull, from_unixtime, instr, when from pyspark.ml.pipeline import PipelineModel app = Flask(__name__) # load data spark = SparkSession.builder.appName("Spark").getOrCreate() # load model model = PipelineModel.load("../models/lr") # index webpage displays cool visuals and receives user input text for model @app.route('/') @app.route('/index') def index(): path = "../data/page_churn_byUser.csv" df = spark.read.csv(path, header=True, inferSchema=True) df.persist() genre_counts1 = df.filter(df.churn == 0) genre_names1 = genre_counts1.select( "page").distinct().toPandas()["page"].tolist() genre_counts2 = df.filter(df.churn == 1) genre_names2 = genre_counts2.select( "page").distinct().toPandas()["page"].tolist()