def test_parallelism_does_not_change_output(self):
    """Fitting OneVsRest with parallelism=1 and parallelism=2 must yield the
    same per-class sub-models (coefficients and intercepts)."""
    df = self.spark.createDataFrame(
        [
            (0.0, Vectors.dense(1.0, 0.8)),
            (1.0, Vectors.sparse(2, [], [])),
            (2.0, Vectors.dense(0.5, 0.5)),
        ],
        ["label", "features"],
    )
    ovrPar1 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=0.01),
                        parallelism=1)
    modelPar1 = ovrPar1.fit(df)
    ovrPar2 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=0.01),
                        parallelism=2)
    modelPar2 = ovrPar2.fit(df)
    # Compare the serial and parallel sub-models pairwise.
    for serial, parallel in zip(modelPar1.models, modelPar2.models):
        self.assertTrue(
            np.allclose(serial.coefficients.toArray(),
                        parallel.coefficients.toArray(), atol=1e-4))
        self.assertTrue(
            np.allclose(serial.intercept, parallel.intercept, atol=1e-4))
def test_support_for_weightCol(self):
    """OneVsRest must accept a weightCol whether or not the base classifier
    itself supports sample weights."""
    rows = [
        (0.0, Vectors.dense(1.0, 0.8), 1.0),
        (1.0, Vectors.sparse(2, [], []), 1.0),
        (2.0, Vectors.dense(0.5, 0.5), 1.0),
    ]
    df = self.spark.createDataFrame(rows, ["label", "features", "weight"])
    # Base classifier that inherits HasWeightCol.
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr, weightCol="weight")
    self.assertIsNotNone(ovr.fit(df))
    # Base classifier that does not inherit HasWeightCol.
    fm = FMClassifier()
    ovr2 = OneVsRest(classifier=fm, weightCol="weight")
    self.assertIsNotNone(ovr2.fit(df))
def test_parallelism_doesnt_change_output(self):
    """Results must be independent of the parallelism setting."""
    rows = [(0.0, Vectors.dense(1.0, 0.8)),
            (1.0, Vectors.sparse(2, [], [])),
            (2.0, Vectors.dense(0.5, 0.5))]
    df = self.spark.createDataFrame(rows, ["label", "features"])
    modelPar1 = OneVsRest(
        classifier=LogisticRegression(maxIter=5, regParam=0.01),
        parallelism=1).fit(df)
    modelPar2 = OneVsRest(
        classifier=LogisticRegression(maxIter=5, regParam=0.01),
        parallelism=2).fit(df)
    for m1, m2 in zip(modelPar1.models, modelPar2.models):
        self.assertTrue(np.allclose(m1.coefficients.toArray(),
                                    m2.coefficients.toArray(), atol=1e-4))
        self.assertTrue(np.allclose(m1.intercept, m2.intercept, atol=1e-4))
def test_support_for_weightCol(self):
    """weightCol must be usable regardless of base-classifier weight support."""
    rows = [(0.0, Vectors.dense(1.0, 0.8), 1.0),
            (1.0, Vectors.sparse(2, [], []), 1.0),
            (2.0, Vectors.dense(0.5, 0.5), 1.0)]
    df = self.spark.createDataFrame(rows, ["label", "features", "weight"])
    # classifier inherits hasWeightCol
    ovr = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=0.01),
                    weightCol="weight")
    self.assertIsNotNone(ovr.fit(df))
    # classifier doesn't inherit hasWeightCol
    ovr2 = OneVsRest(classifier=DecisionTreeClassifier(), weightCol="weight")
    self.assertIsNotNone(ovr2.fit(df))
def test_should_log_model(dataset_binomial, dataset_multinomial):
    """Models are logged only when their class appears in the allowlist.

    Covers a plain LogisticRegressionModel, an OneVsRestModel, and the
    warning emitted when a model class is filtered out.
    """
    mlflow.pyspark.ml.autolog(log_models=True)
    lor = LogisticRegression()
    ova1 = OneVsRest(classifier=lor)
    mlor_model = lor.fit(dataset_multinomial)
    assert _should_log_model(mlor_model)
    ova1_model = ova1.fit(dataset_multinomial)
    assert _should_log_model(ova1_model)
    with mock.patch(
        "mlflow.pyspark.ml._log_model_allowlist",
        {
            "pyspark.ml.regression.LinearRegressionModel",
            "pyspark.ml.classification.OneVsRestModel",
        },
    ), mock.patch("mlflow.pyspark.ml._logger.warning") as mock_warning:
        lr = LinearRegression()
        lr_model = lr.fit(dataset_binomial)
        assert _should_log_model(lr_model)
        lor_model = lor.fit(dataset_binomial)
        assert not _should_log_model(lor_model)
        # BUG FIX: `mock_warning.called_once_with(...)` is not an assertion --
        # attribute access on a Mock just creates a child mock, so the original
        # line always "passed".  Use a real mock assertion instead.
        mock_warning.assert_any_call(
            _get_warning_msg_for_skip_log_model(lor_model))
        assert not _should_log_model(ova1_model)
def test_one_vs_rest(self):
    """Convert a fitted Spark OneVsRest model to ONNX and check that the ONNX
    runtime reproduces the Spark predictions."""
    this_script_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(this_script_dir, "data",
                              "sample_multiclass_classification_data.txt")
    data = self.spark.read.format("libsvm").load(input_path)
    lr = LogisticRegression(maxIter=100, tol=0.0001, regParam=0.01)
    model = OneVsRest(classifier=lr).fit(data)
    # The converter needs the feature-vector length; take it from the first row.
    feature_count = data.first()[1].size
    model_onnx = convert_sparkml(
        model, 'Sparkml OneVsRest',
        [('features', FloatTensorType([1, feature_count]))],
        spark_session=self.spark)
    self.assertTrue(model_onnx is not None)
    # Score the same data with the Spark model and with the exported ONNX model.
    predicted = model.transform(data)
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    expected = [predicted.toPandas().prediction.values.astype(numpy.float32)]
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlOneVsRest")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['prediction'], data_np,
                                           onnx_model_path)
    compare_results(expected, output, decimal=5)
def one_vs_rest(training, test):
    # Train a one-vs-rest multiclass model on top of binary logistic regression
    # and print the accuracy on the test set.
    # NOTE(review): Python 2 syntax (`print` statement); the printed Chinese
    # string is user-facing runtime output and is left as-is.
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(training)
    result = model.transform(test)
    # Accuracy = fraction of rows whose predicted label equals the true label.
    accuracy = 1.0 * result.rdd.filter(
        lambda l: l.label == l.prediction).count() / test.count()
    print "OneVsRest 模型的正确率为:", accuracy
def one_vs_rest_classifier(trainingDataFrame, classifier=None):
    """Fit a OneVsRest model on `trainingDataFrame`.

    If no base classifier is supplied, LogisticRegression(regParam=0.01) is
    used.  Returns a dict with the fitted model under the "model" key.
    """
    base = classifier if classifier else LogisticRegression(regParam=0.01)
    ovrModel = OneVsRest(classifier=base).fit(trainingDataFrame)
    return {"model": ovrModel}
def test_param_map_captures_wrapped_params(dataset_binomial):
    """_get_instance_param_map must expose the meta-estimator's own params plus
    the wrapped classifier's params, prefixed with the classifier uid."""
    lor = LogisticRegression(maxIter=3, standardization=False)
    ova = OneVsRest(classifier=lor, labelCol="abcd")
    param_map = _get_instance_param_map(ova)
    wrapped = lor.uid
    assert param_map["labelCol"] == "abcd"
    assert param_map["classifier"] == wrapped
    assert param_map[wrapped + ".maxIter"] == 3
    assert not param_map[wrapped + ".standardization"]
    assert param_map[wrapped + ".tol"] == lor.getOrDefault(lor.tol)

    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        ova.fit(dataset_binomial.withColumn("abcd", dataset_binomial.label))
    run_id = run.info.run_id
    run_data = get_run_data(run_id)
    # Everything in the flattened param map must have been logged.
    assert run_data.params == truncate_param_dict(
        stringify_dict_values(_get_instance_param_map(ova)))
def test_output_columns(self):
    """transform() must append rawPrediction and prediction to the input columns."""
    rows = [(0.0, Vectors.dense(1.0, 0.8)),
            (1.0, Vectors.sparse(2, [], [])),
            (2.0, Vectors.dense(0.5, 0.5))]
    df = self.spark.createDataFrame(rows, ["label", "features"])
    ovr = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=0.01),
                    parallelism=1)
    output = ovr.fit(df).transform(df)
    self.assertEqual(output.columns,
                     ["label", "features", "rawPrediction", "prediction"])
def test_raw_prediction_column_is_of_vector_type(self):
    """SPARK-35142 regression test: rawPrediction must be a vector, not a string."""
    rows = [(0.0, Vectors.dense(1.0, 0.8)),
            (1.0, Vectors.sparse(2, [], [])),
            (2.0, Vectors.dense(0.5, 0.5))]
    df = self.spark.createDataFrame(rows, ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    model = OneVsRest(classifier=lr, parallelism=1).fit(df)
    first_row = model.transform(df).head()
    self.assertIsInstance(first_row["rawPrediction"], DenseVector)
def test_param_map_captures_wrapped_params(dataset_binomial):
    """get_params_to_log must flatten the wrapped classifier's params under its
    class-name prefix, and the autologged run must record exactly that map."""
    lor = LogisticRegression(maxIter=3, standardization=False)
    ova = OneVsRest(classifier=lor, labelCol="abcd")
    param_map = get_params_to_log(ova)
    expected_pairs = [
        ("labelCol", "abcd"),
        ("classifier", "LogisticRegression"),
        ("LogisticRegression.maxIter", 3),
        ("LogisticRegression.tol", lor.getOrDefault(lor.tol)),
    ]
    for key, value in expected_pairs:
        assert param_map[key] == value
    assert not param_map["LogisticRegression.standardization"]

    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        ova.fit(dataset_binomial.withColumn("abcd", dataset_binomial.label))
        # The logged estimator hierarchy must match the generated metadata.
        metadata = _gen_estimator_metadata(ova)
        estimator_info = load_json_artifact("estimator_info.json")
        assert metadata.hierarchy == estimator_info["hierarchy"]
    run_id = run.info.run_id
    run_data = get_run_data(run_id)
    assert run_data.params == truncate_param_dict(
        stringify_dict_values(get_params_to_log(ova)))
def test_copy(self):
    """copy() with a param override must affect only the copy, never the source."""
    df = self.spark.createDataFrame(
        [(0.0, Vectors.dense(1.0, 0.8)),
         (1.0, Vectors.sparse(2, [], [])),
         (2.0, Vectors.dense(0.5, 0.5))],
        ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    # Estimator copy with a maxIter override.
    ovr_copy = ovr.copy({lr.maxIter: 10})
    self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
    self.assertEqual(ovr_copy.getClassifier().getMaxIter(), 10)
    # Fitted-model copy with a predictionCol override.
    model = ovr.fit(df)
    model_copy = model.copy({model.predictionCol: "indexed"})
    self.assertEqual(model_copy.getPredictionCol(), "indexed")
def test_copy(self):
    """Verify OneVsRest.copy / OneVsRestModel.copy apply extra params to the
    clone without mutating the original object."""
    rows = [
        (0.0, Vectors.dense(1.0, 0.8)),
        (1.0, Vectors.sparse(2, [], [])),
        (2.0, Vectors.dense(0.5, 0.5)),
    ]
    df = self.spark.createDataFrame(rows, ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    ovr1 = ovr.copy({lr.maxIter: 10})
    # Original keeps maxIter=5; only the copy picks up the override.
    self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
    self.assertEqual(ovr1.getClassifier().getMaxIter(), 10)
    model = ovr.fit(df)
    model1 = model.copy({model.predictionCol: "indexed"})
    self.assertEqual(model1.getPredictionCol(), "indexed")
def test_should_log_model(dataset_binomial, dataset_multinomial, dataset_text):
    """_should_log_model must honour the allowlist for plain models,
    meta-estimator models and (nested) pipeline models, and warn when a model
    class is filtered out.
    """
    mlflow.pyspark.ml.autolog(log_models=True)
    lor = LogisticRegression()
    ova1 = OneVsRest(classifier=lor)
    with mlflow.start_run():
        mlor_model = lor.fit(dataset_multinomial)
    assert _should_log_model(mlor_model)
    with mlflow.start_run():
        ova1_model = ova1.fit(dataset_multinomial)
    assert _should_log_model(ova1_model)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=2)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    with mlflow.start_run():
        pipeline_model = pipeline.fit(dataset_text)
    assert _should_log_model(pipeline_model)

    nested_pipeline = Pipeline(stages=[tokenizer, Pipeline(stages=[hashingTF, lr])])
    with mlflow.start_run():
        nested_pipeline_model = nested_pipeline.fit(dataset_text)
    assert _should_log_model(nested_pipeline_model)

    with mock.patch(
        "mlflow.pyspark.ml._log_model_allowlist",
        {
            "pyspark.ml.regression.LinearRegressionModel",
            "pyspark.ml.classification.OneVsRestModel",
            "pyspark.ml.pipeline.PipelineModel",
        },
    ), mock.patch("mlflow.pyspark.ml._logger.warning") as mock_warning:
        lr = LinearRegression()
        with mlflow.start_run():
            lr_model = lr.fit(dataset_binomial)
        assert _should_log_model(lr_model)
        with mlflow.start_run():
            lor_model = lor.fit(dataset_binomial)
        assert not _should_log_model(lor_model)
        # BUG FIX: `mock_warning.called_once_with(...)` is not an assertion --
        # attribute access on a Mock just creates a child mock, so the original
        # line always "passed".  Use a real mock assertion instead.
        mock_warning.assert_any_call(
            _get_warning_msg_for_skip_log_model(lor_model))
        # Pipelines contain a disallowed LogisticRegressionModel stage.
        assert not _should_log_model(ova1_model)
        assert not _should_log_model(pipeline_model)
        assert not _should_log_model(nested_pipeline_model)
def test_getAllNestedStages(self):
    """MetaAlgorithmReadWrite.getAllNestedStages must return every stage nested
    inside pipelines and meta-estimators (compared by uid)."""

    def _assert_same_uids(actual_stages, expected_stages):
        # Compare as uid sets; order is irrelevant.
        self.assertEqual({s.uid for s in actual_stages},
                         {s.uid for s in expected_stages})

    df1 = self.spark.createDataFrame(
        [(Vectors.dense([1.0, 2.0]), 1.0),
         (Vectors.dense([-1.0, -2.0]), 0.0)],
        ["features", "label"],
    )
    df2 = self.spark.createDataFrame(
        [(1.0, 2.0, 1.0),
         (1.0, 2.0, 0.0)],
        ["a", "b", "label"],
    )
    vs = VectorAssembler(inputCols=["a", "b"], outputCol="features")
    lr = LogisticRegression()
    pipeline = Pipeline(stages=[vs, lr])
    pipelineModel = pipeline.fit(df2)
    ova = OneVsRest(classifier=lr)
    ovaModel = ova.fit(df1)
    ova_pipeline = Pipeline(stages=[vs, ova])
    nested_pipeline = Pipeline(stages=[ova_pipeline])

    _assert_same_uids(MetaAlgorithmReadWrite.getAllNestedStages(pipeline),
                      [pipeline, vs, lr])
    _assert_same_uids(MetaAlgorithmReadWrite.getAllNestedStages(pipelineModel),
                      [pipelineModel] + pipelineModel.stages)
    _assert_same_uids(MetaAlgorithmReadWrite.getAllNestedStages(ova),
                      [ova, lr])
    _assert_same_uids(MetaAlgorithmReadWrite.getAllNestedStages(ovaModel),
                      [ovaModel, lr] + ovaModel.models)
    _assert_same_uids(MetaAlgorithmReadWrite.getAllNestedStages(nested_pipeline),
                      [nested_pipeline, ova_pipeline, vs, ova, lr])
def test_onevsrest(self):
    """Persistence round-trip for both the OneVsRest estimator and its model."""
    temp_path = tempfile.mkdtemp()
    df = self.spark.createDataFrame(
        [(0.0, Vectors.dense(1.0, 0.8)),
         (1.0, Vectors.sparse(2, [], [])),
         (2.0, Vectors.dense(0.5, 0.5))] * 10,
        ["label", "features"])
    ovr = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=0.01))
    model = ovr.fit(df)
    # Estimator save/load.
    ovrPath = temp_path + "/ovr"
    ovr.save(ovrPath)
    self._compare_pipelines(ovr, OneVsRest.load(ovrPath))
    # Fitted-model save/load.
    modelPath = temp_path + "/ovrModel"
    model.save(modelPath)
    self._compare_pipelines(model, OneVsRestModel.load(modelPath))
def test_onevsrest(self):
    """OneVsRest and its fitted model must survive a save()/load() round trip."""
    temp_path = tempfile.mkdtemp()
    rows = [(0.0, Vectors.dense(1.0, 0.8)),
            (1.0, Vectors.sparse(2, [], [])),
            (2.0, Vectors.dense(0.5, 0.5))] * 10
    df = self.spark.createDataFrame(rows, ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(df)
    # Round-trip the estimator and the model through disk and compare.
    for obj, suffix, loader in (
        (ovr, "/ovr", OneVsRest.load),
        (model, "/ovrModel", OneVsRestModel.load),
    ):
        path = temp_path + suffix
        obj.save(path)
        self._compare_pipelines(obj, loader(path))
def ovr_classifier(training, testing):
    """Fit OneVsRest over logistic regression on `training` and return the
    multiclass accuracy on `testing`."""
    from pyspark.ml.classification import LogisticRegression, OneVsRest
    # Base binary classifier.
    base = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
    # Fit the one-vs-rest multiclass model and score the test set.
    ovr_model = OneVsRest(classifier=base).fit(training)
    ovr_predictions = ovr_model.transform(testing)
    ovr_evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
    return ovr_evaluator.evaluate(ovr_predictions)
def test_meta_estimator_fit(dataset_binomial):
    """Autologged meta-estimator fit must log params/tags/model to exactly one
    run and spawn no nested runs."""
    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        svc = LinearSVC()
        ova = OneVsRest(classifier=svc)
        ova_model = ova.fit(dataset_binomial)
        run_id = run.info.run_id
    run_data = get_run_data(run_id)
    assert run_data.params == truncate_param_dict(
        stringify_dict_values(get_params_to_log(ova)))
    assert run_data.tags == get_expected_class_tags(ova)
    assert MODEL_DIR in run_data.artifacts
    loaded_model = load_model_by_run_id(run_id)
    assert loaded_model.stages[0].uid == ova_model.uid
    # No child run tagged with this run as its parent should exist.
    query = "tags.{} = '{}'".format(MLFLOW_PARENT_RUN_ID, run.info.run_id)
    assert len(mlflow.search_runs([run.info.experiment_id])) == 1
    assert len(mlflow.search_runs([run.info.experiment_id], query)) == 0
def main():
    """End-to-end OneVsRest demo on the Washington Post shootings dataset."""
    spark = create_session('wash_post_shootings')
    spark.sparkContext.setLogLevel('ERROR')
    try:
        sh = Shootings(create_df(spark))
        sh.show()
        train, test = sh.get_df().randomSplit([0.8, 0.2])
        # Base binary classifier wrapped by OneVsRest for multiclass labels.
        lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True,
                                featuresCol='features', labelCol='label')
        ovrModel = OneVsRest(classifier=lr).fit(train)
        predictions = ovrModel.transform(test)
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
        accuracy = evaluator.evaluate(predictions)
        print("Test Error = %g" % (1.0 - accuracy))
        print("Accuracy = %.2f" % (accuracy * 100))
    except Exception as e:
        print(e)
    finally:
        spark.sparkContext.stop()
def main():
    """Train and evaluate a OneVsRest(LogisticRegression) classifier on MNIST."""
    parser = argparse.ArgumentParser(description='Pyspark Training')
    parser.add_argument('--data_dir', type=str, default='../../data',
                        help='Data location.')
    args = parser.parse_args()
    # Load MNIST and hold back a test split (unused below).
    X, y = load_mnist(args.data_dir)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    # Convert numpy arrays to a Pyspark dataframe.
    df = create_df(y_train, X_train)
    # 10% train / 90% validation split of the Spark dataframe.
    train, val = df.randomSplit([0.1, 0.90])
    lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
    ovrModel = OneVsRest(classifier=lr).fit(train)
    predictions = ovrModel.transform(val)
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Validation Accuracy = {}".format(accuracy))
    print("Validation Error = {}".format(1.0 - accuracy))
def test_save_load(self):
    """OneVsRest and OneVsRestModel must round-trip through save/load."""
    temp_path = tempfile.mkdtemp()
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame(
        [(0.0, Vectors.dense(1.0, 0.8)),
         (1.0, Vectors.sparse(2, [], [])),
         (2.0, Vectors.dense(0.5, 0.5))],
        ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(df)
    # Estimator round-trip: params must survive.
    ovrPath = temp_path + "/ovr"
    ovr.save(ovrPath)
    loadedOvr = OneVsRest.load(ovrPath)
    for getter in ("getFeaturesCol", "getLabelCol"):
        self.assertEqual(getattr(loadedOvr, getter)(), getattr(ovr, getter)())
    self.assertEqual(loadedOvr.getClassifier().uid, ovr.getClassifier().uid)
    # Model round-trip: per-class sub-models keep their uids.
    modelPath = temp_path + "/ovrModel"
    model.save(modelPath)
    loadedModel = OneVsRestModel.load(modelPath)
    for original_sub, loaded_sub in zip(model.models, loadedModel.models):
        self.assertEqual(original_sub.uid, loaded_sub.uid)
def test_save_load(self):
    """Saved OneVsRest estimators and models must load back with identical
    params and sub-model uids."""
    temp_path = tempfile.mkdtemp()
    sqlContext = SQLContext(self.sc)
    rows = [(0.0, Vectors.dense(1.0, 0.8)),
            (1.0, Vectors.sparse(2, [], [])),
            (2.0, Vectors.dense(0.5, 0.5))]
    df = sqlContext.createDataFrame(rows, ["label", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(df)
    ovrPath = temp_path + "/ovr"
    ovr.save(ovrPath)
    loadedOvr = OneVsRest.load(ovrPath)
    # Estimator params survive the round trip.
    self.assertEqual(loadedOvr.getFeaturesCol(), ovr.getFeaturesCol())
    self.assertEqual(loadedOvr.getLabelCol(), ovr.getLabelCol())
    self.assertEqual(loadedOvr.getClassifier().uid, ovr.getClassifier().uid)
    modelPath = temp_path + "/ovrModel"
    model.save(modelPath)
    loadedModel = OneVsRestModel.load(modelPath)
    # Sub-models keep their uids.
    for m, n in zip(model.models, loadedModel.models):
        self.assertEqual(m.uid, n.uid)
def test_should_log_model_with_wildcards_in_allowlist(dataset_binomial, dataset_multinomial):
    """Wildcard allowlist entries like "pyspark.ml.regression.*" must match any
    model class in that module; unlisted classes are rejected."""
    mlflow.pyspark.ml.autolog(log_models=True)
    lor = LogisticRegression()
    ova1 = OneVsRest(classifier=lor)
    ova1_model = ova1.fit(dataset_multinomial)
    allowlist = {
        "pyspark.ml.regression.*",
        "pyspark.ml.classification.LogisticRegressionModel",
        "pyspark.ml.feature.*",
    }
    with mock.patch("mlflow.pyspark.ml._log_model_allowlist", allowlist):
        lr = LinearRegression()
        with mlflow.start_run():
            lr_model = lr.fit(dataset_binomial)
        assert _should_log_model(lr_model)  # matched by the regression wildcard
        with mlflow.start_run():
            lor_model = lor.fit(dataset_binomial)
        assert _should_log_model(lor_model)  # matched by the explicit entry
        assert not _should_log_model(ova1_model)  # OneVsRestModel not listed
# Normalize each feature to have unit standard deviation. train_df_tmp = scalerModel.transform(parsed_labelpoint_df) train_df = train_df_tmp.drop("features").withColumnRenamed( "scaledFeatures", "features") # show the frequency of each label tmp_df = train_df.groupBy("label").count() tmp_df.show(10) lr = LogisticRegression(maxIter=10, regParam=0.01, elasticNetParam=0.2) # instantiate the One Vs Rest Classifier. ovr = OneVsRest(classifier=lr) # train the multiclass model. lrModel = ovr.fit(train_df) shutil.rmtree(logreg_path, ignore_errors=True) lrModel.save(logreg_path) if opt == "test" or opt == "all": print "Loading test data..." test_data = sc.textFile(golden_file) parsed_test_data = test_data.map(kup.parse_multiClass) parsed_test_data_df = spark.createDataFrame(parsed_test_data, ["label", "features"]) # load the scaler and perform feature scaling on test data scalerModel = StandardScalerModel.load(scalerPath) test_df_tmp = scalerModel.transform(parsed_test_data_df) test_df = test_df_tmp.drop("features").withColumnRenamed(
# Import the One-vs-Rest function from Pyspark.
from pyspark.ml.classification import OneVsRest

# Multiclass classification with a random-forest base learner (no resampling).
rf = RandomForestClassifier(numTrees=100, maxDepth=4, maxBins=32,
                            featuresCol='pca_features', labelCol='label')
one_vs_rest_model = OneVsRest(classifier=rf)
# Fit on the (un-resampled) multiclass training data, then score the test data.
rf_ovr_model = one_vs_rest_model.fit(train_multi)
rf_ovr_predictions = rf_ovr_model.transform(test_multi)
# Per-label metrics for each of the 21 classes.
print('Random Forest with OneVsRest:')
for label in range(21):
    get_metrics(rf_ovr_predictions, label=label)
# Random Forest with OneVsRest:
# Label: 0
# accuracy: 0.566459184078024
# f1_score: 0.7225638839838437
# precision: 0.5658787811352546
# recall: 0.9992421478092458
# One image directory per class ("l0".."l4"); attach the numeric label to each
# and union them into a single training dataframe.
per_label_dfs = [readImages(imageDir + "l%d" % i).withColumn("label", lit(i))
                 for i in range(5)]
finalTrainDf = per_label_dfs[0]
for extra in per_label_dfs[1:]:
    finalTrainDf = finalTrainDf.unionAll(extra)
trainSize = finalTrainDf.count()
print(str(trainSize))
# InceptionV3 features feed a one-vs-rest decision-tree classifier.
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
method = DecisionTreeClassifier(labelCol="label", featuresCol="features")
ovr = OneVsRest(classifier=method)
featureVector = featurizer.transform(finalTrainDf)
model_dc = ovr.fit(featureVector)
model_dc.write().overwrite().save(imageDir +
                                  'model-decision-tree-classifier-new')
# Evaluate training-set accuracy of the saved model.
predictions = model_dc.transform(featureVector)
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Train Data set accuracy with decision-tree-classifier for " +
      str(trainSize) + " images = " + str(accuracy) + " and error " +
      str(1 - accuracy))
#tmpTrainDf = readImages(imageDir + "/test1") tmpTrainRDD = tmpTrainDf.rdd.map(lambda x : Row(filepath = x[0], image = x[1], fileName = getFileName(x[0]))) tmpTrainX = tmpTrainRDD.toDF() csvTrainTmp = spark.read.format("csv").option("header", "true").load(imageDir + "/train25.csv") #csvTrainTmp = spark.read.format("csv").option("header", "true").load(imageDir + "/test1.csv") csvTrainRDD = csvTrainTmp.rdd.map(lambda x : Row(image = x[0], label = int(x[1]))) csvTrain = csvTrainRDD.toDF() finalTrainDataFrame = tmpTrainX.join(csvTrain, tmpTrainX.fileName == csvTrain.image, 'inner').drop(csvTrain.image) featurizer = DeepImageFeaturizer(inputCol="image",outputCol="features", modelName="InceptionV3") method = LogisticRegression(maxIter=50, regParam=0.05, elasticNetParam=0.3, labelCol="label") ovr = OneVsRest(classifier = method) featureVector = featurizer.transform(finalTrainDataFrame).persist() model_svm = ovr.fit(featureVector) model_svm.write().overwrite().save('hdfs://192.168.65.188:8020/paih/model-support-vector-machine') predictions = model_svm.transform(featureVector).persist() predictionAndLabels = predictions.select("prediction", "label") evaluator = MulticlassClassificationEvaluator(metricName="accuracy") print("Train Data set accuracy with Support vector Machine = " + str(evaluator.evaluate(predictionAndLabels)) + " and error " + str(1 - evaluator.evaluate(predictionAndLabels))) #predictions.show() #apply evaluator on constructed model for training data. #apply deep learning for feature extraction #apply genetic algo for feature selection #apply ml model(eg. lr) #apply genetic algorithm for feature selection
csvTrain = csvTrainRDD.toDF()
# Join image rows to their CSV labels on file name; drop the duplicate column.
finalTrainDataFrame = tmpTrainX.join(csvTrain,
                                     tmpTrainX.fileName == csvTrain.image,
                                     'inner').drop(csvTrain.image)
# InceptionV3 features feed a one-vs-rest logistic regression.
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
method = LogisticRegression(maxIter=50, regParam=0.05, elasticNetParam=0.3,
                            labelCol="label")
ovr = OneVsRest(classifier=method)
featureVector = featurizer.transform(finalTrainDataFrame).persist()
model_lr = ovr.fit(featureVector)
model_lr.write().overwrite().save(
    'hdfs://192.168.65.188:8020/paih/model-logistic-regression')
predictions = model_lr.transform(featureVector).persist()
predictionAndLabels = predictions.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
# FIX: evaluate once instead of twice -- each evaluate() call used to trigger
# a separate full Spark job for the same deterministic metric.
train_accuracy = evaluator.evaluate(predictionAndLabels)
print("Train Data set accuracy with Logistic Regression = " +
      str(train_accuracy) + " and error " + str(1 - train_accuracy))
# $example on$
# Load the multiclass LIBSVM sample data.
inputData = spark.read.format("libsvm") \
    .load("data/mllib/sample_multiclass_classification_data.txt")

# 80/20 train/test split.
(train, test) = inputData.randomSplit([0.8, 0.2])

# Base binary classifier.
lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

# Wrap it in a one-vs-rest classifier.
ovr = OneVsRest(classifier=lr)

# Fit the multiclass model and score the held-out data.
ovrModel = ovr.fit(train)
predictions = ovrModel.transform(test)

# NOTE(review): metricName="precision" only exists in old Spark releases
# (replaced by "accuracy" in Spark 2.0) -- confirm the targeted version.
evaluator = MulticlassClassificationEvaluator(metricName="precision")
precision = evaluator.evaluate(predictions)
print("Test Error : " + str(1 - precision))
# $example off$
spark.stop()
def Train(self):
    """Train a logistic-regression classifier (plain or one-vs-rest), evaluate
    it on a held-out split, and persist pipeline/model/summary artifacts.
    """
    st = time.time()
    categorical_columns = self._dataframe_helper.get_string_columns()
    numerical_columns = self._dataframe_helper.get_numeric_columns()
    result_column = self._dataframe_context.get_result_column()
    # The target column must not be used as a feature.
    categorical_columns = [x for x in categorical_columns if x != result_column]
    model_path = self._dataframe_context.get_model_path()
    pipeline_filepath = model_path + "/LogisticRegression/TrainedModels/pipeline"
    model_filepath = model_path + "/LogisticRegression/TrainedModels/model"
    summary_filepath = model_path + "/LogisticRegression/ModelSummary/summary.json"
    df = self._data_frame
    pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns,
                                                  categorical_columns,
                                                  result_column)
    pipelineModel = pipeline.fit(df)
    indexed = pipelineModel.transform(df)
    MLUtils.save_pipeline_or_model(pipelineModel, pipeline_filepath)
    trainingData, validationData = MLUtils.get_training_and_validation_data(
        indexed, result_column, 0.8)
    OriginalTargetconverter = IndexToString(inputCol="label",
                                            outputCol="originalTargetColumn")
    levels = trainingData.select("label").distinct().collect()
    if self._classifier == "lr":
        # Binomial vs multinomial logistic regression by class count.
        if len(levels) == 2:
            lr = LogisticRegression(maxIter=10, regParam=0.3,
                                    elasticNetParam=0.8)
        elif len(levels) > 2:
            lr = LogisticRegression(maxIter=10, regParam=0.3,
                                    elasticNetParam=0.8, family="multinomial")
        fit = lr.fit(trainingData)
    elif self._classifier == "OneVsRest":
        lr = LogisticRegression()
        ovr = OneVsRest(classifier=lr)
        fit = ovr.fit(trainingData)
    transformed = fit.transform(validationData)
    MLUtils.save_pipeline_or_model(fit, model_filepath)
    # NOTE(review): OneVsRestModel has no coefficientMatrix/interceptVector;
    # these prints would raise for the "OneVsRest" branch -- confirm intent.
    print(fit.coefficientMatrix)
    print(fit.interceptVector)
    label_classes = transformed.select("label").distinct().collect()
    results = transformed.select(["prediction", "label"])
    if len(label_classes) > 2:
        evaluator = MulticlassClassificationEvaluator(
            predictionCol="prediction")
        evaluator.evaluate(results)
        self._model_summary["model_accuracy"] = evaluator.evaluate(
            results, {evaluator.metricName: "accuracy"})  # accuracy of the model
    else:
        # NOTE(review): BinaryClassificationEvaluator expects a raw-prediction
        # column, and areaUnderPR is stored under "model_accuracy" -- confirm.
        evaluator = BinaryClassificationEvaluator(
            rawPredictionCol="prediction")
        evaluator.evaluate(results)
        self._model_summary["model_accuracy"] = evaluator.evaluate(
            results, {evaluator.metricName: "areaUnderPR"})
    self._model_summary["runtime_in_seconds"] = round((time.time() - st), 2)
    transformed = OriginalTargetconverter.transform(transformed)
    # Map the numeric "label" index back to the original class names using the
    # metadata attached by the indexer.
    label_indexer_dict = [
        dict(enumerate(field.metadata["ml_attr"]["vals"]))
        for field in transformed.schema.fields if field.name == "label"
    ][0]
    prediction_to_levels = udf(lambda x: label_indexer_dict[x], StringType())
    transformed = transformed.withColumn(
        "predictedClass", prediction_to_levels(transformed.prediction))
    prediction_df = transformed.select(
        ["originalTargetColumn", "predictedClass"]).toPandas()
    objs = {
        "actual": prediction_df["originalTargetColumn"],
        "predicted": prediction_df["predictedClass"]
    }
    self._model_summary[
        "confusion_matrix"] = MLUtils.calculate_confusion_matrix(
            objs["actual"], objs["predicted"])
    overall_precision_recall = MLUtils.calculate_overall_precision_recall(
        objs["actual"], objs["predicted"])
    self._model_summary["precision_recall_stats"] = overall_precision_recall[
        "classwise_stats"]
    self._model_summary["model_precision"] = overall_precision_recall[
        "precision"]
    self._model_summary["model_recall"] = overall_precision_recall["recall"]
    self._model_summary["target_variable"] = result_column
    self._model_summary["test_sample_prediction"] = overall_precision_recall[
        "prediction_split"]
    # FIX: this trainer fits logistic regression (see model_filepath above);
    # the summary previously mislabelled the algorithm as "Random Forest".
    self._model_summary["algorithm_name"] = "Logistic Regression"
    self._model_summary["validation_method"] = "Train and Test"
    self._model_summary["independent_variables"] = len(
        categorical_columns) + len(numerical_columns)
    self._model_summary["level_counts"] = CommonUtils.get_level_count_dict(
        trainingData,
        categorical_columns,
        self._dataframe_context.get_column_separator(),
        dataType="spark")
    # NOTE(review): tree counts look like leftovers from a random-forest
    # trainer; kept for downstream compatibility -- confirm they can be dropped.
    self._model_summary["total_trees"] = 100
    self._model_summary["total_rules"] = 300
    CommonUtils.write_to_file(
        summary_filepath, json.dumps({"modelSummary": self._model_summary}))
tmpTrainX = tmpTrainRDD.toDF()
# Read the filename -> label CSV and join it onto the image rows.
csvTrainTmp = spark.read.format("csv").option(
    "header", "true").load(imageDir + "train25.csv")
csvTrainRDD = csvTrainTmp.rdd.map(lambda x: Row(image=x[0], label=int(x[1])))
csvTrain = csvTrainRDD.toDF()
finalTrainDataFrame = tmpTrainX.join(csvTrain,
                                     tmpTrainX.fileName == csvTrain.image,
                                     'inner').drop(csvTrain.image)
# InceptionV3 features feed a one-vs-rest gradient-boosted-tree classifier.
featurizer = DeepImageFeaturizer(inputCol="image", outputCol="features",
                                 modelName="InceptionV3")
method = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)
ovr = OneVsRest(classifier=method)
featureVector = featurizer.transform(finalTrainDataFrame).persist()
model_gbt = ovr.fit(featureVector)
model_gbt.write().overwrite().save(imageDir +
                                   'model-gradiant-boosted-tree-classifier')
predictions = model_gbt.transform(featureVector).persist()
predictionAndLabels = predictions.select("prediction", "label")
predictionAndLabels.persist()
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
# FIX: evaluate once instead of twice -- each evaluate() call used to trigger
# a separate full Spark job for the same deterministic metric.
train_accuracy = evaluator.evaluate(predictionAndLabels)
print("Train Data set Gradiant boosted tree classifier = " +
      str(train_accuracy) + " and error " + str(1 - train_accuracy))
# Assemble every non-ignored column into a single 'features' vector.
assembler = VectorAssembler(
    inputCols=[c for c in train.columns if c not in ignore],
    outputCol='features')
train_LP = assembler.transform(train).select(['label', 'features'])
evaluation_LP = assembler.transform(evaluation).select(['label', 'features'])

# One-vs-rest model definition.
# Base binary classifier (only LogisticRegression and NaiveBayes are supported).
lr = LogisticRegression(maxIter=20, tol=1E-6, fitIntercept=True)  # elasticNetParam=0.1
ovr = OneVsRest(classifier=lr)

# Fit the multiclass model and score the evaluation split.
ovrModel = ovr.fit(train_LP)
predictions = ovrModel.transform(evaluation_LP)

# Accuracy over (prediction, true label) pairs.
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

# maxIter=10, tol=1E-6, fitIntercept=True
# Error = 0.519118
# $example on$
# load data file.
inputData = spark.read.format("libsvm") \
    .load("data/mllib/sample_multiclass_classification_data.txt")

# generate the train/test split.
train, test = inputData.randomSplit([0.8, 0.2])

# instantiate the base classifier and its one-vs-rest wrapper.
lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
ovr = OneVsRest(classifier=lr)

# train the multiclass model, then score the test data.
ovrModel = ovr.fit(train)
predictions = ovrModel.transform(test)

# compute the classification error on test data.
# NOTE(review): metricName="precision" exists only in pre-2.0 Spark (later
# versions use "accuracy") -- confirm the targeted version.
evaluator = MulticlassClassificationEvaluator(metricName="precision")
precision = evaluator.evaluate(predictions)
print("Test Error : " + str(1 - precision))
# $example off$
spark.stop()
# Keep only the assembled feature vector and the label.
newData = indexed['features', 'label']
newData.show()

# 70/30 train/test split.
trainingData, testData = newData.randomSplit([0.7, 0.3])

# Classification
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Base binary classifier wrapped in one-vs-rest for multiclass labels.
lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
ovr = OneVsRest(classifier=lr)

# Fit the multiclass model and score the held-out data.
ovrModel = ovr.fit(trainingData)
predictions = ovrModel.transform(testData)

# Classification error on the test split.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error : " + str(1 - accuracy))