예제 #1
0
 def test_parallelism_does_not_change_output(self):
     """Fitting OneVsRest with parallelism=1 vs parallelism=2 must yield
     numerically identical per-class models (coefficients and intercepts)."""
     train_df = self.spark.createDataFrame(
         [
             (0.0, Vectors.dense(1.0, 0.8)),
             (1.0, Vectors.sparse(2, [], [])),
             (2.0, Vectors.dense(0.5, 0.5)),
         ],
         ["label", "features"],
     )

     def fit_with_parallelism(level):
         # Fresh estimator per call so the two fits are fully independent.
         estimator = OneVsRest(
             classifier=LogisticRegression(maxIter=5, regParam=0.01),
             parallelism=level,
         )
         return estimator.fit(train_df)

     serial_model = fit_with_parallelism(1)
     parallel_model = fit_with_parallelism(2)
     for serial, parallel in zip(serial_model.models, parallel_model.models):
         self.assertTrue(
             np.allclose(serial.coefficients.toArray(),
                         parallel.coefficients.toArray(),
                         atol=1e-4))
         self.assertTrue(
             np.allclose(serial.intercept, parallel.intercept, atol=1e-4))
예제 #2
0
 def test_support_for_weightCol(self):
     """OneVsRest must accept weightCol both when the base classifier
     supports a weight column and when it does not."""
     rows = [
         (0.0, Vectors.dense(1.0, 0.8), 1.0),
         (1.0, Vectors.sparse(2, [], []), 1.0),
         (2.0, Vectors.dense(0.5, 0.5), 1.0),
     ]
     df = self.spark.createDataFrame(rows, ["label", "features", "weight"])
     # Base classifier that inherits hasWeightCol.
     weighted_base = LogisticRegression(maxIter=5, regParam=0.01)
     self.assertIsNotNone(
         OneVsRest(classifier=weighted_base, weightCol="weight").fit(df))
     # Base classifier that doesn't inherit hasWeightCol.
     unweighted_base = FMClassifier()
     self.assertIsNotNone(
         OneVsRest(classifier=unweighted_base, weightCol="weight").fit(df))
예제 #3
0
 def test_parallelism_doesnt_change_output(self):
     """Models fitted with parallelism=1 and parallelism=2 must agree."""
     df = self.spark.createDataFrame(
         [(0.0, Vectors.dense(1.0, 0.8)),
          (1.0, Vectors.sparse(2, [], [])),
          (2.0, Vectors.dense(0.5, 0.5))],
         ["label", "features"])

     def fit(parallelism):
         clf = LogisticRegression(maxIter=5, regParam=.01)
         return OneVsRest(classifier=clf, parallelism=parallelism).fit(df)

     sequential = fit(1)
     threaded = fit(2)
     for idx, seq_model in enumerate(sequential.models):
         par_model = threaded.models[idx]
         self.assertTrue(np.allclose(seq_model.coefficients.toArray(),
                                     par_model.coefficients.toArray(),
                                     atol=1E-4))
         self.assertTrue(np.allclose(seq_model.intercept,
                                     par_model.intercept, atol=1E-4))
예제 #4
0
 def test_support_for_weightCol(self):
     """Fitting must succeed with weightCol set, for base classifiers with
     and without native weight-column support."""
     data = [(0.0, Vectors.dense(1.0, 0.8), 1.0),
             (1.0, Vectors.sparse(2, [], []), 1.0),
             (2.0, Vectors.dense(0.5, 0.5), 1.0)]
     df = self.spark.createDataFrame(data, ["label", "features", "weight"])
     # classifier inherits hasWeightCol
     base_with_weight = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=base_with_weight, weightCol="weight")
     self.assertIsNotNone(ovr.fit(df))
     # classifier doesn't inherit hasWeightCol
     base_without_weight = DecisionTreeClassifier()
     ovr2 = OneVsRest(classifier=base_without_weight, weightCol="weight")
     self.assertIsNotNone(ovr2.fit(df))
예제 #5
0
def test_should_log_model(dataset_binomial, dataset_multinomial):
    """Verify ``_should_log_model`` honors the pyspark autolog allowlist.

    With the default allowlist, both a plain LogisticRegressionModel and a
    OneVsRestModel should be logged.  With a restricted allowlist, a
    LogisticRegressionModel must be skipped with a warning.
    """
    mlflow.pyspark.ml.autolog(log_models=True)
    lor = LogisticRegression()

    ova1 = OneVsRest(classifier=lor)
    mlor_model = lor.fit(dataset_multinomial)
    assert _should_log_model(mlor_model)

    ova1_model = ova1.fit(dataset_multinomial)
    assert _should_log_model(ova1_model)

    with mock.patch(
            "mlflow.pyspark.ml._log_model_allowlist",
        {
            "pyspark.ml.regression.LinearRegressionModel",
            "pyspark.ml.classification.OneVsRestModel"
        },
    ), mock.patch("mlflow.pyspark.ml._logger.warning") as mock_warning:
        lr = LinearRegression()
        lr_model = lr.fit(dataset_binomial)
        assert _should_log_model(lr_model)
        lor_model = lor.fit(dataset_binomial)
        assert not _should_log_model(lor_model)
        # BUG FIX: `called_once_with` is not a Mock assertion method -- it
        # silently creates a child mock and never fails.  Use the real
        # assertion `assert_called_once_with` so the warning is verified.
        mock_warning.assert_called_once_with(
            _get_warning_msg_for_skip_log_model(lor_model))
        assert not _should_log_model(ova1_model)
예제 #6
0
    def test_one_vs_rest(self):
        """Convert a fitted OneVsRest model to ONNX and check that the ONNX
        runtime reproduces Spark's predictions (to 5 decimal places)."""
        # Locate the libsvm sample bundled next to this test file.
        this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
        input_path = os.path.join(this_script_dir, "data", "sample_multiclass_classification_data.txt")
        data = self.spark.read.format("libsvm").load(input_path)
        lr = LogisticRegression(maxIter=100, tol=0.0001, regParam=0.01)
        ovr = OneVsRest(classifier=lr)
        model = ovr.fit(data)

        # Feature count is taken from the first row's feature-vector size.
        feature_count = data.first()[1].size
        model_onnx = convert_sparkml(model, 'Sparkml OneVsRest', [
            ('features', FloatTensorType([1, feature_count]))
        ], spark_session=self.spark)
        self.assertTrue(model_onnx is not None)

        # Run the Spark model to obtain the expected predictions, densified
        # into a float32 numpy matrix for the ONNX runtime.
        predicted = model.transform(data)
        data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        expected = [
            predicted.toPandas().prediction.values.astype(numpy.float32),
        ]
        # save_data_models returns several paths; index 3 is used as the
        # ONNX file below (assumed from usage -- TODO confirm in helper).
        paths = save_data_models(data_np, expected, model, model_onnx,
                                    basename="SparkmlOneVsRest")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['prediction'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)
def one_vs_rest(training, test):
    """Fit a One-vs-Rest logistic-regression model on ``training`` and
    print its accuracy on ``test``.

    Accuracy is the fraction of test rows whose ``prediction`` column
    equals the ``label`` column.
    """
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    model = ovr.fit(training)
    result = model.transform(test)
    accuracy = 1.0 * result.rdd.filter(
        lambda l: l.label == l.prediction).count() / test.count()
    # Fixed: Python 2 `print` statement is a SyntaxError under Python 3;
    # use the print() function instead.
    print("OneVsRest 模型的正确率为:", accuracy)
def one_vs_rest_classifier(trainingDataFrame, classifier=None):
    """Fit a OneVsRest model on ``trainingDataFrame``.

    The base classifier defaults to LogisticRegression(regParam=0.01) when
    none is supplied.  Returns a dict with the fitted model under "model".
    """
    base = classifier or LogisticRegression(regParam=0.01)
    fitted = OneVsRest(classifier=base).fit(trainingDataFrame)
    return {"model": fitted}
def test_param_map_captures_wrapped_params(dataset_binomial):
    """The instance param map of a OneVsRest must expose its own params and
    the wrapped classifier's params (prefixed with the classifier uid), and
    autologging must record exactly that map."""
    base_clf = LogisticRegression(maxIter=3, standardization=False)
    meta = OneVsRest(classifier=base_clf, labelCol="abcd")

    params = _get_instance_param_map(meta)
    assert params["labelCol"] == "abcd"
    assert params["classifier"] == base_clf.uid
    assert params[f"{base_clf.uid}.maxIter"] == 3
    assert not params[f"{base_clf.uid}.standardization"]
    assert params[f"{base_clf.uid}.tol"] == base_clf.getOrDefault(base_clf.tol)

    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        meta.fit(dataset_binomial.withColumn("abcd", dataset_binomial.label))
    logged = get_run_data(run.info.run_id)
    assert logged.params == truncate_param_dict(
        stringify_dict_values(_get_instance_param_map(meta)))
예제 #10
0
 def test_output_columns(self):
     """transform() must append rawPrediction and prediction columns."""
     rows = [
         (0.0, Vectors.dense(1.0, 0.8)),
         (1.0, Vectors.sparse(2, [], [])),
         (2.0, Vectors.dense(0.5, 0.5)),
     ]
     df = self.spark.createDataFrame(rows, ["label", "features"])
     classifier = LogisticRegression(maxIter=5, regParam=0.01)
     model = OneVsRest(classifier=classifier, parallelism=1).fit(df)
     transformed = model.transform(df)
     self.assertEqual(
         transformed.columns,
         ["label", "features", "rawPrediction", "prediction"])
예제 #11
0
 def test_raw_prediction_column_is_of_vector_type(self):
     """Regression test for SPARK-35142: OneVsRestModel used to output the
     raw prediction as a string column instead of a vector."""
     data = [(0.0, Vectors.dense(1.0, 0.8)),
             (1.0, Vectors.sparse(2, [], [])),
             (2.0, Vectors.dense(0.5, 0.5))]
     df = self.spark.createDataFrame(data, ["label", "features"])
     base = LogisticRegression(maxIter=5, regParam=0.01)
     fitted = OneVsRest(classifier=base, parallelism=1).fit(df)
     first_row = fitted.transform(df).head()
     self.assertIsInstance(first_row["rawPrediction"], DenseVector)
def test_param_map_captures_wrapped_params(dataset_binomial):
    """get_params_to_log on a meta estimator must include the wrapped
    classifier's params prefixed with the classifier class name, and the
    autologged run must record exactly those params."""
    base = LogisticRegression(maxIter=3, standardization=False)
    meta = OneVsRest(classifier=base, labelCol="abcd")

    logged_params = get_params_to_log(meta)
    assert logged_params["labelCol"] == "abcd"
    assert logged_params["classifier"] == "LogisticRegression"
    assert logged_params["LogisticRegression.maxIter"] == 3
    assert not logged_params["LogisticRegression.standardization"]
    assert logged_params["LogisticRegression.tol"] == base.getOrDefault(base.tol)

    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        meta.fit(dataset_binomial.withColumn("abcd", dataset_binomial.label))
        # The logged estimator hierarchy must match the generated metadata.
        metadata = _gen_estimator_metadata(meta)
        estimator_info = load_json_artifact("estimator_info.json")
        assert metadata.hierarchy == estimator_info["hierarchy"]
    run_data = get_run_data(run.info.run_id)
    assert run_data.params == truncate_param_dict(
        stringify_dict_values(get_params_to_log(meta)))
예제 #13
0
 def test_copy(self):
     """copy() must apply param overrides to the copy only, leaving the
     source estimator/model untouched."""
     rows = [
         (0.0, Vectors.dense(1.0, 0.8)),
         (1.0, Vectors.sparse(2, [], [])),
         (2.0, Vectors.dense(0.5, 0.5)),
     ]
     df = self.spark.createDataFrame(rows, ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     copied = ovr.copy({lr.maxIter: 10})
     # The original keeps maxIter=5; only the copy sees the override.
     self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
     self.assertEqual(copied.getClassifier().getMaxIter(), 10)
     fitted = ovr.fit(df)
     fitted_copy = fitted.copy({fitted.predictionCol: "indexed"})
     self.assertEqual(fitted_copy.getPredictionCol(), "indexed")
예제 #14
0
 def test_copy(self):
     """copy() with a param override must leave the original untouched.

     Covers both the estimator (classifier maxIter override) and the
     fitted model (predictionCol override).
     """
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     ovr1 = ovr.copy({lr.maxIter: 10})
     # The source estimator keeps maxIter=5; only the copy gets 10.
     self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
     self.assertEqual(ovr1.getClassifier().getMaxIter(), 10)
     model = ovr.fit(df)
     model1 = model.copy({model.predictionCol: "indexed"})
     self.assertEqual(model1.getPredictionCol(), "indexed")
예제 #15
0
def test_should_log_model(dataset_binomial, dataset_multinomial, dataset_text):
    """Verify ``_should_log_model`` against the pyspark autolog allowlist,
    covering plain, meta (OneVsRest), pipeline, and nested-pipeline models.

    With a restricted allowlist, a LogisticRegressionModel -- and any
    composite model containing one -- must be skipped with a warning.
    """
    mlflow.pyspark.ml.autolog(log_models=True)
    lor = LogisticRegression()

    ova1 = OneVsRest(classifier=lor)
    with mlflow.start_run():
        mlor_model = lor.fit(dataset_multinomial)
    assert _should_log_model(mlor_model)

    with mlflow.start_run():
        ova1_model = ova1.fit(dataset_multinomial)
    assert _should_log_model(ova1_model)

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    lr = LogisticRegression(maxIter=2)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
    with mlflow.start_run():
        pipeline_model = pipeline.fit(dataset_text)
    assert _should_log_model(pipeline_model)

    nested_pipeline = Pipeline(
        stages=[tokenizer, Pipeline(stages=[hashingTF, lr])])
    with mlflow.start_run():
        nested_pipeline_model = nested_pipeline.fit(dataset_text)
    assert _should_log_model(nested_pipeline_model)

    with mock.patch(
            "mlflow.pyspark.ml._log_model_allowlist",
        {
            "pyspark.ml.regression.LinearRegressionModel",
            "pyspark.ml.classification.OneVsRestModel",
            "pyspark.ml.pipeline.PipelineModel",
        },
    ), mock.patch("mlflow.pyspark.ml._logger.warning") as mock_warning:
        lr = LinearRegression()
        with mlflow.start_run():
            lr_model = lr.fit(dataset_binomial)
        assert _should_log_model(lr_model)
        with mlflow.start_run():
            lor_model = lor.fit(dataset_binomial)
        assert not _should_log_model(lor_model)
        # BUG FIX: `called_once_with` is not a real Mock assertion -- it
        # returned a child mock and never checked anything.  Use
        # `assert_called_once_with` so the warning is actually verified.
        mock_warning.assert_called_once_with(
            _get_warning_msg_for_skip_log_model(lor_model))
        assert not _should_log_model(ova1_model)
        assert not _should_log_model(pipeline_model)
        assert not _should_log_model(nested_pipeline_model)
예제 #16
0
    def test_getAllNestedStages(self):
        """MetaAlgorithmReadWrite.getAllNestedStages must return the stage
        itself plus every nested stage, recursively, for pipelines,
        OneVsRest, their fitted models, and nested pipelines."""
        def _check_uid_set_equal(stages, expected_stages):
            # Compare stage collections by uid, ignoring order.
            uids = set(map(lambda x: x.uid, stages))
            expected_uids = set(map(lambda x: x.uid, expected_stages))
            self.assertEqual(uids, expected_uids)

        # df1: pre-assembled vector features, used to fit OneVsRest.
        df1 = self.spark.createDataFrame(
            [
                (Vectors.dense([1.0, 2.0]), 1.0),
                (Vectors.dense([-1.0, -2.0]), 0.0),
            ],
            ["features", "label"],
        )
        # df2: raw columns, assembled into features inside the pipeline.
        df2 = self.spark.createDataFrame(
            [
                (1.0, 2.0, 1.0),
                (1.0, 2.0, 0.0),
            ],
            ["a", "b", "label"],
        )
        vs = VectorAssembler(inputCols=["a", "b"], outputCol="features")
        lr = LogisticRegression()
        pipeline = Pipeline(stages=[vs, lr])
        pipelineModel = pipeline.fit(df2)
        ova = OneVsRest(classifier=lr)
        ovaModel = ova.fit(df1)

        ova_pipeline = Pipeline(stages=[vs, ova])
        nested_pipeline = Pipeline(stages=[ova_pipeline])

        _check_uid_set_equal(
            MetaAlgorithmReadWrite.getAllNestedStages(pipeline),
            [pipeline, vs, lr])
        _check_uid_set_equal(
            MetaAlgorithmReadWrite.getAllNestedStages(pipelineModel),
            [pipelineModel] + pipelineModel.stages,
        )
        _check_uid_set_equal(MetaAlgorithmReadWrite.getAllNestedStages(ova),
                             [ova, lr])
        # A fitted OneVsRest exposes the classifier template plus the
        # per-class fitted models.
        _check_uid_set_equal(
            MetaAlgorithmReadWrite.getAllNestedStages(ovaModel),
            [ovaModel, lr] + ovaModel.models)
        _check_uid_set_equal(
            MetaAlgorithmReadWrite.getAllNestedStages(nested_pipeline),
            [nested_pipeline, ova_pipeline, vs, ova, lr],
        )
예제 #17
0
 def test_onevsrest(self):
     """OneVsRest and OneVsRestModel must round-trip through save/load."""
     temp_path = tempfile.mkdtemp()
     rows = [
         (0.0, Vectors.dense(1.0, 0.8)),
         (1.0, Vectors.sparse(2, [], [])),
         (2.0, Vectors.dense(0.5, 0.5)),
     ] * 10
     df = self.spark.createDataFrame(rows, ["label", "features"])
     ovr = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=0.01))
     model = ovr.fit(df)

     # Estimator round-trip.
     ovrPath = temp_path + "/ovr"
     ovr.save(ovrPath)
     self._compare_pipelines(ovr, OneVsRest.load(ovrPath))

     # Fitted-model round-trip.
     modelPath = temp_path + "/ovrModel"
     model.save(modelPath)
     self._compare_pipelines(model, OneVsRestModel.load(modelPath))
예제 #18
0
 def test_onevsrest(self):
     """Round-trip OneVsRest and its fitted model through save()/load()
     and compare each against the original via _compare_pipelines."""
     temp_path = tempfile.mkdtemp()
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))] * 10,
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     model = ovr.fit(df)
     # Estimator round-trip.
     ovrPath = temp_path + "/ovr"
     ovr.save(ovrPath)
     loadedOvr = OneVsRest.load(ovrPath)
     self._compare_pipelines(ovr, loadedOvr)
     # Fitted-model round-trip.
     modelPath = temp_path + "/ovrModel"
     model.save(modelPath)
     loadedModel = OneVsRestModel.load(modelPath)
     self._compare_pipelines(model, loadedModel)
예제 #19
0
def ovr_classifier(training, testing):
    """Train a One-vs-Rest logistic-regression model on ``training`` and
    return its accuracy on the held-out ``testing`` frame."""
    from pyspark.ml.classification import LogisticRegression, OneVsRest

    # Base binary classifier wrapped by OneVsRest.
    base = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)
    model = OneVsRest(classifier=base).fit(training)

    predictions = model.transform(testing)
    evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
    return evaluator.evaluate(predictions)
def test_meta_estimator_fit(dataset_binomial):
    """Autologging a meta estimator (OneVsRest) must log params, tags and
    the model on the parent run, and must not spawn nested runs."""
    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        base = LinearSVC()
        meta = OneVsRest(classifier=base)
        fitted = meta.fit(dataset_binomial)

    run_id = run.info.run_id
    data = get_run_data(run_id)
    assert data.params == truncate_param_dict(
        stringify_dict_values(get_params_to_log(meta)))
    assert data.tags == get_expected_class_tags(meta)
    assert MODEL_DIR in data.artifacts
    assert load_model_by_run_id(run_id).stages[0].uid == fitted.uid

    # Only the parent run should exist; nothing tagged as its child.
    child_query = "tags.{} = '{}'".format(MLFLOW_PARENT_RUN_ID, run.info.run_id)
    assert len(mlflow.search_runs([run.info.experiment_id])) == 1
    assert len(mlflow.search_runs([run.info.experiment_id], child_query)) == 0
예제 #21
0
def main():
    """End-to-end example: load the shootings data, train a One-vs-Rest
    logistic-regression model, and print test accuracy/error."""
    spark = create_session('wash_post_shootings')
    spark.sparkContext.setLogLevel('ERROR')

    try:
        df = create_df(spark)

        sh = Shootings(df)
        sh.show()

        df = sh.get_df()

        # 80/20 train/test split (random, unseeded).
        (train, test) = df.randomSplit([0.8, 0.2])

        # instantiate the base classifier.
        lr = LogisticRegression(maxIter=10,
                                tol=1E-6,
                                fitIntercept=True,
                                featuresCol='features',
                                labelCol='label')

        # instantiate the One Vs Rest Classifier.
        ovr = OneVsRest(classifier=lr)

        # train the multiclass model.
        ovrModel = ovr.fit(train)

        # score the model on test data.
        predictions = ovrModel.transform(test)

        # obtain evaluator.
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

        # compute the classification error on test data.
        accuracy = evaluator.evaluate(predictions)
        print("Test Error = %g" % (1.0 - accuracy))
        print("Accuracy = %.2f" % (accuracy * 100))

    # NOTE(review): broad catch-and-print swallows every error; tolerable
    # for a demo script but it hides real failures.
    except Exception as e:
        print(e)
    finally:
        spark.sparkContext.stop()
예제 #22
0
def main():
    """Train a One-vs-Rest logistic-regression model on MNIST and print
    validation accuracy and error.

    Command line: ``--data_dir`` points at the MNIST files
    (default ``../../data``).
    """
    parser = argparse.ArgumentParser(description='Pyspark Training')
    parser.add_argument('--data_dir',
                        type=str,
                        default='../../data',
                        help='Data location.')
    args = parser.parse_args()

    # Get the MNIST data.
    X, y = load_mnist(args.data_dir)

    # Create a train and test set split.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)

    # Convert numpy arrays to Pyspark dataframe.
    df = create_df(y_train, X_train)

    # Split into train and validation sets (10% train / 90% validation --
    # NOTE(review): this split looks inverted; confirm it is intentional).
    (train, val) = df.randomSplit([0.1, 0.90])

    # instantiate logistic regression with hyperparameters.
    lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

    # instantiate the One Vs Rest Classifier.
    ovr = OneVsRest(classifier=lr)

    # train the multiclass model
    ovrModel = ovr.fit(train)

    # score the model on the validation split.
    predictions = ovrModel.transform(val)

    # obtain evaluator.
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

    # compute the classification error on validation data.
    accuracy = evaluator.evaluate(predictions)
    print("Validation Accuracy = {}".format(accuracy))
    print("Validation Error = {}".format(1.0 - accuracy))
예제 #23
0
File: tests.py — Project: JeremyNixon/spark
 def test_save_load(self):
     """OneVsRest estimator and model must survive save()/load() with
     their params and per-class sub-model uids intact."""
     temp_path = tempfile.mkdtemp()
     sqlContext = SQLContext(self.sc)
     rows = [
         (0.0, Vectors.dense(1.0, 0.8)),
         (1.0, Vectors.sparse(2, [], [])),
         (2.0, Vectors.dense(0.5, 0.5)),
     ]
     df = sqlContext.createDataFrame(rows, ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     model = ovr.fit(df)

     # Estimator round-trip: params must match.
     ovrPath = temp_path + "/ovr"
     ovr.save(ovrPath)
     loaded = OneVsRest.load(ovrPath)
     self.assertEqual(loaded.getFeaturesCol(), ovr.getFeaturesCol())
     self.assertEqual(loaded.getLabelCol(), ovr.getLabelCol())
     self.assertEqual(loaded.getClassifier().uid, ovr.getClassifier().uid)

     # Model round-trip: per-class sub-model uids must match.
     modelPath = temp_path + "/ovrModel"
     model.save(modelPath)
     reloaded = OneVsRestModel.load(modelPath)
     for original, restored in zip(model.models, reloaded.models):
         self.assertEqual(original.uid, restored.uid)
예제 #24
0
File: tests.py — Project: yoavfreund/spark
 def test_save_load(self):
     """Persist a OneVsRest estimator and its fitted model, reload both,
     and verify params and per-class model uids survive the round trip."""
     temp_path = tempfile.mkdtemp()
     sqlContext = SQLContext(self.sc)
     df = sqlContext.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     model = ovr.fit(df)
     # Estimator round-trip: params must match.
     ovrPath = temp_path + "/ovr"
     ovr.save(ovrPath)
     loadedOvr = OneVsRest.load(ovrPath)
     self.assertEqual(loadedOvr.getFeaturesCol(), ovr.getFeaturesCol())
     self.assertEqual(loadedOvr.getLabelCol(), ovr.getLabelCol())
     self.assertEqual(loadedOvr.getClassifier().uid,
                      ovr.getClassifier().uid)
     # Model round-trip: per-class sub-model uids must match.
     modelPath = temp_path + "/ovrModel"
     model.save(modelPath)
     loadedModel = OneVsRestModel.load(modelPath)
     for m, n in zip(model.models, loadedModel.models):
         self.assertEqual(m.uid, n.uid)
예제 #25
0
def test_should_log_model_with_wildcards_in_allowlist(dataset_binomial,
                                                      dataset_multinomial):
    """Wildcard entries (e.g. "pyspark.ml.regression.*") in the allowlist
    must match whole modules; models outside it must not be logged."""
    mlflow.pyspark.ml.autolog(log_models=True)
    lor = LogisticRegression()
    ova1 = OneVsRest(classifier=lor)
    ova1_model = ova1.fit(dataset_multinomial)

    wildcard_allowlist = {
        "pyspark.ml.regression.*",
        "pyspark.ml.classification.LogisticRegressionModel",
        "pyspark.ml.feature.*",
    }
    with mock.patch("mlflow.pyspark.ml._log_model_allowlist",
                    wildcard_allowlist):
        lr = LinearRegression()
        with mlflow.start_run():
            lr_model = lr.fit(dataset_binomial)
        assert _should_log_model(lr_model)
        with mlflow.start_run():
            lor_model = lor.fit(dataset_binomial)
        assert _should_log_model(lor_model)
        # OneVsRestModel is neither listed nor covered by a wildcard.
        assert not _should_log_model(ova1_model)
예제 #26
0
        # Normalize each feature to have unit standard deviation.
        train_df_tmp = scalerModel.transform(parsed_labelpoint_df)
        train_df = train_df_tmp.drop("features").withColumnRenamed(
            "scaledFeatures", "features")

        # show the frequency of each label
        tmp_df = train_df.groupBy("label").count()
        tmp_df.show(10)

        lr = LogisticRegression(maxIter=10, regParam=0.01, elasticNetParam=0.2)

        # instantiate the One Vs Rest Classifier.
        ovr = OneVsRest(classifier=lr)
        # train the multiclass model.
        lrModel = ovr.fit(train_df)
        shutil.rmtree(logreg_path, ignore_errors=True)
        lrModel.save(logreg_path)

    if opt == "test" or opt == "all":

        print "Loading test data..."
        test_data = sc.textFile(golden_file)
        parsed_test_data = test_data.map(kup.parse_multiClass)
        parsed_test_data_df = spark.createDataFrame(parsed_test_data,
                                                    ["label", "features"])

        # load the scaler and perform feature scaling on test data
        scalerModel = StandardScalerModel.load(scalerPath)
        test_df_tmp = scalerModel.transform(parsed_test_data_df)
        test_df = test_df_tmp.drop("features").withColumnRenamed(
# Import the One-vs-Rest function from Pyspark.
from pyspark.ml.classification import OneVsRest

# Multiclass classification using random forest without resampling.
# NOTE(review): `train_multi`, `test_multi`, and `get_metrics` are assumed
# to be defined earlier in this script (not visible here) -- confirm.
rf = RandomForestClassifier(numTrees=100,
                            maxDepth=4,
                            maxBins=32,
                            featuresCol='pca_features',
                            labelCol='label')

# Define the one-vs-rest model.
one_vs_rest_model = OneVsRest(classifier=rf)

# Fit the one-vs-rest model to the train data without resampling.
rf_ovr_model = one_vs_rest_model.fit(train_multi)

# Predict on the test data.
rf_ovr_predictions = rf_ovr_model.transform(test_multi)

# Compute the metrics for the one-vs-rest model.
# Presumably there are 21 classes (labels 0..20) -- confirm upstream.
print('Random Forest with OneVsRest:')
for i in range(21):
    get_metrics(rf_ovr_predictions, label=i)

# Random Forest with OneVsRest:
# Label: 0
#  accuracy: 0.566459184078024
#  f1_score: 0.7225638839838437
#  precision: 0.5658787811352546
#  recall: 0.9992421478092458
예제 #28
0
# Build the training set: read one image directory per class (l0..l4),
# attach the numeric label, and union all five into a single DataFrame.
labelZeroDf = readImages(imageDir + "l0").withColumn("label", lit(0))
labelOneDf = readImages(imageDir + "l1").withColumn("label", lit(1))
labelTwoDf = readImages(imageDir + "l2").withColumn("label", lit(2))
labelThreeDf = readImages(imageDir + "l3").withColumn("label", lit(3))
labelFourDf = readImages(imageDir + "l4").withColumn("label", lit(4))
finalTrainDf = labelZeroDf.unionAll(labelOneDf).unionAll(labelTwoDf).unionAll(
    labelThreeDf).unionAll(labelFourDf)

trainSize = finalTrainDf.count()
print(str(trainSize))

# Featurize images with a pre-trained InceptionV3 network, then train a
# One-vs-Rest decision-tree classifier on the extracted features.
featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="features",
                                 modelName="InceptionV3")
method = DecisionTreeClassifier(labelCol="label", featuresCol="features")
ovr = OneVsRest(classifier=method)
featureVector = featurizer.transform(finalTrainDf)
model_dc = ovr.fit(featureVector)
model_dc.write().overwrite().save(imageDir +
                                  'model-decision-tree-classifier-new')
# NOTE(review): accuracy is measured on the training set itself, so it
# overstates real performance.
predictions = model_dc.transform(featureVector)

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Train Data set accuracy with decision-tree-classifier for " +
      str(trainSize) + " images = " + str(accuracy) + " and error " +
      str(1 - accuracy))
예제 #29
0
# Join the image DataFrame with the CSV label file on the file name to get
# a labeled training DataFrame.  `tmpTrainDf`, `getFileName`, `spark`, and
# `imageDir` are assumed to be defined earlier in the script -- confirm.
#tmpTrainDf = readImages(imageDir + "/test1")
tmpTrainRDD = tmpTrainDf.rdd.map(lambda x : Row(filepath = x[0], image = x[1], fileName = getFileName(x[0])))
tmpTrainX = tmpTrainRDD.toDF()
csvTrainTmp = spark.read.format("csv").option("header", "true").load(imageDir + "/train25.csv")
#csvTrainTmp = spark.read.format("csv").option("header", "true").load(imageDir + "/test1.csv")
csvTrainRDD = csvTrainTmp.rdd.map(lambda x : Row(image = x[0], label = int(x[1])))
csvTrain = csvTrainRDD.toDF()
finalTrainDataFrame = tmpTrainX.join(csvTrain, tmpTrainX.fileName == csvTrain.image, 'inner').drop(csvTrain.image)


# Featurize images with a pre-trained InceptionV3 network.
featurizer = DeepImageFeaturizer(inputCol="image",outputCol="features", modelName="InceptionV3")

# NOTE(review): despite the "svm"/"Support vector Machine" naming below,
# the base classifier is logistic regression.
method = LogisticRegression(maxIter=50, regParam=0.05, elasticNetParam=0.3, labelCol="label")
ovr = OneVsRest(classifier = method)
featureVector = featurizer.transform(finalTrainDataFrame).persist()
model_svm = ovr.fit(featureVector)
model_svm.write().overwrite().save('hdfs://192.168.65.188:8020/paih/model-support-vector-machine')

# NOTE(review): evaluated on the training data, so accuracy is optimistic.
predictions = model_svm.transform(featureVector).persist()
predictionAndLabels = predictions.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Train Data set accuracy with Support vector Machine = " + str(evaluator.evaluate(predictionAndLabels)) + " and error " + str(1 - evaluator.evaluate(predictionAndLabels)))
#predictions.show()

#apply evaluator on constructed model for training data.

#apply deep learning for feature extraction
#apply genetic algo for feature selection
#apply ml model(eg. lr)
#apply genetic algorithm for feature selection
예제 #30
0
# Join image rows with CSV labels on the file name.  `csvTrainRDD` and
# `tmpTrainX` are assumed to be built earlier in the script -- confirm.
csvTrain = csvTrainRDD.toDF()
finalTrainDataFrame = tmpTrainX.join(csvTrain,
                                     tmpTrainX.fileName == csvTrain.image,
                                     'inner').drop(csvTrain.image)

# Featurize images with a pre-trained InceptionV3 network.
featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="features",
                                 modelName="InceptionV3")

# One-vs-Rest over elastic-net logistic regression on the image features.
method = LogisticRegression(maxIter=50,
                            regParam=0.05,
                            elasticNetParam=0.3,
                            labelCol="label")
ovr = OneVsRest(classifier=method)
featureVector = featurizer.transform(finalTrainDataFrame).persist()
model_lr = ovr.fit(featureVector)
model_lr.write().overwrite().save(
    'hdfs://192.168.65.188:8020/paih/model-logistic-regression')

# NOTE(review): accuracy is computed on the training data itself, so it
# overstates real performance.
predictions = model_lr.transform(featureVector).persist()
predictionAndLabels = predictions.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Train Data set accuracy with Logistic Regression = " +
      str(evaluator.evaluate(predictionAndLabels)) + " and error " +
      str(1 - evaluator.evaluate(predictionAndLabels)))
#predictions.show()

#apply evaluator on constructed model for training data.

#apply deep learning for feature extraction
#apply genetic algo for feature selection
예제 #31
0
    # $example on$
    # load data file.
    inputData = spark.read.format("libsvm") \
        .load("data/mllib/sample_multiclass_classification_data.txt")

    # generate the train/test split.
    (train, test) = inputData.randomSplit([0.8, 0.2])

    # instantiate the base classifier.
    lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

    # instantiate the One Vs Rest Classifier.
    ovr = OneVsRest(classifier=lr)

    # train the multiclass model.
    ovrModel = ovr.fit(train)

    # score the model on test data.
    predictions = ovrModel.transform(test)

    # obtain evaluator. NOTE: metricName="precision" was removed from
    # MulticlassClassificationEvaluator in Spark 2.0 and raises an error;
    # "accuracy" is the supported metric for overall test error.
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

    # compute the classification error on test data.
    accuracy = evaluator.evaluate(predictions)
    print("Test Error : " + str(1 - accuracy))
    # $example off$

    spark.stop()
    def Train(self):
        """Fit a LogisticRegression (or OneVsRest) classifier on the
        pre-processed dataframe, validate it on a held-out split, and
        write the fitted pipeline, the model, and a JSON metrics summary
        under the configured model path.

        NOTE: this method is Python 2 (bare ``print`` statements).
        """
        st = time.time()
        # Pull column metadata from the project helpers; the target column
        # must not be fed back in as a categorical feature.
        categorical_columns = self._dataframe_helper.get_string_columns()
        numerical_columns = self._dataframe_helper.get_numeric_columns()
        result_column = self._dataframe_context.get_result_column()
        categorical_columns = [
            x for x in categorical_columns if x != result_column
        ]

        # All artifacts are written under <model_path>/LogisticRegression/.
        model_path = self._dataframe_context.get_model_path()
        pipeline_filepath = model_path + "/LogisticRegression/TrainedModels/pipeline"
        model_filepath = model_path + "/LogisticRegression/TrainedModels/model"
        summary_filepath = model_path + "/LogisticRegression/ModelSummary/summary.json"

        df = self._data_frame
        # Feature pipeline (indexing/encoding/assembly) is built by MLUtils;
        # it is fitted on the full frame before the train/validation split.
        pipeline = MLUtils.create_pyspark_ml_pipeline(numerical_columns,
                                                      categorical_columns,
                                                      result_column)
        pipelineModel = pipeline.fit(df)
        indexed = pipelineModel.transform(df)
        MLUtils.save_pipeline_or_model(pipelineModel, pipeline_filepath)
        # 80/20 train/validation split on the indexed data.
        trainingData, validationData = MLUtils.get_training_and_validation_data(
            indexed, result_column, 0.8)
        # Converter to map the indexed "label" back to the original target values.
        OriginalTargetconverter = IndexToString(
            inputCol="label", outputCol="originalTargetColumn")
        levels = trainingData.select("label").distinct().collect()

        if self._classifier == "lr":
            # Binomial vs. multinomial family is chosen from the number of
            # distinct label levels.
            if len(levels) == 2:
                lr = LogisticRegression(maxIter=10,
                                        regParam=0.3,
                                        elasticNetParam=0.8)
            elif len(levels) > 2:
                lr = LogisticRegression(maxIter=10,
                                        regParam=0.3,
                                        elasticNetParam=0.8,
                                        family="multinomial")
            fit = lr.fit(trainingData)
        elif self._classifier == "OneVsRest":
            lr = LogisticRegression()
            ovr = OneVsRest(classifier=lr)
            fit = ovr.fit(trainingData)
        transformed = fit.transform(validationData)
        MLUtils.save_pipeline_or_model(fit, model_filepath)

        # NOTE(review): OneVsRestModel does not expose coefficientMatrix /
        # interceptVector, so these prints raise when
        # self._classifier == "OneVsRest" -- verify and guard.
        print fit.coefficientMatrix
        print fit.interceptVector

        # feature_importance = MLUtils.calculate_sparkml_feature_importance(indexed,fit,categorical_columns,numerical_columns)
        label_classes = transformed.select("label").distinct().collect()
        results = transformed.select(["prediction", "label"])
        if len(label_classes) > 2:
            # Multiclass: report plain accuracy as the model accuracy.
            evaluator = MulticlassClassificationEvaluator(
                predictionCol="prediction")
            evaluator.evaluate(results)  # NOTE(review): result discarded; redundant extra Spark job
            self._model_summary["model_accuracy"] = evaluator.evaluate(
                results,
                {evaluator.metricName: "accuracy"})  # accuracy of the model
        else:
            # Binary: NOTE(review): rawPredictionCol is fed the hard 0/1
            # "prediction" column rather than raw scores, and the areaUnderPR
            # value is stored under the "model_accuracy" key -- confirm intent.
            evaluator = BinaryClassificationEvaluator(
                rawPredictionCol="prediction")
            evaluator.evaluate(results)
            # print evaluator.evaluate(results,{evaluator.metricName: "areaUnderROC"})
            # print evaluator.evaluate(results,{evaluator.metricName: "areaUnderPR"})
            self._model_summary["model_accuracy"] = evaluator.evaluate(
                results,
                {evaluator.metricName: "areaUnderPR"})  # accuracy of the model

        # self._model_summary["feature_importance"] = MLUtils.transform_feature_importance(feature_importance)
        self._model_summary["runtime_in_seconds"] = round((time.time() - st),
                                                          2)

        # Map indexed predictions back to the original target labels for the
        # confusion matrix / precision-recall summary.
        transformed = OriginalTargetconverter.transform(transformed)
        label_indexer_dict = [
            dict(enumerate(field.metadata["ml_attr"]["vals"]))
            for field in transformed.schema.fields if field.name == "label"
        ][0]
        prediction_to_levels = udf(lambda x: label_indexer_dict[x],
                                   StringType())
        transformed = transformed.withColumn(
            "predictedClass", prediction_to_levels(transformed.prediction))
        prediction_df = transformed.select(
            ["originalTargetColumn", "predictedClass"]).toPandas()
        objs = {
            "actual": prediction_df["originalTargetColumn"],
            "predicted": prediction_df["predictedClass"]
        }

        self._model_summary[
            "confusion_matrix"] = MLUtils.calculate_confusion_matrix(
                objs["actual"], objs["predicted"])
        overall_precision_recall = MLUtils.calculate_overall_precision_recall(
            objs["actual"], objs["predicted"])
        self._model_summary[
            "precision_recall_stats"] = overall_precision_recall[
                "classwise_stats"]
        self._model_summary["model_precision"] = overall_precision_recall[
            "precision"]
        self._model_summary["model_recall"] = overall_precision_recall[
            "recall"]
        self._model_summary["target_variable"] = result_column
        self._model_summary[
            "test_sample_prediction"] = overall_precision_recall[
                "prediction_split"]
        # NOTE(review): this trains logistic regression, but the summary says
        # "Random Forest" -- almost certainly a copy-paste leftover.
        self._model_summary["algorithm_name"] = "Random Forest"
        self._model_summary["validation_method"] = "Train and Test"
        self._model_summary["independent_variables"] = len(
            categorical_columns) + len(numerical_columns)
        self._model_summary["level_counts"] = CommonUtils.get_level_count_dict(
            trainingData,
            categorical_columns,
            self._dataframe_context.get_column_separator(),
            dataType="spark")
        # print json.dumps(self._model_summary,indent=2)
        # NOTE(review): hard-coded tree/rule counts are meaningless for
        # logistic regression -- copy-paste from a tree-model trainer.
        self._model_summary["total_trees"] = 100
        self._model_summary["total_rules"] = 300
        CommonUtils.write_to_file(
            summary_filepath, json.dumps({"modelSummary":
                                          self._model_summary}))
예제 #33
0
# Convert the image rows and the CSV label rows to DataFrames, then join them
# on file name so every image row gets its integer label.
tmpTrainX = tmpTrainRDD.toDF()
csvTrainTmp = spark.read.format("csv").option(
    "header", "true").load(imageDir + "train25.csv")
#csvTrainTmp = spark.read.format("csv").option("header", "true").load(imageDir + "/test1.csv")
csvTrainRDD = csvTrainTmp.rdd.map(lambda x: Row(image=x[0], label=int(x[1])))
csvTrain = csvTrainRDD.toDF()
finalTrainDataFrame = tmpTrainX.join(csvTrain,
                                     tmpTrainX.fileName == csvTrain.image,
                                     'inner').drop(csvTrain.image)

# InceptionV3 transfer-learning features for each image.
featurizer = DeepImageFeaturizer(inputCol="image",
                                 outputCol="features",
                                 modelName="InceptionV3")

# GBTClassifier is binary-only, so it is wrapped in OneVsRest for multiclass.
method = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)
ovr = OneVsRest(classifier=method)
featureVector = featurizer.transform(finalTrainDataFrame).persist()
model_gbt = ovr.fit(featureVector)
model_gbt.write().overwrite().save(imageDir +
                                   'model-gradiant-boosted-tree-classifier')

# Score the training data itself to report train accuracy.
predictions = model_gbt.transform(featureVector).persist()

predictionAndLabels = predictions.select("prediction", "label")
predictionAndLabels.persist()
#predictionAndLabels.show()
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
# Evaluate once and reuse -- the original called evaluate() twice, running the
# same Spark job twice for one number.
accuracy = evaluator.evaluate(predictionAndLabels)
print("Train Data set Gradiant boosted tree classifier = " +
      str(accuracy) + " and error " +
      str(1 - accuracy))
예제 #34
0
# Assemble every non-ignored column into a single 'features' vector column,
# then keep only (label, features) for training and evaluation.
assembler = VectorAssembler(
    inputCols=[x for x in train.columns if x not in ignore],
    outputCol='features')
train_LP = assembler.transform(train).select(['label', 'features'])
evaluation_LP = assembler.transform(evaluation).select(['label', 'features'])

# Define the model algorithm (One-vs-Rest).
# instantiate the base classifier. Only LogisticRegression and NaiveBayes are supported
lr = LogisticRegression(maxIter=20, tol=1E-6,
                        fitIntercept=True)  # elasticNetParam=0.1
# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# Fit the model
# train the multiclass model.
ovrModel = ovr.fit(train_LP)

# Make predictions.
# score the model on the evaluation data.
predictions = ovrModel.transform(evaluation_LP)

# Select (prediction, true label) and compute evaluation error
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

# Previous run for reference:
# maxIter=10, tol=1E-6, fitIntercept=True
# Error = 0.519118
예제 #35
0
    # $example on$
    # load data file.
    inputData = spark.read.format("libsvm") \
        .load("data/mllib/sample_multiclass_classification_data.txt")

    # generate the train/test split.
    (train, test) = inputData.randomSplit([0.8, 0.2])

    # instantiate the base classifier.
    lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

    # instantiate the One Vs Rest Classifier.
    ovr = OneVsRest(classifier=lr)

    # train the multiclass model.
    ovrModel = ovr.fit(train)

    # score the model on test data.
    predictions = ovrModel.transform(test)

    # obtain evaluator. NOTE: metricName="precision" was removed from
    # MulticlassClassificationEvaluator in Spark 2.0 and raises an error;
    # "accuracy" is the supported metric for overall test error.
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

    # compute the classification error on test data.
    accuracy = evaluator.evaluate(predictions)
    print("Test Error : " + str(1 - accuracy))
    # $example off$

    spark.stop()
# Keep only the assembled feature vector and the indexed label.
newData = indexed['features', 'label']
newData.show()

# split data: 70% train, 30% test.
(trainingData, testData) = newData.randomSplit([0.7, 0.3])

# Classification

from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# instantiate the base classifier.
lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# train the multiclass model.
ovrModel = ovr.fit(trainingData)

# score the model on test data.
predictions = ovrModel.transform(testData)

# obtain evaluator.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
print("Test Error : " + str(1 - accuracy))