Example #1
    def test_binarizer(self):
        b0 = Binarizer()
        self.assertListEqual(b0.params, [
            b0.inputCol, b0.inputCols, b0.outputCol, b0.outputCols,
            b0.threshold, b0.thresholds
        ])
        # use `not` rather than `~`: bitwise NOT of a bool is always truthy
        self.assertTrue(all([not b0.isSet(p) for p in b0.params]))
        self.assertTrue(b0.hasDefault(b0.threshold))
        self.assertEqual(b0.getThreshold(), 0.0)
        b0.setParams(inputCol="input", outputCol="output").setThreshold(1.0)
        self.assertTrue(not all([b0.isSet(p) for p in b0.params]))
        self.assertEqual(b0.getThreshold(), 1.0)
        self.assertEqual(b0.getInputCol(), "input")
        self.assertEqual(b0.getOutputCol(), "output")

        b0c = b0.copy({b0.threshold: 2.0})
        self.assertEqual(b0c.uid, b0.uid)
        self.assertListEqual(b0c.params, b0.params)
        self.assertEqual(b0c.getThreshold(), 2.0)

        b1 = Binarizer(threshold=2.0, inputCol="input", outputCol="output")
        self.assertNotEqual(b1.uid, b0.uid)
        self.assertEqual(b1.getThreshold(), 2.0)
        self.assertEqual(b1.getInputCol(), "input")
        self.assertEqual(b1.getOutputCol(), "output")
Example #2
 def naiveOutliers(self, df, c):
     # Flag naive outliers: values more than 2 or 3 standard deviations
     # above (SDU) or below (SDD), assuming column c is standardized.
     binarizer_2sdu = Binarizer(threshold=2.0, inputCol=c, outputCol="2SDU_" + c)
     binarizer_3sdu = Binarizer(threshold=3.0, inputCol=c, outputCol="3SDU_" + c)
     binarizer_2sdd = Binarizer(threshold=2.0, inputCol=c, outputCol="2SDD_" + c)
     binarizer_3sdd = Binarizer(threshold=3.0, inputCol=c, outputCol="3SDD_" + c)

     df = binarizer_2sdu.transform(df.select('snapshotDate', 'ID', c))
     df = binarizer_3sdu.transform(df)
     # negate the column so the same positive thresholds flag downside outliers
     df = df.withColumn(c, -1.0 * df[c])
     df = binarizer_2sdd.transform(df)
     df = binarizer_3sdd.transform(df)

     return df.select('snapshotDate', 'ID', '2SDU_' + c, '3SDU_' + c,
                      '2SDD_' + c, '3SDD_' + c)
Example #3
def performance(prediction):
    '''
    Compute the area under the ROC curve for a model's predictions.
    '''
    binarizer = Binarizer(threshold=0.5,
                          inputCol="prediction",
                          outputCol="b_prediction")
    binarizedDataFrame = binarizer.transform(prediction)
    binarizer = Binarizer(threshold=0.5, inputCol="label", outputCol="b_label")
    binarizedDataFrame = binarizer.transform(binarizedDataFrame)
    prediction_label = binarizedDataFrame.select('b_prediction', 'b_label')
    metrics = BinaryClassificationMetrics(prediction_label.rdd)
    return metrics.areaUnderROC
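
For context, a minimal usage sketch; it assumes a fitted model and a test DataFrame with numeric "prediction" and "label" columns (the names `model` and `testData` are illustrative, not from the original):

predictions = model.transform(testData)  # hypothetical fitted model and test set
auc = performance(predictions)
print("Area under ROC: %f" % auc)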
Example #4
    def test_model_binarizer(self):
        import numpy
        data = self.spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.2)],
                                          ["id", "feature"])
        model = Binarizer(inputCol='feature', outputCol='binarized')

        # the input name should match the Binarizer's inputCol
        model_onnx = convert_sparkml(model, 'Sparkml Binarizer',
                                     [('feature', FloatTensorType([1, 1]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = predicted.select("binarized").toPandas().values.astype(
            numpy.float32)
        data_np = data.select('feature').toPandas().values.astype(
            numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlBinarizer")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['binarized'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #5
    def test_python_transformer_pipeline_persistence(self):
        """
        Pipeline[MockUnaryTransformer, Binarizer]
        """
        temp_path = tempfile.mkdtemp()

        try:
            df = self.spark.range(0, 10).toDF("input")
            tf = MockUnaryTransformer(
                shiftVal=2).setInputCol("input").setOutputCol("shiftedInput")
            tf2 = Binarizer(threshold=6,
                            inputCol="shiftedInput",
                            outputCol="binarized")
            pl = Pipeline(stages=[tf, tf2])
            model = pl.fit(df)

            pipeline_path = temp_path + "/pipeline"
            pl.save(pipeline_path)
            loaded_pipeline = Pipeline.load(pipeline_path)
            self._compare_pipelines(pl, loaded_pipeline)

            model_path = temp_path + "/pipeline-model"
            model.save(model_path)
            loaded_model = PipelineModel.load(model_path)
            self._compare_pipelines(model, loaded_model)
        finally:
            try:
                rmtree(temp_path)
            except OSError:
                pass
Example #6
def best_model(algo, bin, log):
    mdl = algo.fit(trainingData)
    pred = mdl.transform(testData)
    if bin:
        bina = Binarizer(threshold=0.5,
                         inputCol="prediction_c",
                         outputCol="prediction")
        pred = bina.transform(pred)
    acc = evaluator_multi.evaluate(pred)
    area_under_curve = evaluator_bin.evaluate(pred)
    print("Accuracy:", acc)
    print("Area Under ROC:", area_under_curve)
    print("Top Three Features")
    if log:
        feature_importance = mdl.coefficients.values
        for f in np.abs(feature_importance).argsort()[-3:][::-1]:
            print(schemaNames[f + 1], end=" ")
        print("")

    else:
        feature_importance = mdl.featureImportances

        top_features = np.zeros(ncolumns - 1)
        top_features[feature_importance.indices] = feature_importance.values
        for f in top_features.argsort()[-3:][::-1]:
            print(schemaNames[f + 1], end=" ")
        print("")

    return mdl  # return the fitted model, not the enclosing function
Example #7
def findmodel(algo, bin, log):
    model = algo.fit(trainingData)
    predictions = model.transform(testData)
    if bin:
        binarizer = Binarizer(threshold=0.5,
                              inputCol="prediction_c",
                              outputCol="prediction")
        predictions = binarizer.transform(predictions)
    accuracy = evaluatorM.evaluate(predictions)
    auc = evaluatorB.evaluate(predictions)
    print("Accuracy:", accuracy)
    print("Area Under ROC:", auc)
    print("Top Features")
    if log:
        fi = model.coefficients.values
        for i in np.abs(fi).argsort()[-3:][::-1]:
            print(schemaNames[i + 1], end=" ")
        print("")

    else:
        fi = model.featureImportances

        imp_feat = np.zeros(ncolumns - 1)
        imp_feat[fi.indices] = fi.values
        for i in imp_feat.argsort()[-3:][::-1]:
            print(schemaNames[i + 1], end=" ")
        print("")

    return model
Example #8
 def test_preserve_set_state(self):
     dataset = self.spark.createDataFrame([(0.5,)], ["data"])
     binarizer = Binarizer(inputCol="data")
     self.assertFalse(binarizer.isSet("threshold"))
     binarizer.transform(dataset)
     binarizer._transfer_params_from_java()
     self.assertFalse(binarizer.isSet("threshold"),
                      "Params not explicitly set should remain unset after transform")
Example #9
def binarization_by_threshold(dataFrame, threshold, inputCol):
    # binarize a continuous column at the given threshold
    binarizer = Binarizer(threshold=threshold,
                          inputCol=inputCol,
                          outputCol='%s_binarized' % (inputCol))
    binarizedDataFrame = binarizer.transform(dataFrame)
    print('Binarizer output with Threshold = %f' % binarizer.getThreshold())
    return binarizedDataFrame
Example #10
 def test_default_params_transferred(self):
     dataset = self.spark.createDataFrame([(0.5, )], ["data"])
     binarizer = Binarizer(inputCol="data")
     # intentionally change the pyspark default, but don't set it
     binarizer._defaultParamMap[binarizer.outputCol] = "my_default"
     result = binarizer.transform(dataset).select("my_default").collect()
     self.assertFalse(binarizer.isSet(binarizer.outputCol))
     self.assertEqual(result[0][0], 1.0)
Example #11
def pre_processing(continuousDataFrame):
    binarizer = Binarizer(threshold=0.5,
                          inputCol="feature",
                          outputCol="binarized_feature")

    binarizedDataFrame = binarizer.transform(continuousDataFrame)

    print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
    binarizedDataFrame.show()
Example #12
def binaryScalerModel(df, conf):
    """
        input: spark-dataFrame, conf [configuration params]
        return value: model
    """
    input = conf.get("inputCol", None)
    output = conf.get("outputCol", None)
    tres = conf.get("threshold", 0.0)
    model = Binarizer(threshold=tres, inputCol=input, outputCol=output)
    return model
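
A minimal usage sketch, assuming an existing DataFrame `df` with a numeric column; the column names and threshold below are illustrative, not from the original:

conf = {"inputCol": "score", "outputCol": "score_bin", "threshold": 0.5}
binarizer = binaryScalerModel(df, conf)
binarized_df = binarizer.transform(df)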
Example #13
 def binarizer(self, df, column):
     """
     按指定阈值 二值化Binarizer
     """
     # 对连续值根据阈值threshold二值化
     binarizer = Binarizer(threshold=5.1,
                           inputCol=column,
                           outputCol=column + '_binarized_feature')
     binarizedDataFrame = binarizer.transform(df)
     print('Binarizer output with Threshold = %f' %
           binarizer.getThreshold())
     return binarizedDataFrame
Example #14
def decisionTreeRegressor(data, ncolumns, schemaNames):
    from pyspark.ml import Pipeline
    from pyspark.ml.regression import DecisionTreeRegressor
    from pyspark.ml.tuning import ParamGridBuilder
    from pyspark.ml.feature import StringIndexer, VectorIndexer
    from pyspark.ml.tuning import CrossValidator
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.feature import Binarizer
    from pyspark.ml.evaluation import BinaryClassificationEvaluator
    import numpy as np
    import time

    binarizer = Binarizer(
        threshold=0.00001,
        inputCol="features",
        outputCol="binarized_features",
    )
    binarizedDataFrame = binarizer.transform(data)

    (trainingData, testData) = binarizedDataFrame.randomSplit([0.9, 0.1], 50)
    dtr = DecisionTreeRegressor(labelCol="label",
                                featuresCol="binarized_features",
                                maxDepth=10,
                                maxBins=10,
                                impurity='variance')  # Spark expects lowercase 'variance'

    start = time.time()
    cvModel = dtr.fit(trainingData)
    end = time.time()
    timer = (end - start) / 60  # training time in minutes

    prediction = cvModel.transform(testData)
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(prediction)

    evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
    areaUC = evaluator.evaluate(prediction)

    fi = cvModel.featureImportances
    imp_feat = np.zeros(ncolumns - 1)
    imp_feat[fi.indices] = fi.values
    x = np.arange(ncolumns - 1)
    idx = (-imp_feat).argsort()[:3]
    feat = []
    for i in idx:
        feat.append(schemaNames[i])

    return feat, rmse, areaUC, timer
Example #15
def que1():
    for i, ct in enumerate([
            DecisionTreeClassifier(seed=9008),
            DecisionTreeRegressor(predictionCol="prediction_c", seed=9008),
            LogisticRegression()
    ]):
        binarizer = None
        if i == 0:
            print("[*] DecisionTree Classifier")
            paramB = ParamGridBuilder().addGrid(
                ct.maxDepth,
                [5, 10, 20]).addGrid(ct.maxBins, [16, 32]).addGrid(
                    ct.impurity, ["gini", "entropy"]).build()
            continue  # note: this skips training/evaluation for the classifier
        elif i == 1:
            print("[*] DecisionTree Regressor")
            paramB = ParamGridBuilder().addGrid(
                ct.maxDepth,
                [5, 10, 20]).addGrid(ct.maxBins, [16, 32]).addGrid(
                    ct.minInfoGain, [0.0, 0.25, 0.3]).build()
            binarizer = Binarizer(threshold=0.5,
                                  inputCol="prediction_c",
                                  outputCol="prediction")
        else:
            print("[*] Logistic Regression")
            paramB = ParamGridBuilder().addGrid(ct.maxIter,
                                                [5, 10, 15]).addGrid(
                                                    ct.regParam,
                                                    [0.05, 0.1, 0.5]).build()

        if binarizer is not None:
            pipeline = Pipeline(stages=[ct, binarizer])
        else:
            pipeline = Pipeline(stages=[ct])

        print("[*] Running for areaUnderROC")
        bp, metric_roc = run_metric(
            s_train, s_test, pipeline, paramB,
            BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          metricName="areaUnderROC"))
        print("[*] Done for areaUnderROC")
        print("[*] Best Params: %s, AreaUnderROC value: %f" % (bp, metric_roc))

        print("[*] Running for accuracy")
        mp, metric_acc = run_metric(
            s_train, s_test, pipeline, paramB,
            MulticlassClassificationEvaluator(predictionCol="prediction",
                                              metricName="accuracy"))
        print("[*] Done for accuracy")
        print("[*] Best Params: %s, Accuracy value: %f" % (mp, metric_acc))
Example #16
def prep_data(sqlContext, data, drops):
    """Prepares date for ML. Preparation includes: making a label column (by the rule: naacess > 10),
	applying drops and transforming data into LabeledPoint"""

    binarizer = Binarizer(threshold=10.0,
                          inputCol="naccess",
                          outputCol="target")
    data = binarizer.transform(data)

    drops = drops.split(",")
    cols = [x for x in data.columns if x not in set(drops)]

    data = data.select(cols)

    labeled = label_data(data)
    preped_data = sqlContext.createDataFrame(labeled, ['features', 'label'])

    return preped_data
Example #17
def que2():
    algo_name = [
        "Decision Tree Classifier", "Decision Tree Regressor",
        "Logistic Regression"
    ]
    for i, ct in enumerate([
            DecisionTreeClassifier(seed=9008,
                                   maxDepth=10,
                                   maxBins=16,
                                   impurity="gini"),
            DecisionTreeRegressor(predictionCol="prediction_c",
                                  seed=9008,
                                  maxDepth=10,
                                  maxBins=16,
                                  minInfoGain=0.0),
            LogisticRegression(regParam=0.05, maxIter=15)
    ]):
        if i == 1:
            pipeline = Pipeline(stages=[
                ct,
                Binarizer(threshold=0.5,
                          inputCol="prediction_c",
                          outputCol="prediction")
            ])
        else:
            pipeline = Pipeline(stages=[ct])

        thread_full_classification(
            n_col, f_train, f_test, pipeline,
            BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          metricName="areaUnderROC"),
            algo_name[i], "areaUnderROC", i)
        thread_full_classification(
            n_col, f_train, f_test, pipeline,
            MulticlassClassificationEvaluator(predictionCol="prediction",
                                              metricName="accuracy"),
            algo_name[i], "accuracy", i)
Example #18
df = df.drop('number')

# In[8]:

df = df.na.drop()

# In[9]:

#df.count(),len(df.columns)

# Creating a categorical variable: let's create a categorical variable to denote whether the humidity is not low. If the value is less than 25%, the categorical value should be 0; otherwise it should be 1. We can create this categorical variable as a column in a DataFrame using Binarizer.

# In[10]:

binarizer = Binarizer(threshold=24.99999,
                      inputCol="relative_humidity_3pm",
                      outputCol="label")
binarizedDF = binarizer.transform(df)

# In[11]:

#binarizedDF.describe()

# # Creating target variable named label

# The threshold argument specifies the threshold value for the variable, inputCol is the input column to read, and outputCol is the name of the new categorical column. The second line applies the Binarizer and creates a new DataFrame with the categorical column. We can look at the first four values in the new DataFrame:

# In[12]:

#binarizedDF.select("relative_humidity_3pm","label").show(4)
Example #19
ngram = NGram(n=2, inputCol="words", outputCol="ngrams")

ngramDataFrame = ngram.transform(wordDataFrame)
ngramDataFrame.select("ngrams").show(truncate=False)

# COMMAND ----------

### Binarizer takes numerical inputs and converts them into binary output (0 or 1) with respect to the provided threshold
from pyspark.ml.feature import Binarizer

continuousDataFrame = spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.2)],
                                            ["id", "feature"])

binarizer = Binarizer(threshold=0.5,
                      inputCol="feature",
                      outputCol="binarized_feature")

binarizedDataFrame = binarizer.transform(continuousDataFrame)

print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
binarizedDataFrame.show()

# COMMAND ----------

### PCA is a statistical procedure used to reduce a vector's dimensionality. This example reduces a 5-dimensional feature vector to a 3-dimensional pcaFeatures column
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )]
# the snippet was truncated here; the lines below complete the standard Spark PCA example
df = spark.createDataFrame(data, ["features"])
pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(df)
model.transform(df).select("pcaFeatures").show(truncate=False)
Example #20
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 25 19:59:05 2017

@author: vishal
"""

from __future__ import print_function

from pyspark.sql import SparkSession

session = SparkSession.builder.appName("Binarizer").getOrCreate()

continuousDataFrame = session.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.2)],
                                              ["id", "feature"])

#continuousDataFrame.show()

from pyspark.ml.feature import Binarizer

binarizer = Binarizer(threshold=0.5,
                      inputCol='feature',
                      outputCol='binarized_feature')

binarizedDataFrame = binarizer.transform(continuousDataFrame)
binarizedDataFrame.show()

session.stop()
Example #21
def test_log_stage_type_params(spark_session):
    from pyspark.ml.base import Estimator, Transformer, Model
    from pyspark.ml.evaluation import Evaluator
    from pyspark.ml.param import Param, Params
    from pyspark.ml.feature import Binarizer, OneHotEncoder

    class TestingEstimator(Estimator):

        transformer = Param(Params._dummy(), "transformer",
                            "a transformer param")
        model = Param(Params._dummy(), "model", "a model param")
        evaluator = Param(Params._dummy(), "evaluator", "an evaluator param")

        def setTransformer(self, transformer: Transformer):
            return self._set(transformer=transformer)

        def setModel(self, model: Model):
            return self._set(model=model)

        def setEvaluator(self, evaluator: Evaluator):
            return self._set(evaluator=evaluator)

        def _fit(self, dataset):
            return TestingModel()

    class TestingModel(Model):
        def _transform(self, dataset):
            return dataset

    binarizer = Binarizer(threshold=1.0,
                          inputCol="values",
                          outputCol="features")
    df = spark_session.createDataFrame([(0.0, ), (1.0, ), (2.0, )], ["input"])
    ohe = OneHotEncoder().setInputCols(["input"]).setOutputCols(["output"])
    ohemodel = ohe.fit(df)
    bcd = BinaryClassificationEvaluator(metricName="areaUnderROC")

    estimator = TestingEstimator().setTransformer(binarizer).setModel(
        ohemodel).setEvaluator(bcd)
    param_map = get_params_to_log(estimator)
    assert param_map["transformer"] == "Binarizer"
    assert param_map["model"] == "OneHotEncoderModel"
    assert param_map["evaluator"] == "BinaryClassificationEvaluator"

    mlflow.pyspark.ml.autolog()
    with mlflow.start_run() as run:
        estimator.fit(df)
        metadata = _gen_estimator_metadata(estimator)
        estimator_info = load_json_artifact("estimator_info.json")
        assert metadata.hierarchy == estimator_info["hierarchy"]
        assert isinstance(estimator_info["hierarchy"]["params"], dict)
        assert estimator_info["hierarchy"]["params"]["transformer"][
            "name"] == "Binarizer"
        assert estimator_info["hierarchy"]["params"]["model"][
            "name"] == "OneHotEncoderModel"
        assert (estimator_info["hierarchy"]["params"]["evaluator"]["name"] ==
                "BinaryClassificationEvaluator")
    run_id = run.info.run_id
    run_data = get_run_data(run_id)
    assert run_data.params == truncate_param_dict(
        stringify_dict_values(get_params_to_log(estimator)))
Example #22
print("Evaluating Oversampling dataset:")

lrModel = lr.fit(o_train)
prediction = lrModel.transform(o_test)
rmse = eval.evaluate(prediction)
print("RMSE: %.3f" % rmse)
r2 = eval.evaluate(prediction, {eval.metricName: "r2"})
print("r2: %.3f" % r2)

## Random Forest

## oversampling

print("Evaluating oversampling dataset Random Forest:")
binarizer = Binarizer(threshold=0, inputCol="label", outputCol="bin_labels")
train_bin = binarizer.transform(o_train)
test_bin = binarizer.transform(o_test)
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction")
dt = RandomForestClassifier(labelCol="bin_labels",
                            featuresCol="pcaFeatures",
                            numTrees=10,
                            maxDepth=30)
dtModel = dt.fit(train_bin)
predictions = dtModel.transform(test_bin)
auc = evaluator.evaluate(predictions)  # BinaryClassificationEvaluator defaults to areaUnderROC
print("RF AUC = %g " % auc)
print(
    'AUC:',
    BinaryClassificationMetrics(
        predictions['prediction', 'label'].rdd).areaUnderROC)  # expects (score, label) pairs
Example #23
#https://spark.apache.org/docs/2.0.2/ml-features.html
#https://books.google.co.uk/books?id=HVQoDwAAQBAJ&pg=PA98&lpg=PA98&dq=pyspark+continuous+variables+into+binary&source=bl&ots=tLFqNhEgbH&sig=UiT8nAfOB6uzvNLwMfTEGAw9dk0&hl=en&sa=X&ved=0ahUKEwjTqrP4-c3WAhWOZFAKHdN6CKEQ6AEINDAC#v=onepage&q=pyspark%20continuous%20variables%20into%20binary&f=false

# Useful transformation functions (a hedged MinMaxScaler sketch follows the list):
#Binarizer: converts continuous variables to 1 / 0 depending on a set threshold
#Bucketizer: similar, but for multi-class problems
#MaxAbsScaler: rescale data to the [-1, 1] range
#MinMaxScaler: rescale data to the [0, 1] range
#OneHotEncoder: encodes a categorical column as binary vectors
#PCA: self-explanatory
#StandardScaler: convert so mean = 0 and sd = 1
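
As a quick illustration of one scaler from the list above, here is a minimal MinMaxScaler sketch; it assumes an active SparkSession `spark`, and the toy DataFrame and column names are illustrative only (MinMaxScaler operates on a vector column):

from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

scaler_df = spark.createDataFrame(
    [(Vectors.dense([10.0]),), (Vectors.dense([20.0]),), (Vectors.dense([30.0]),)],
    ["features"])
scaler = MinMaxScaler(inputCol="features", outputCol="scaled")  # rescales to [0, 1]
scaler.fit(scaler_df).transform(scaler_df).show()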

from pyspark.ml.feature import Binarizer

binarizer = Binarizer(threshold=500,
                      inputCol="Yearly Amount Spent",
                      outputCol="label")
binarizedDataFrame = binarizer.transform(final_data)
binarizedDataFrame = binarizedDataFrame.drop("Yearly Amount Spent")
binarizedDataFrame.show()

from pyspark.ml.classification import LogisticRegression

logReg = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
fitted_logReg = logReg.fit(binarizedDataFrame)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(fitted_logReg.coefficients))
print("Intercept: " + str(fitted_logReg.intercept))
#log_summary = fitted_logReg.summary()
Example #24
qty_indexer = StringIndexer(inputCol="quantity",
                            outputCol="quantity_indexed",
                            handleInvalid="skip")
qty_df = qty_indexer.fit(ohe_df).transform(ohe_df)
qty_df.select("quantity", "quantity_indexed").display()

# COMMAND ----------

# MAGIC %md Binarization of continuous numerical data.

# COMMAND ----------

from pyspark.ml.feature import Binarizer

binarizer = Binarizer(threshold=10,
                      inputCol="unit_price",
                      outputCol="binarized_price")
binarized_df = binarizer.transform(qty_df)
binarized_df.select("quantity", "binarized_price").display()

# COMMAND ----------

# MAGIC %md Transforming date/time columns

# COMMAND ----------

from pyspark.sql.functions import month

month_df = binarized_df.withColumn("invoice_month", month("invoice_time"))
month_indexer = StringIndexer(inputCol="invoice_month",
                              outputCol="month_indexed",
Example #25
def binarizer(dataset, inputCol, threshold=0.5):
    from pyspark.ml.feature import Binarizer
    return Binarizer(threshold=threshold,
                     inputCol=inputCol,
                     outputCol=inputCol + "_binarized").transform(dataset)
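
A hypothetical one-line usage, assuming a DataFrame `df` with a numeric column named "score" (both names are illustrative):

binarized_df = binarizer(df, "score", threshold=0.7)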
Example #26
# In[8]:

df = df.na.drop()

# In[9]:

#df.count(),len(df.columns)

# Creating a categorical variable: let's create a categorical variable to denote whether the humidity is not low. If the value is less than 25%, the categorical value should be 0; otherwise it should be 1. We can create this categorical variable as a column in a DataFrame using Binarizer.

# In[10]:

# binarizer = Binarizer(threshold=24.99999,inputCol="relative_humidity_3pm",outputCol="label")

binarizer = Binarizer(threshold=24.99999,
                      inputCol=target_col,
                      outputCol="label")
binarizedDF = binarizer.transform(df)

# In[11]:

#binarizedDF.describe()

# # Creating target variable named label

# The threshold argument specifies the threshold value for the variable, inputCol is the input column to read, and outputCol is the name of the new categorical column. The second line applies the Binarizer and creates a new DataFrame with the categorical column. We can look at the first four values in the new DataFrame:

# In[12]:

#binarizedDF.select("relative_humidity_3pm","label").show(4)
Example #27
remover = StopWordsRemover()    \
  .setInputCol("tokens")        \
  .setOutputCol("stopWordFree")

counts = CountVectorizer()      \
  .setInputCol("stopWordFree")  \
  .setOutputCol("features")     \
  .setVocabSize(1000)

# COMMAND ----------

from pyspark.ml.feature import Binarizer

# Convert to 0/1
binarizer = Binarizer()  \
  .setInputCol("rating") \
  .setOutputCol("label") \
  .setThreshold(3.5)

# COMMAND ----------

# MAGIC %md ### Workflows with Pyspark.ML Pipeline
# MAGIC <img src="https://s3-us-west-2.amazonaws.com/pub-tc/ML-workflow.png" width="640">

# COMMAND ----------

# MAGIC %md ### Train Classifier

# COMMAND ----------

from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
Example #28
df = df.join(piv_df, on='NO', how='left')

# Columns to zero fill
zfill_cols = piv_df.columns
zfill_cols.remove('NO')  # list.remove() mutates in place and returns None

# Zero fill the pivoted values
df = df.fillna(0, subset=zfill_cols)

Binarizing Day of Week
In a previous video, we saw that it was very unlikely for a home to list on the weekend. Let's create a new field that says whether the house is listed for sale on a weekday or not. In this example there is a field called List_Day_of_Week where Monday is labeled 1.0 and Sunday is 7.0. Let's convert this to a binary field with weekday being 0 and weekend being 1. We can use the pyspark feature transformer Binarizer to do this.

# Import transformer
from pyspark.ml.feature import Binarizer

# Create the transformer
binarizer = Binarizer(threshold=5.0, inputCol='List_Day_of_Week', outputCol='Listed_On_Weekend')

# Apply the transformation to df
df = binarizer.transform(df)

# Verify transformation
df[['List_Day_of_Week', 'Listed_On_Weekend']].show()

Bucketing
If you are a homeowner, it's very important whether a house has 1, 2, 3 or 4 bedrooms. But like bathrooms, once you hit a certain point you don't really care whether the house has 7 or 8. In this example we'll look at how to find good value points to bucket on; a hedged Bucketizer sketch follows the plot.

import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.ml.feature import Bucketizer

# Plot distribution of sample_df
sns.distplot(sample_df, axlabel='BEDROOMS')
plt.show()
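
Once split points are chosen from the plot, applying Bucketizer might look like the following minimal sketch; the splits and output column name are assumptions for illustration, not from the original exercise:

splits = [0.0, 1.0, 2.0, 3.0, 4.0, float('inf')]  # assumed splits; buckets 4+ bedrooms together
bucketizer = Bucketizer(splits=splits, inputCol='BEDROOMS', outputCol='BEDROOMS_bucketed')
df = bucketizer.transform(df)
df[['BEDROOMS', 'BEDROOMS_bucketed']].show()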
Example #29
# **Note:** `__THIS__` is a placeholder for the DataFrame passed into the `transform` method.


# ## Generate label

# We can treat `star_rating` as a continuous numerical label or an ordered
# categorical label:
filtered.groupBy("star_rating").count().orderBy("star_rating").show()

# Rather than try to predict each value, let us see if we can distinguish
# between five-star and non-five-star ratings.  We can use the
# [Binarizer](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.Binarizer)
# to create our binary label:
from pyspark.ml.feature import Binarizer
converted = filtered.withColumn("star_rating", col("star_rating").cast("double"))
binarizer = Binarizer(inputCol="star_rating", outputCol="high_rating", threshold = 4.5)
labeled = binarizer.transform(converted)
labeled.crosstab("star_rating", "high_rating").show()

# **Note:** `Binarizer` does not like integer values, thus we had to convert to doubles.


# ## Extract, transform, and select features

# Create function to explore features:
def explore(df, feature, label, plot=True):
  from pyspark.sql.functions import count, mean
  aggregated = df.groupby(feature).agg(count(label), mean(label)).orderBy(feature)
  aggregated.show()
  if plot == True:
    pdf = aggregated.toPandas()
Example #30
from pyspark.ml.feature import Binarizer
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("binarizer").master(
        "local").getOrCreate()

    dataFrame = spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.3)],
                                      ["id", "feature"])

    binarizer = Binarizer(inputCol="feature",
                          outputCol="binarized",
                          threshold=0.5)

    binarizedDataFrame = binarizer.transform(dataFrame)

    binarizedDataFrame.show()

    spark.stop()