Example #1
    def test_binarizer(self):
        b0 = Binarizer()
        self.assertListEqual(b0.params, [
            b0.inputCol, b0.inputCols, b0.outputCol, b0.outputCols,
            b0.threshold, b0.thresholds
        ])
        self.assertTrue(all([not b0.isSet(p) for p in b0.params]))
        self.assertTrue(b0.hasDefault(b0.threshold))
        self.assertEqual(b0.getThreshold(), 0.0)
        b0.setParams(inputCol="input", outputCol="output").setThreshold(1.0)
        self.assertTrue(not all([b0.isSet(p) for p in b0.params]))
        self.assertEqual(b0.getThreshold(), 1.0)
        self.assertEqual(b0.getInputCol(), "input")
        self.assertEqual(b0.getOutputCol(), "output")

        b0c = b0.copy({b0.threshold: 2.0})
        self.assertEqual(b0c.uid, b0.uid)
        self.assertListEqual(b0c.params, b0.params)
        self.assertEqual(b0c.getThreshold(), 2.0)

        b1 = Binarizer(threshold=2.0, inputCol="input", outputCol="output")
        self.assertNotEqual(b1.uid, b0.uid)
        self.assertEqual(b1.getThreshold(), 2.0)
        self.assertEqual(b1.getInputCol(), "input")
        self.assertEqual(b1.getOutputCol(), "output")
Example #2
from pyspark.ml.feature import Binarizer


def binarization_by_threshold(dataFrame, threshold, inputCol):
    # Binarize the continuous column at the given threshold
    binarizer = Binarizer(threshold=threshold,
                          inputCol=inputCol,
                          outputCol='%s_binarized' % inputCol)
    binarizedDataFrame = binarizer.transform(dataFrame)
    print('Binarizer output with Threshold = %f' % binarizer.getThreshold())
    return binarizedDataFrame
def pre_processing(continuousDataFrame):
    binarizer = Binarizer(threshold=0.5,
                          inputCol="feature",
                          outputCol="binarized_feature")

    binarizedDataFrame = binarizer.transform(continuousDataFrame)

    print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
    binarizedDataFrame.show()
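Neither helper above is invoked in the snippet; a minimal usage sketch follows (the SparkSession and toy DataFrame are assumptions for illustration, not part of the original example):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("binarizer-demo").getOrCreate()

# Hypothetical input: one continuous column named "feature"
continuousDataFrame = spark.createDataFrame(
    [(0, 0.1), (1, 0.8), (2, 0.2)], ["id", "feature"])

# Adds a "feature_binarized" column: values > 0.5 map to 1.0, the rest to 0.0
binarization_by_threshold(continuousDataFrame, 0.5, "feature").show()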
Example #4
    def binarizer(self, df, column):
        """
        Binarize the given column at a fixed threshold using Binarizer.
        """
        # Binarize continuous values at the threshold
        binarizer = Binarizer(threshold=5.1,
                              inputCol=column,
                              outputCol=column + '_binarized_feature')
        binarizedDataFrame = binarizer.transform(df)
        print('Binarizer output with Threshold = %f' %
              binarizer.getThreshold())
        return binarizedDataFrame
Example #5
    def test_binarizer(self):
        b0 = Binarizer()
        self.assertListEqual(b0.params, [b0.inputCol, b0.outputCol, b0.threshold])
        self.assertTrue(all([not b0.isSet(p) for p in b0.params]))
        self.assertTrue(b0.hasDefault(b0.threshold))
        self.assertEqual(b0.getThreshold(), 0.0)
        b0.setParams(inputCol="input", outputCol="output").setThreshold(1.0)
        self.assertTrue(all([b0.isSet(p) for p in b0.params]))
        self.assertEqual(b0.getThreshold(), 1.0)
        self.assertEqual(b0.getInputCol(), "input")
        self.assertEqual(b0.getOutputCol(), "output")

        b0c = b0.copy({b0.threshold: 2.0})
        self.assertEqual(b0c.uid, b0.uid)
        self.assertListEqual(b0c.params, b0.params)
        self.assertEqual(b0c.getThreshold(), 2.0)

        b1 = Binarizer(threshold=2.0, inputCol="input", outputCol="output")
        self.assertNotEqual(b1.uid, b0.uid)
        self.assertEqual(b1.getThreshold(), 2.0)
        self.assertEqual(b1.getInputCol(), "input")
        self.assertEqual(b1.getOutputCol(), "output")
Example #6
# COMMAND ----------

### Binarizer takes numerical inputs and converts them into binary output (0 or 1) with respect to the provided threshold
from pyspark.ml.feature import Binarizer

continuousDataFrame = spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.2)],
                                            ["id", "feature"])

binarizer = Binarizer(threshold=0.5,
                      inputCol="feature",
                      outputCol="binarized_feature")

binarizedDataFrame = binarizer.transform(continuousDataFrame)

print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
binarizedDataFrame.show()
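# Expected output: with threshold = 0.5, values strictly greater than the
# threshold map to 1.0 and the rest to 0.0, so 0.1 -> 0.0, 0.8 -> 1.0, 0.2 -> 0.0.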

# COMMAND ----------

### PCA is a statistical procedure used to reduce a vector's dimensionality. This example reduces a 5-dimensional feature vector to a 3-dimensional PCA feature
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )]
df = spark.createDataFrame(data, ["features"])

pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(df)
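# The snippet above stops at fit(); a minimal completion using the fitted
# PCAModel to actually project the features down to k = 3:
result = model.transform(df).select("pcaFeatures")
result.show(truncate=False)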
Example #7
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Binarizer
# $example off$

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("BinarizerExample")\
        .getOrCreate()

    # $example on$
    continuousDataFrame = spark.createDataFrame([
        (0, 0.1),
        (1, 0.8),
        (2, 0.2)
    ], ["id", "feature"])

    binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")

    binarizedDataFrame = binarizer.transform(continuousDataFrame)

    print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
    binarizedDataFrame.show()
    # $example off$

    spark.stop()
Example #8
from pyspark.ml.feature import StopWordsRemover, NGram, Binarizer
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("stopwordsremover").getOrCreate()

seqdata = spark.createDataFrame([(0, ["I", "saw", "the", "red", "balloon"]),
                                 (1, ["Mary", "had", "a", "little", "lamb"])],
                                ["id", "raw"])
remover = StopWordsRemover(inputCol="raw", outputCol="Filtered")
remover.transform(seqdata).show(truncate=False)

wordDataFrame = spark.createDataFrame(
    [(0, ["Hi", "I", "heard", "about", "Spark"]),
     (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
     (2, ["Logistic", "regression", "models", "are", "neat"])],
    ["id", "words"])
ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
ngramdf = ngram.transform(wordDataFrame)
ngramdf.select("ngrams").show(truncate=False)

continuousDataFrame = spark.createDataFrame([(0, 0.1), (1, 0.8), (2, 0.2)],
                                            ["id", "feature"])
binarizer = Binarizer(threshold=0.1,
                      inputCol="feature",
                      outputCol="binarized_feature")
bnzrDataframe = binarizer.transform(continuousDataFrame)
print("binarizer threshold", binarizer.getThreshold())
bnzrDataframe.show(truncate=False)
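# Expected binarized values with threshold = 0.1 (strict greater-than comparison):
# 0.1 -> 0.0, 0.8 -> 1.0, 0.2 -> 1.0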