Example #1
    def test_default_params_transferred(self):
        dataset = self.spark.createDataFrame([(0.5,)], ["data"])
        binarizer = Binarizer(inputCol="data")
        # intentionally change the pyspark default, but don't set it
        binarizer._defaultParamMap[binarizer.outputCol] = "my_default"
        result = binarizer.transform(dataset).select("my_default").collect()
        self.assertFalse(binarizer.isSet(binarizer.outputCol))
        self.assertEqual(result[0][0], 1.0)
Example #2
def prep_data(sqlContext, data, drops):
    """Prepares date for ML. Preparation includes: making a label column (by the rule: naacess > 10),
	applying drops and transforming data into LabeledPoint"""

    binarizer = Binarizer(threshold=10.0, inputCol="naccess", outputCol="target")
    data = binarizer.transform(data)

    drops = drops.split(",")
    cols = [x for x in data.columns if x not in set(drops)]

    data = data.select(cols)

    labeled = label_data(data)
    prepped_data = sqlContext.createDataFrame(labeled, ['features', 'label'])

    return prepped_data
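
# The helper label_data is not defined in this snippet. A minimal, hypothetical
# sketch of what it might look like, assuming the Binarizer's "target" column
# is the label and every remaining column is a numeric feature:
from pyspark.mllib.linalg import Vectors

def label_data(data):
    # Assumed behavior: emit (features, label) tuples so the caller's
    # createDataFrame(labeled, ['features', 'label']) lines up.
    feature_cols = [c for c in data.columns if c != "target"]
    return data.rdd.map(lambda row: (
        Vectors.dense([row[c] for c in feature_cols]), float(row["target"])))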
Example #3
    def test_preserve_set_state(self):
        dataset = self.spark.createDataFrame([(0.5,)], ["data"])
        binarizer = Binarizer(inputCol="data")
        self.assertFalse(binarizer.isSet("threshold"))
        binarizer.transform(dataset)
        binarizer._transfer_params_from_java()
        self.assertFalse(binarizer.isSet("threshold"),
                         "Params not explicitly set should remain unset after transform")
Example #4

from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Binarizer
# $example off$

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("BinarizerExample")\
        .getOrCreate()

    # $example on$
    continuousDataFrame = spark.createDataFrame([
        (0, 0.1),
        (1, 0.8),
        (2, 0.2)
    ], ["id", "feature"])

    binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")

    binarizedDataFrame = binarizer.transform(continuousDataFrame)

    print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
    binarizedDataFrame.show()
    # $example off$

    spark.stop()
Example #5

from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import Binarizer
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="BinarizerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    continuousDataFrame = sqlContext.createDataFrame([
        (0, 0.1),
        (1, 0.8),
        (2, 0.2)
    ], ["label", "feature"])
    binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")
    binarizedDataFrame = binarizer.transform(continuousDataFrame)
    binarizedFeatures = binarizedDataFrame.select("binarized_feature")
    for binarized_feature, in binarizedFeatures.collect():
        print(binarized_feature)
    # $example off$

    sc.stop()
Example #6
# pyspark.ml.feature module

from pyspark.ml.feature import Binarizer
df = spark.createDataFrame([(0.5,)], ["values"])
df.collect()                  # [Row(values=0.5)]
binarizer = Binarizer(threshold=1.0, inputCol="values", outputCol="features")
df2 = binarizer.transform(df)
df2.dtypes                    # [('values', 'double'), ('features', 'double')]
df.collect()                  # the input frame is unchanged
df2.collect()                 # [Row(values=0.5, features=0.0)] -- 0.5 <= threshold
binarizer.getOutputCol()      # 'features'

# rawData is assumed to be a DataFrame loaded earlier in the session.
rawData.take(1)
binarizer2 = Binarizer(threshold=0.5, inputCol="srv_diff_host_rate", outputCol="features")
binarizer2.transform(rawData)

binarizer.explainParam('inputCol')  # human-readable description of a single param
binarizer.inputCol
binarizer.params

rawData.select(['count']).show()


rawData.dtypes
from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="y_label", outputCol='indexed_y_label')
model = stringIndexer.fit(rawData)
td = model.transform(rawData)
td.dtypes
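
# The fitted StringIndexerModel also exposes the mapping it learned; a quick
# way to inspect it (using the model fitted above):
model.labels  # ordered list of original labels; index i in indexed_y_label maps to labels[i]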
Example #7
    def test_binarizer(self):
        b0 = Binarizer()
        self.assertListEqual(b0.params, [b0.inputCol, b0.outputCol, b0.threshold])
        self.assertTrue(all([not b0.isSet(p) for p in b0.params]))
        self.assertTrue(b0.hasDefault(b0.threshold))
        self.assertEqual(b0.getThreshold(), 0.0)
        b0.setParams(inputCol="input", outputCol="output").setThreshold(1.0)
        self.assertTrue(all([b0.isSet(p) for p in b0.params]))
        self.assertEqual(b0.getThreshold(), 1.0)
        self.assertEqual(b0.getInputCol(), "input")
        self.assertEqual(b0.getOutputCol(), "output")

        b0c = b0.copy({b0.threshold: 2.0})
        self.assertEqual(b0c.uid, b0.uid)
        self.assertListEqual(b0c.params, b0.params)
        self.assertEqual(b0c.getThreshold(), 2.0)

        b1 = Binarizer(threshold=2.0, inputCol="input", outputCol="output")
        self.assertNotEqual(b1.uid, b0.uid)
        self.assertEqual(b1.getThreshold(), 2.0)
        self.assertEqual(b1.getInputCol(), "input")
        self.assertEqual(b1.getOutputCol(), "output")
Example #8
from pyspark.sql import SparkSession
from pyspark.ml.feature import Binarizer

# Binarization is the process of thresholding numerical features to binary
# (0/1) features.
# Binarizer takes the common parameters inputCol and outputCol, as well as the
# threshold for binarization. Feature values greater than the threshold are
# binarized to 1.0; values equal to or less than the threshold are binarized to
# 0.0. Both Vector and Double types are supported for inputCol.

spark = SparkSession.builder.appName("Binarizer").getOrCreate()

continuousDataFrame = spark.createDataFrame([
    (0, 0.1),
    (1, 0.8),
    (2, 0.2)
], ["id", "feature"])

binarizer = Binarizer(inputCol="feature", outputCol="binarized_feature",
                      threshold=0.5)
binarizedDataFrame = binarizer.transform(continuousDataFrame)

print("Binarized output with threshold = %f" % binarizer.getThreshold())
binarizedDataFrame.show()
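
# The comment above notes that Binarizer also supports Vector input; a
# minimal sketch of that case (column names here are illustrative):
from pyspark.ml.linalg import Vectors

vectorDataFrame = spark.createDataFrame([
    (Vectors.dense([0.1, 0.8, 0.2]),)
], ["features"])
vectorBinarizer = Binarizer(threshold=0.5, inputCol="features",
                            outputCol="binarized_features")
# each element is thresholded independently -> [0.0, 1.0, 0.0]
vectorBinarizer.transform(vectorDataFrame).show(truncate=False)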

spark.stop()
Example #9
    def test_binarizer(self):
        b0 = Binarizer()
        self.assertListEqual(
            b0.params,
            [b0.inputCol, b0.inputCols, b0.outputCol, b0.outputCols, b0.threshold, b0.thresholds],
        )
        self.assertTrue(all([not b0.isSet(p) for p in b0.params]))
        self.assertTrue(b0.hasDefault(b0.threshold))
        self.assertEqual(b0.getThreshold(), 0.0)
        b0.setParams(inputCol="input", outputCol="output").setThreshold(1.0)
        self.assertTrue(not all([b0.isSet(p) for p in b0.params]))
        self.assertEqual(b0.getThreshold(), 1.0)
        self.assertEqual(b0.getInputCol(), "input")
        self.assertEqual(b0.getOutputCol(), "output")

        b0c = b0.copy({b0.threshold: 2.0})
        self.assertEqual(b0c.uid, b0.uid)
        self.assertListEqual(b0c.params, b0.params)
        self.assertEqual(b0c.getThreshold(), 2.0)

        b1 = Binarizer(threshold=2.0, inputCol="input", outputCol="output")
        self.assertNotEqual(b1.uid, b0.uid)
        self.assertEqual(b1.getThreshold(), 2.0)
        self.assertEqual(b1.getInputCol(), "input")
        self.assertEqual(b1.getOutputCol(), "output")
Example #10
# ## Generate label

# We can treat `star_rating` as a continuous numerical label or an ordered
# categorical label:
filtered.groupBy("star_rating").count().orderBy("star_rating").show()

# Rather than try to predict each value, let us see if we can distinguish
# between five-star and non-five-star ratings.  We can use the
# [Binarizer](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.Binarizer)
# to create our binary label:
from pyspark.ml.feature import Binarizer
from pyspark.sql.functions import col

converted = filtered.withColumn("star_rating",
                                col("star_rating").cast("double"))
binarizer = Binarizer(inputCol="star_rating",
                      outputCol="high_rating",
                      threshold=4.5)
labeled = binarizer.transform(converted)
labeled.crosstab("star_rating", "high_rating").show()

# **Note:** `Binarizer` requires a `DoubleType` (or `Vector`) input column, which is why we cast the integer ratings to doubles first.

# ## Extract, transform, and select features


# Create function to explore features:
def explore(df, feature, label, plot=True):
    from pyspark.sql.functions import count, mean
    aggregated = df.groupby(feature).agg(count(label),
                                         mean(label)).orderBy(feature)
    aggregated.show()
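
# A usage sketch with the columns created above: how the count and mean of the
# binary label vary with the raw star rating.
explore(labeled, "star_rating", "high_rating")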
Example #11
        student = gateway.jvm.Student("weiyduang")

        gateway.help(student)

        student.show()
        a = student.returnValue()

        # test 2
        java_import(gateway.jvm, "org.apache.spark.ml.feature.Abs")
        dd = gateway.jvm.Abs()
        gateway.help(dd)

        # test 3

        bin_ = Binarizer(threshold=0, inputCol='random', outputCol='bin_feature')
        abs_ = Abs(inputCol='random', outputCol='abs_feature')
        vc = VectorAssembler(inputCols=['random', 'abs_feature'], outputCol="features")
        lr = LogisticRegression()
        lr.setLabelCol("bin_feature")

        # assemble the stages into a single pipeline and fit it
        pipeline = Pipeline(stages=[bin_, abs_, vc, lr])
        model = pipeline.fit(df)
        bin_df = model.transform(df)
        bin_df.show()

        print('load model and save model')
        print("---*-***--" * 20)
        model.write().overwrite().save("./abs.model")
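
        # Only the save is shown above; loading it back would look like this
        # (the custom Abs stage must be importable for the load to succeed):
        from pyspark.ml import PipelineModel
        loaded = PipelineModel.load("./abs.model")
        loaded.transform(df).show()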
Example #12
df = df.drop('number')

# In[8]:

df = df.na.drop()

# In[9]:

#df.count(),len(df.columns)

# Creating a categorical variable: let's create a categorical variable to denote whether the humidity is not low. If the value is less than 25%, we want the categorical value to be 0; otherwise it should be 1. We can create this categorical variable as a column in the DataFrame using Binarizer.

# In[10]:

binarizer = Binarizer(threshold=24.99999,
                      inputCol="relative_humidity_3pm",
                      outputCol="label")
binarizedDF = binarizer.transform(df)

# In[11]:

#binarizedDF.describe()

# # Creating target variable named label

# The threshold argument specifies the threshold value for the variable, inputCol is the input column to read, and outputCol is the name of the new categorical column. The second line applies the Binarizer and creates a new DataFrame with the categorical column. We can look at the first four values in the new DataFrame:

# In[12]:

#binarizedDF.select("relative_humidity_3pm","label").show(4)
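
# As a further sanity check, the distribution of the new binary label can be
# inspected (a minimal sketch using the binarizedDF created above):
binarizedDF.groupBy("label").count().show()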