Example #1
    def test_default_params_transferred(self):
        dataset = self.spark.createDataFrame([(0.5,)], ["data"])
        binarizer = Binarizer(inputCol="data")
        # intentionally change the pyspark default, but don't set it
        binarizer._defaultParamMap[binarizer.outputCol] = "my_default"
        result = binarizer.transform(dataset).select("my_default").collect()
        self.assertFalse(binarizer.isSet(binarizer.outputCol))
        self.assertEqual(result[0][0], 1.0)
Example #2
def prep_data(sqlContext, data, drops):
    """Prepares date for ML. Preparation includes: making a label column (by the rule: naacess > 10),
	applying drops and transforming data into LabeledPoint"""

    binarizer = Binarizer(threshold=10.0, inputCol="naccess", outputCol="target")
    data = binarizer.transform(data)

    drops = drops.split(",")
    cols = [x for x in data.columns if x not in set(drops)]

    data = data.select(cols)

    labeled = label_data(data)
    prepped_data = sqlContext.createDataFrame(labeled, ['features', 'label'])

    return prepped_data
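
# The helper label_data is not defined in this snippet. A minimal, hypothetical
# sketch of what it might look like, assuming the Binarizer's "target" column
# is the label and every remaining column is a numeric feature:
from pyspark.mllib.linalg import Vectors

def label_data(data):
    # Assumed behavior: emit (features, label) tuples so the caller's
    # createDataFrame(labeled, ['features', 'label']) lines up.
    feature_cols = [c for c in data.columns if c != "target"]
    return data.rdd.map(lambda row: (
        Vectors.dense([row[c] for c in feature_cols]), float(row["target"])))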
Example #3
    def test_preserve_set_state(self):
        dataset = self.spark.createDataFrame([(0.5,)], ["data"])
        binarizer = Binarizer(inputCol="data")
        self.assertFalse(binarizer.isSet("threshold"))
        binarizer.transform(dataset)
        binarizer._transfer_params_from_java()
        self.assertFalse(binarizer.isSet("threshold"),
                         "Params not explicitly set should remain unset after transform")
Example #4

from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.feature import Binarizer
# $example off$

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("BinarizerExample")\
        .getOrCreate()

    # $example on$
    continuousDataFrame = spark.createDataFrame([
        (0, 0.1),
        (1, 0.8),
        (2, 0.2)
    ], ["id", "feature"])

    binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")

    binarizedDataFrame = binarizer.transform(continuousDataFrame)

    print("Binarizer output with Threshold = %f" % binarizer.getThreshold())
    binarizedDataFrame.show()
    # $example off$

    spark.stop()
Example #5

from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import Binarizer
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="BinarizerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    continuousDataFrame = sqlContext.createDataFrame([
        (0, 0.1),
        (1, 0.8),
        (2, 0.2)
    ], ["label", "feature"])
    binarizer = Binarizer(threshold=0.5, inputCol="feature", outputCol="binarized_feature")
    binarizedDataFrame = binarizer.transform(continuousDataFrame)
    binarizedFeatures = binarizedDataFrame.select("binarized_feature")
    for binarized_feature, in binarizedFeatures.collect():
        print(binarized_feature)
    # $example off$

    sc.stop()
Example #6
# pyspark.ml.feature module

from pyspark.ml.feature import Binarizer
df = spark.createDataFrame([(0.5,)], ["values"])
df.collect()                  # [Row(values=0.5)]
binarizer = Binarizer(threshold=1.0, inputCol="values", outputCol="features")
df2 = binarizer.transform(df)
df2.dtypes                    # [('values', 'double'), ('features', 'double')]
df.collect()                  # the input frame is unchanged
df2.collect()                 # [Row(values=0.5, features=0.0)] -- 0.5 <= threshold
binarizer.getOutputCol()      # 'features'

# rawData is assumed to be a DataFrame loaded earlier in the session.
rawData.take(1)
binarizer2 = Binarizer(threshold=0.5, inputCol="srv_diff_host_rate", outputCol="features")
binarizer2.transform(rawData)

binarizer.explainParam('inputCol')  # human-readable description of a single param
binarizer.inputCol
binarizer.params

rawData.select(['count']).show()


rawData.dtypes
from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="y_label", outputCol='indexed_y_label')
model = stringIndexer.fit(rawData)
td = model.transform(rawData)
td.dtypes
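
# The fitted StringIndexerModel also exposes the mapping it learned; a quick
# way to inspect it (using the model fitted above):
model.labels  # ordered list of original labels; index i in indexed_y_label maps to labels[i]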
Example #7
    def test_binarizer(self):
        b0 = Binarizer()
        self.assertListEqual(b0.params, [b0.inputCol, b0.outputCol, b0.threshold])
        self.assertTrue(all([not b0.isSet(p) for p in b0.params]))
        self.assertTrue(b0.hasDefault(b0.threshold))
        self.assertEqual(b0.getThreshold(), 0.0)
        b0.setParams(inputCol="input", outputCol="output").setThreshold(1.0)
        self.assertTrue(all([b0.isSet(p) for p in b0.params]))
        self.assertEqual(b0.getThreshold(), 1.0)
        self.assertEqual(b0.getInputCol(), "input")
        self.assertEqual(b0.getOutputCol(), "output")

        b0c = b0.copy({b0.threshold: 2.0})
        self.assertEqual(b0c.uid, b0.uid)
        self.assertListEqual(b0c.params, b0.params)
        self.assertEqual(b0c.getThreshold(), 2.0)

        b1 = Binarizer(threshold=2.0, inputCol="input", outputCol="output")
        self.assertNotEqual(b1.uid, b0.uid)
        self.assertEqual(b1.getThreshold(), 2.0)
        self.assertEqual(b1.getInputCol(), "input")
        self.assertEqual(b1.getOutputCol(), "output")
Example #8
from pyspark.sql import SparkSession
from pyspark.ml.feature import Binarizer

# Binarization is the process of thresholding numerical features to binary
# (0/1) features.
# Binarizer takes the common parameters inputCol and outputCol, as well as the
# threshold for binarization. Feature values greater than the threshold are
# binarized to 1.0; values equal to or less than the threshold are binarized to
# 0.0. Both Vector and Double types are supported for inputCol.

spark = SparkSession.builder.appName("Binarizer").getOrCreate()

continuousDataFrame = spark.createDataFrame([
    (0, 0.1),
    (1, 0.8),
    (2, 0.2)
], ["id", "feature"])

binarizer = Binarizer(inputCol="feature", outputCol="binarized_feature",
                      threshold=0.5)
binarizedDataFrame = binarizer.transform(continuousDataFrame)

print("Binarized output with threshold = %f" % binarizer.getThreshold())
binarizedDataFrame.show()
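
# The comment above notes that Binarizer also supports Vector input; a
# minimal sketch of that case (column names here are illustrative):
from pyspark.ml.linalg import Vectors

vectorDataFrame = spark.createDataFrame([
    (Vectors.dense([0.1, 0.8, 0.2]),)
], ["features"])
vectorBinarizer = Binarizer(threshold=0.5, inputCol="features",
                            outputCol="binarized_features")
# each element is thresholded independently -> [0.0, 1.0, 0.0]
vectorBinarizer.transform(vectorDataFrame).show(truncate=False)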

spark.stop()
Example #9
    def test_binarizer(self):
        b0 = Binarizer()
        self.assertListEqual(
            b0.params,
            [b0.inputCol, b0.inputCols, b0.outputCol, b0.outputCols, b0.threshold, b0.thresholds],
        )
        self.assertTrue(all([not b0.isSet(p) for p in b0.params]))
        self.assertTrue(b0.hasDefault(b0.threshold))
        self.assertEqual(b0.getThreshold(), 0.0)
        b0.setParams(inputCol="input", outputCol="output").setThreshold(1.0)
        self.assertTrue(not all([b0.isSet(p) for p in b0.params]))
        self.assertEqual(b0.getThreshold(), 1.0)
        self.assertEqual(b0.getInputCol(), "input")
        self.assertEqual(b0.getOutputCol(), "output")

        b0c = b0.copy({b0.threshold: 2.0})
        self.assertEqual(b0c.uid, b0.uid)
        self.assertListEqual(b0c.params, b0.params)
        self.assertEqual(b0c.getThreshold(), 2.0)

        b1 = Binarizer(threshold=2.0, inputCol="input", outputCol="output")
        self.assertNotEqual(b1.uid, b0.uid)
        self.assertEqual(b1.getThreshold(), 2.0)
        self.assertEqual(b1.getInputCol(), "input")
        self.assertEqual(b1.getOutputCol(), "output")
Example #10
# ## Generate label

# We can treat `star_rating` as a continuous numerical label or an ordered
# categorical label:
filtered.groupBy("star_rating").count().orderBy("star_rating").show()

# Rather than try to predict each value, let us see if we can distinguish
# between five-star and non-five-star ratings.  We can use the
# [Binarizer](http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.Binarizer)
# to create our binary label:
from pyspark.ml.feature import Binarizer
from pyspark.sql.functions import col

converted = filtered.withColumn("star_rating",
                                col("star_rating").cast("double"))
binarizer = Binarizer(inputCol="star_rating",
                      outputCol="high_rating",
                      threshold=4.5)
labeled = binarizer.transform(converted)
labeled.crosstab("star_rating", "high_rating").show()

# **Note:** `Binarizer` requires a `DoubleType` (or `Vector`) input column, which is why we cast the integer ratings to doubles first.

# ## Extract, transform, and select features


# Create function to explore features:
def explore(df, feature, label, plot=True):
    from pyspark.sql.functions import count, mean
    aggregated = df.groupby(feature).agg(count(label),
                                         mean(label)).orderBy(feature)
    aggregated.show()
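
# A usage sketch with the columns created above: how the count and mean of the
# binary label vary with the raw star rating.
explore(labeled, "star_rating", "high_rating")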
Example #11
        student = gateway.jvm.Student("weiyduang")

        gateway.help(student)

        student.show()
        a = student.returnValue()

        # test 2
        java_import(gateway.jvm, "org.apache.spark.ml.feature.Abs")
        dd = gateway.jvm.Abs()
        gateway.help(dd)

        # test 3

        bin_ = Binarizer(threshold=0, inputCol='random', outputCol='bin_feature')
        abs_ = Abs(inputCol='random', outputCol='abs_feature')
        vc = VectorAssembler(inputCols=['random', 'abs_feature'], outputCol="features")
        lr = LogisticRegression()
        lr.setLabelCol("bin_feature")

        # assemble the stages into a single pipeline and fit it
        pipeline = Pipeline(stages=[bin_, abs_, vc, lr])
        model = pipeline.fit(df)
        bin_df = model.transform(df)
        bin_df.show()

        print('load model and save model')
        print("---*-***--" * 20)
        model.write().overwrite().save("./abs.model")
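
        # Only the save is shown above; loading it back would look like this
        # (the custom Abs stage must be importable for the load to succeed):
        from pyspark.ml import PipelineModel
        loaded = PipelineModel.load("./abs.model")
        loaded.transform(df).show()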
Example #12
df = df.drop('number')

# In[8]:

df = df.na.drop()

# In[9]:

#df.count(),len(df.columns)

# Creating a categorical variable: let's create a categorical variable to denote whether the humidity is not low. If the value is less than 25%, we want the categorical value to be 0; otherwise it should be 1. We can create this categorical variable as a column in the DataFrame using Binarizer.

# In[10]:

binarizer = Binarizer(threshold=24.99999,
                      inputCol="relative_humidity_3pm",
                      outputCol="label")
binarizedDF = binarizer.transform(df)

# In[11]:

#binarizedDF.describe()

# # Creating target variable named label

# The threshold argument specifies the threshold value for the variable, inputCol is the input column to read, and outputCol is the name of the new categorical column. The second line applies the Binarizer and creates a new DataFrame with the categorical column. We can look at the first four values in the new DataFrame:

# In[12]:

#binarizedDF.select("relative_humidity_3pm","label").show(4)
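
# As a further sanity check, the distribution of the new binary label can be
# inspected (a minimal sketch using the binarizedDF created above):
binarizedDF.groupBy("label").count().show()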