Example #1
# Imports used by this snippet; the feature transformers live in pyspark.ml.feature
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

df3.show()
df3.printSchema()
# One-hot encode the indexed colour column. In Spark 3.x OneHotEncoder is an
# Estimator, so it must be fit() before transform(); a bare transform() only
# worked with the Spark 2.x transformer.
df3 = OneHotEncoder(inputCol='color1', outputCol='color2',
                    dropLast=False).fit(df3).transform(df3)
df3.printSchema()
# Index the string label column 'type'; the indexer is fit on df2 and applied
# to df3, which works because 'type' exists in both DataFrames
df4 = StringIndexer(inputCol='type', outputCol='type1').fit(df2).transform(df3)
df4.show()
df4.printSchema()

# Vector assembler: combine the numeric columns and the one-hot 'color2' vector
# into a single 'Features' column (note: 'id' is a row identifier and would not
# normally be used as a feature)
df5 = VectorAssembler(
    inputCols=['id', 'bone_length', 'rotting_flesh', 'hair_length', 'has_soul', 'color2'],
    outputCol='Features').transform(df4)
df5.show(truncate=False)
df5.printSchema()
# --------------------------------------------------------------------------

# Data processing complete
# 6. Model building
training = df5
# training.show(n=5, truncate=False)
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol='Features',
                            labelCol='type1',
                            numTrees=86,
                            maxDepth=10)
model22 = rf.fit(training)
model22.getNumTrees  # number of trees in the fitted model
# model22.numFeatures
training2 = model22.transform(training)  # adds rawPrediction/probability/prediction columns
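
The example stops after scoring the training data. As a minimal follow-up sketch (assuming the default 'prediction' and 'probability' output columns and the 'type1' label created above), the fit could be checked with MulticlassClassificationEvaluator; note this measures training-set accuracy only, so it is optimistic.

# Hedged sketch: inspect the predictions and compute training-set accuracy
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

training2.select('type1', 'prediction', 'probability').show(5, truncate=False)
evaluator = MulticlassClassificationEvaluator(labelCol='type1',
                                              predictionCol='prediction',
                                              metricName='accuracy')
print('training accuracy:', evaluator.evaluate(training2))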
Example #2
    # Assumed imports for this fragment (normally at module top):
    #   import numpy as np
    #   import matplotlib.pyplot as plt
    #   from pyspark.ml.feature import VectorAssembler
    # Cast the raw columns to float and rename them to 'x' and 'label'
    train_2 = train_1.withColumn("x",
                                 train_1["oldX"].cast("float")).drop("oldX")
    train_2 = train_2.withColumn(
        "label", train_2["oldLabel"].cast("float")).drop("oldLabel")
    train_2.cache()
    train_2.show()
    train_2.printSchema()

    print(train_2.dtypes)
    train_2.describe().show()

    # Assemble the 'x' column into a vector column named 'feature'
    train_2 = VectorAssembler(inputCols=["x"],
                              outputCol="feature").transform(train_2)
    train_2.printSchema()

    # Plotting the dataset
    f, axarr = plt.subplots(2, sharex=True)
    # Collect the data once and convert the 'feature' vectors and 'label' values
    # into NumPy arrays for plotting
    npFeatures = np.array([])
    npLabels = np.array([])
    for row in train_2.collect():
        npFeatures = np.append(npFeatures, row['feature'].toArray())
        npLabels = np.append(npLabels, row['label'])
    axarr[0].plot(npFeatures, npLabels, label="Data", linewidth=2)

    # Pipeline: Polynomial expansion, Linear Regression and label vs. prediction charts for every degree
    for degree in [5, 6, 7]: