# --- Feature engineering: encode categorical columns, assemble features ---
df3.show()
df3.printSchema()

# One-hot encode the indexed colour column. dropLast=False keeps one vector
# slot per colour level instead of dropping the final (reference) category.
df3 = OneHotEncoder(inputCol='color1', outputCol='color2',
                    dropLast=False).transform(df3)
df3.printSchema()

# Index the string label column 'type' into a numeric 'type1' column.
# NOTE(review): the indexer is fitted on df2 but applied to df3 — this only
# works because df3 carries the same 'type' values; confirm this is intended
# (fitting on df3 directly would be the conventional form).
df4 = StringIndexer(inputCol='type', outputCol='type1').fit(df2).transform(df3)
df4.show()
df4.printSchema()

# Assemble the model input columns into a single 'Features' vector column.
# NOTE(review): 'id' is included as a feature — a row identifier is usually
# not predictive; confirm this is intended before relying on the model.
df5 = VectorAssembler(
    inputCols=['id', 'bone_length', 'rotting_flesh', 'hair_length',
               'has_soul', 'color2'],
    outputCol='Features',
).transform(df4)
df5.show(truncate=False)
df5.printSchema()

# --------------------------------------------------------------------------
# Data processing complete.

# 6. Model building: train a random forest on the assembled features.
training = df5

from pyspark.ml.classification import RandomForestClassifier

df1 = RandomForestClassifier(featuresCol='Features', labelCol='type1',
                             numTrees=86, maxDepth=10)
model22 = df1.fit(training)

# Fix: the original bare `model22.getNumTrees` expression had no effect in a
# script; print the value so the inspection actually produces output.
print(model22.getNumTrees)

# Score the training set (prediction columns are appended to the frame).
training2 = model22.transform(training)
train_2 = train_1.withColumn("x", train_1["oldX"].cast("float")).drop("oldX") train_2 = train_2.withColumn( "label", train_1["oldLabel"].cast("float")).drop("oldLabel") train_2.cache() train_2.show() train_2.printSchema() train_2.printSchema() print(train_2.dtypes) train_2.describe().show() # Converting "features" column in a Vector column train_2 = VectorAssembler(inputCols=["x"], outputCol="feature").transform(train_2) train_2.printSchema() # Plotting Dataset f, axarr = plt.subplots(2, sharex=True) # Converting "features" DenseVector column to NPy Array npFeatures = np.array([]) for i in train_2.collect(): npFeatures = np.append(npFeatures, i['feature'].toArray()) # Converting "label" DenseVector column to NPy Array npLabels = np.array([]) for i in train_2.collect(): npLabels = np.append(npLabels, i['label']) axarr[0].plot(npFeatures, npLabels, label="Data", linewidth=2) # Pipeline: Polynomial expansion, Linear Regression and label vs. prediction charts for every degree for degree in [5, 6, 7]: