# NOTE: ZLayer, ZModel, ClassNLLCriterion, Adam, NNClassifier and Estimator
# are assumed to be imported at module level of the original test file; the
# exact import paths depend on the Analytics Zoo / BigDL version in use.

def test_NNEstimator_works_with_VectorAssembler_multi_input(self):
    # Trains a multi-input NNClassifier whose three inputs are carved out
    # of a single VectorAssembler "features" column. Spark 2.x only.
    if self.sc.version.startswith("2"):
        from pyspark.ml.linalg import Vectors
        from pyspark.ml.feature import VectorAssembler
        from pyspark.sql import SparkSession

        spark = SparkSession.builder.getOrCreate()
        df = spark.createDataFrame(
            [(1, 35, 109.0, Vectors.dense([2.0, 5.0, 0.5, 0.5]), 1.0),
             (2, 58, 2998.0, Vectors.dense([4.0, 10.0, 0.5, 0.5]), 2.0),
             (3, 18, 123.0, Vectors.dense([3.0, 15.0, 0.5, 0.5]), 1.0)],
            ["user", "age", "income", "history", "label"])

        # user (1) + age (1) + income (1) + history (4) -> 7-dim "features".
        assembler = VectorAssembler(
            inputCols=["user", "age", "income", "history"],
            outputCol="features")
        df = assembler.transform(df)

        # Three-branch Keras-style model: embedding, dense and LSTM
        # branches concatenated into a 2-class output.
        x1 = ZLayer.Input(shape=(1,))
        x2 = ZLayer.Input(shape=(2,))
        x3 = ZLayer.Input(shape=(2, 2))

        user_embedding = ZLayer.Embedding(5, 10)(x1)
        flatten = ZLayer.Flatten()(user_embedding)
        dense1 = ZLayer.Dense(2)(x2)
        lstm = ZLayer.LSTM(4, input_shape=(2, 2))(x3)
        merged = ZLayer.merge([flatten, dense1, lstm], mode="concat")
        zy = ZLayer.Dense(2)(merged)
        zmodel = ZModel([x1, x2, x3], zy)

        criterion = ClassNLLCriterion()
        # [[1], [2], [2, 2]] splits the 7-dim feature vector into the three
        # model inputs (1 + 2 + 2*2 = 7); see _sketch_multi_input_split below.
        classifier = NNClassifier(zmodel, criterion, [[1], [2], [2, 2]]) \
            .setOptimMethod(Adam()) \
            .setLearningRate(0.1) \
            .setBatchSize(2) \
            .setMaxEpoch(10)

        nnClassifierModel = classifier.fit(df)
        print(nnClassifierModel.getBatchSize())
        res = nnClassifierModel.transform(df).collect()
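
# A minimal sketch, not part of the original tests: it illustrates how a
# [[1], [2], [2, 2]] size spec is expected to carve the 7-dim assembled
# feature vector into the three model inputs. The numpy splitting below is
# an illustration of the contract, not the library's own implementation.
def _sketch_multi_input_split(self):
    import numpy as np
    features = np.arange(7.0)            # stand-in for one assembled row
    tensors, offset = [], 0
    for shape in [[1], [2], [2, 2]]:
        n = int(np.prod(shape))          # elements consumed by this input
        tensors.append(features[offset:offset + n].reshape(shape))
        offset += n
    assert [t.shape for t in tensors] == [(1,), (2,), (2, 2)]
    return tensors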
def test_nnEstimator_multiInput_cols(self):
    # Same three-branch model, trained through Estimator.from_bigdl and fed
    # directly from multiple DataFrame columns via feature_cols, instead of
    # a pre-assembled feature vector.
    from pyspark.ml.linalg import Vectors
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [(1, 35, 109.0, Vectors.dense([2.0, 5.0, 0.5, 0.5]), 1.0),
         (2, 58, 2998.0, Vectors.dense([4.0, 10.0, 0.5, 0.5]), 2.0),
         (3, 18, 123.0, Vectors.dense([3.0, 15.0, 0.5, 0.5]), 1.0),
         (4, 18, 123.0, Vectors.dense([3.0, 15.0, 0.5, 0.5]), 1.0)],
        ["user", "age", "income", "history", "label"])

    x1 = ZLayer.Input(shape=(1,))
    x2 = ZLayer.Input(shape=(2,))
    x3 = ZLayer.Input(shape=(2, 2))

    user_embedding = ZLayer.Embedding(5, 10)(x1)
    flatten = ZLayer.Flatten()(user_embedding)
    dense1 = ZLayer.Dense(2)(x2)
    lstm = ZLayer.LSTM(4, input_shape=(2, 2))(x3)
    # 10 + 2 + 4 = 16 wide; see _sketch_merged_width below.
    merged = ZLayer.merge([flatten, dense1, lstm], mode="concat")
    zy = ZLayer.Dense(2)(merged)
    zmodel = ZModel([x1, x2, x3], zy)

    criterion = ClassNLLCriterion()
    # feature_preprocessing splits the 7 values gathered from feature_cols
    # (user, age, income: 1 each; history: 4) into the three input shapes.
    est = Estimator.from_bigdl(model=zmodel, loss=criterion,
                               optimizer=Adam(learningrate=0.1),
                               feature_preprocessing=[[1], [2], [2, 2]])
    est.fit(df, epochs=1, batch_size=4,
            feature_cols=["user", "age", "income", "history"])

    res = est.predict(df, feature_cols=["user", "age", "income", "history"])
    res_c = res.collect()  # force evaluation of the lazy prediction DataFrame
    assert type(res).__name__ == 'DataFrame'
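
# A minimal sketch, not part of the original tests: a by-hand check of the
# width of the "concat" merge used above, under the assumption that LSTM(4)
# returns only its last hidden state. Embedding(5, 10) on a length-1 input
# flattens to 10 units, Dense(2) contributes 2, LSTM(4) contributes 4, so
# the final Dense(2) sees a 16-wide vector.
def _sketch_merged_width(self):
    embedding_out = 1 * 10   # Flatten of the (1, 10) embedding output
    dense_out = 2            # Dense(2) on the shape-(2,) input
    lstm_out = 4             # last hidden state of LSTM(4)
    merged_width = embedding_out + dense_out + lstm_out
    assert merged_width == 16
    return merged_width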