def test_overlapping_guassians():
    """Train on two overlapping 10-d Gaussian clusters and sanity-check the fit.

    200 positive samples are drawn around mean 0 and 200 negatives around
    mean 2; the assertion only requires the model to beat "everything wrong"
    on the first 10 rows.

    NOTE(review): the function name has a typo ("guassians") but is kept
    unchanged so external test selection by name still works.
    """
    dat = [(1.0, Vectors.dense(np.random.normal(0, 1, 10))) for _ in range(200)]
    dat2 = [(0.0, Vectors.dense(np.random.normal(2, 1, 10))) for _ in range(200)]
    dat.extend(dat2)
    random.shuffle(dat)
    processed = spark.createDataFrame(dat, ["label", "features"])

    # Build the graph explicitly (rather than via build_graph) to exercise
    # the raw MetaGraph-export path.
    first_graph = tf.Graph()
    with first_graph.as_default():  # unused `as g` alias removed
        # The call builds ops into first_graph as a side effect; its return
        # value was never used, so the `v = ...` binding is dropped.
        create_random_model()
        mg = json_format.MessageToJson(tf.train.export_meta_graph())

    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='outer/Sigmoid:0',
        tfOptimizer='adam',
        tfLearningRate=.1,
        iters=35,
        partitions=4,
        predictionCol='predicted',
        labelCol='label'
    )
    data = spark_model.fit(processed).transform(processed).take(10)

    # Count mismatches between the (first element of the) prediction vector
    # and the label; the bar is intentionally low.
    nb_errors = sum(1 for d in data if d['predicted'][0] != d['label'])
    assert nb_errors < len(data)
def test_multi_partition_shuffle():
    """Same overlapping-Gaussian setup, but exercising partitionShuffles=2."""
    positives = [(1.0, Vectors.dense(np.random.normal(0, 1, 10))) for _ in range(0, 200)]
    negatives = [(0.0, Vectors.dense(np.random.normal(2, 1, 10))) for _ in range(0, 200)]
    samples = positives + negatives
    random.shuffle(samples)
    frame = spark.createDataFrame(samples, ["label", "features"])

    graph_json = build_graph(create_random_model)
    estimator = SparkAsyncDL(
        inputCol='features', tensorflowGraph=graph_json, tfInput='x:0',
        tfLabel='y:0', tfOutput='outer/Sigmoid:0', tfOptimizer='adam',
        tfLearningRate=.1, iters=20, partitions=4,
        predictionCol='predicted', labelCol='label', partitionShuffles=2
    )
    rows = estimator.fit(frame).transform(frame).take(10)

    # Tally prediction/label mismatches; require at least one correct row.
    wrong = 0
    for row in rows:
        if row['predicted'][0] != row['label']:
            wrong += 1
    assert wrong < len(rows)
def test_auto_encoder(self):
    """Fit an autoencoder (tfLabel=None) and print one reconstruction."""
    frame = self.generate_random_data()
    graph_json = build_graph(SparkFlowTests.create_autoencoder)
    estimator = SparkAsyncDL(
        inputCol='features', tensorflowGraph=graph_json, tfInput='x:0',
        tfLabel=None, tfOutput='out/Sigmoid:0', tfOptimizer='adam',
        tfLearningRate=.001, iters=10, predictionCol='predicted',
        partitions=4, miniBatchSize=10, verbose=1
    )
    reconstructed = estimator.fit(frame).transform(frame).take(10)
    print(reconstructed[0]['predicted'])
def test_save_model(self):
    """Round-trip a fitted model through save/load and check it still predicts."""
    frame = self.generate_random_data()
    graph_json = build_graph(SparkFlowTests.create_random_model)
    estimator = SparkAsyncDL(
        inputCol='features', tensorflowGraph=graph_json, tfInput='x:0',
        tfLabel='y:0', tfOutput='outer/Sigmoid:0', tfOptimizer='adam',
        tfLearningRate=.1, iters=20, partitions=2,
        predictionCol='predicted', labelCol='label'
    )

    # Persist, reload, and score with the reloaded model.
    estimator.fit(frame).save('saved_model')
    reloaded = SparkAsyncDLModel.load("saved_model")
    scored = reloaded.transform(frame).take(10)
    self.assertTrue(SparkFlowTests.calculate_errors(scored) < len(scored))
def test_small_sparse(self):
    """Train on a tiny XOR-style dataset built from sparse vectors."""
    xor_rows = [
        (0.0, Vectors.sparse(2, [0, 1], [0.0, 0.0])),
        (0.0, Vectors.sparse(2, [0, 1], [1.0, 1.0])),
        (1.0, Vectors.sparse(2, [0], [1.0])),
        (1.0, Vectors.sparse(2, [1], [1.0])),
    ]
    frame = self.spark.createDataFrame(xor_rows, ["label", "features"])

    graph_json = build_graph(SparkFlowTests.create_model)
    estimator = SparkAsyncDL(
        inputCol='features', tensorflowGraph=graph_json, tfInput='x:0',
        tfLabel='y:0', tfOutput='outer/Sigmoid:0', tfOptimizer='adam',
        tfLearningRate=.1, iters=35, partitions=2,
        predictionCol='predicted', labelCol='label'
    )

    scored = estimator.fit(frame).transform(frame).collect()
    assert scored is not None
# Load MNIST rows in a random order; column _c0 is the digit label.
df = spark.read.option("inferSchema", "true").csv('mnist_train.csv').orderBy(rand())

mg = build_graph(small_model)
adam_config = build_adam_config(learning_rate=0.001, beta1=0.9, beta2=0.999)

# Assemble the 784 pixel columns into a single vector, then one-hot the label.
va = VectorAssembler(inputCols=df.columns[1:785], outputCol='features').transform(df)
encoder = OneHotEncoder(inputCol='_c0', outputCol='labels', dropLast=False)
encoded = encoder.transform(va).select(['features', 'labels'])

# demonstration of options. Not all are required
spark_model = SparkAsyncDL(
    inputCol='features',
    tensorflowGraph=mg,
    tfInput='x:0',
    tfLabel='y:0',
    tfOutput='out:0',
    tfOptimizer='adam',
    miniBatchSize=300,
    miniStochasticIters=-1,
    shufflePerIter=True,
    iters=20,
    predictionCol='predicted',
    labelCol='labels',
    partitions=4,
    verbose=1,
    optimizerOptions=adam_config
)

# Fit, persist, reload, and score a small sample.
spark_model.fit(encoded).save('simple_dnn')
x = SparkAsyncDLModel.load("simple_dnn").transform(encoded).take(10)
print(x)
tfLabel='y:0',
tfOutput='out/Sigmoid:0',
tfLearningRate=0.001,  # learning rate set to 0.001
iters=50,  # train for 50 iterations
predictionCol='probability',
miniBatchSize=200,  # 200 rows per mini-batch
verbose=1,
tfOptimizer='adam'  # use the Adam optimizer
)

# Fit the model: transform training data through the feature pipeline first.
pipeline = Pipeline(stages=[featuresCreator])
data_transformer = pipeline.fit(data_train)
ANN_model = spark_model.fit(
    data_transformer \
    .transform(data_train)
)

# Score the test set with the same fitted feature transformer.
prediction = ANN_model.transform( \
    data_transformer \
    .transform(data_test)
)
results = prediction.select('id', 'label', 'probability')

from pyspark.sql.functions import col, when
# Threshold the probability at 0.5 to produce a hard 0/1 prediction.
# NOTE(review): this assumes 'probability' is a scalar column; if the model
# emits a vector per row, the first element would be needed — confirm
# against the model's output format.
prediction = when(col('probability') > 0.5, 1.0).otherwise(0.0)
results = results.withColumn('prediction', prediction)

# Show prediction results (first 10 rows)
results.show(10)
# Local Spark session for the autoencoder example.
spark = SparkSession.builder \
    .appName("examples") \
    .master('local[8]').config('spark.driver.memory', '4g') \
    .getOrCreate()

# Load MNIST in random order; only the pixel columns are used (no label —
# the autoencoder reconstructs its own input).
df = spark.read.option("inferSchema", "true").csv('mnist_train.csv').orderBy(rand())
mg = build_graph(small_model)

# Assemble pixels into one vector, then L1-normalize it into 'features'.
va = VectorAssembler(inputCols=df.columns[1:785], outputCol='feats').transform(df).select(['feats'])
na = Normalizer(inputCol='feats', outputCol='features', p=1.0).transform(va).select(['features'])

# demonstration of options. Not all are required
spark_model = SparkAsyncDL(
    inputCol='features',
    tensorflowGraph=mg,
    tfInput='x:0',
    tfLabel=None,
    tfOutput='out:0',
    tfOptimizer='adam',
    tfLearningRate=.001,
    iters=10,
    predictionCol='predicted',
    partitions=3,
    miniBatchSize=256,
    verbose=1
)

# BUG FIX: the estimator was previously fit twice — once via `.fit(na)`
# chained onto the constructor (so spark_model was already a fitted model)
# and again on the next line, which would fail on the fitted transformer
# and in any case duplicate the training work. Construct once, fit once.
spark_model.fit(na).save('auto_encoded')
x = SparkAsyncDLModel.load("auto_encoded").transform(na).take(10)
print(x)