Example #1
def test_multi_partition_shuffle():
    dat = [(1.0, Vectors.dense(np.random.normal(0, 1, 10)))
           for _ in range(0, 200)]
    dat2 = [(0.0, Vectors.dense(np.random.normal(2, 1, 10)))
            for _ in range(0, 200)]
    dat.extend(dat2)
    random.shuffle(dat)
    processed = spark.createDataFrame(dat, ["label", "features"])

    mg = build_graph(create_random_model)

    spark_model = SparkAsyncDL(inputCol='features',
                               tensorflowGraph=mg,
                               tfInput='x:0',
                               tfLabel='y:0',
                               tfOutput='outer/Sigmoid:0',
                               tfOptimizer='adam',
                               tfLearningRate=.1,
                               iters=20,
                               partitions=4,
                               predictionCol='predicted',
                               labelCol='label',
                               partitionShuffles=2)
    data = spark_model.fit(processed).transform(processed).take(10)
    nb_errors = 0
    for d in data:
        lab = d['label']
        predicted = d['predicted'][0]
        if predicted != lab:
            nb_errors += 1
    assert nb_errors < len(data)
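create_random_model is referenced here (and in several later examples) but never shown in these excerpts. A minimal sketch of what such a graph function might look like, inferred from the x:0/y:0 placeholders, the 10-dimensional feature vectors above, and the outer/Sigmoid:0 output tensor (the layer sizes and the loss are assumptions; assumes import tensorflow as tf, TF 1.x):

def create_random_model():
    # Placeholders matching tfInput='x:0' and tfLabel='y:0' above.
    x = tf.placeholder(tf.float32, shape=[None, 10], name='x')
    y = tf.placeholder(tf.float32, shape=[None, 1], name='y')
    hidden = tf.layers.dense(x, 20, activation=tf.nn.relu)
    # A dense layer named 'outer' with a sigmoid activation produces the
    # 'outer/Sigmoid:0' tensor requested as tfOutput.
    out = tf.layers.dense(hidden, 1, activation=tf.nn.sigmoid, name='outer')
    loss = tf.losses.mean_squared_error(y, out)
    return loss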
Example #2
def stage_result(data_train, message):
    """
	Input: Dataset
	Output: Pipeline model.
	"""
    #params_dict_LinearRegression = {'labelCol': 'price', 'featuresCol': '1_vector', 'predictionCol': 'prediction', 'aggregationDepth': 2, 'solver': 'auto', 'standardization': True, 'fitIntercept': True, 'elasticNetParam': 0, 'maxIter': 100, 'regParam': 0, 'tol': 1e-06, 'loss': 'squaredError', 'epsilon': 1.35}
    #modul_LinearRegression = LinearRegression(**params_dict_LinearRegression)

    import tensorflow as tf
    from sparkflow.pipeline_util import PysparkPipelineWrapper
    from sparkflow.graph_utils import build_graph
    from sparkflow.tensorflow_async import SparkAsyncDL

    mg = build_graph(graph_model)
    params_dict_DNN = {
        "inputCol": "features",  # SparkAsyncDL expects 'inputCol' (see the call below), not 'featuresCol'
    }
    params_dict_DNN["tensorflowGraph"] = mg
    modul_DNN = SparkAsyncDL(**params_dict_DNN)
    #modul_DNN = SparkAsyncDL(inputCol='features',
    #                         tensorflowGraph=mg,
    #                         tfInput='x:0',
    #                         tfLabel='y:0',
    #                         tfOutput='out:0',
    #                         tfLearningRate=.001,
    #                         iters=10,
    #                         predictionCol='predicted',
    #                         labelCol='labels',
    #                         verbose=1)

    #stages_pipeline = [modul_LinearRegression]
    stages_pipeline = [modul_DNN]

    pipeline = Pipeline(stages=stages_pipeline)
    model = pipeline.fit(data_train)

    prediction = model.transform(data_train)
    if message['output_col'] == 'all':
        data_prediction = prediction
    else:
        try:
            output_col = set(message['output_col'])
            prediction_col = set(prediction.columns)
            output_col = list(output_col.intersection(prediction_col))
            data_prediction = prediction.select(
                [col for col in prediction.columns if col in output_col])
        except Exception as er:
            raise er

    return model, data_prediction
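The dictionary above only sets the input column and the graph, so the fit would still be missing the tensor names. A possible fuller parameter dict, simply mirroring the values from the commented-out SparkAsyncDL call in this same function (a sketch, not the original code):

params_dict_DNN = {
    "inputCol": "features",
    "tensorflowGraph": mg,
    "tfInput": "x:0",
    "tfLabel": "y:0",
    "tfOutput": "out:0",
    "tfLearningRate": .001,
    "iters": 10,
    "predictionCol": "predicted",
    "labelCol": "labels",
    "verbose": 1,
}
modul_DNN = SparkAsyncDL(**params_dict_DNN)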
Example #3
def test_multi_partition_shuffle(self):
    processed = self.generate_random_data()
    mg = build_graph(SparkFlowTests.create_random_model)
    spark_model = SparkAsyncDL(inputCol='features',
                               tensorflowGraph=mg,
                               tfInput='x:0',
                               tfLabel='y:0',
                               tfOutput='outer/Sigmoid:0',
                               tfOptimizer='adam',
                               tfLearningRate=.1,
                               iters=20,
                               partitions=2,
                               predictionCol='predicted',
                               labelCol='label',
                               partitionShuffles=2)
    self.handle_assertions(spark_model, processed)
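generate_random_data is another helper the excerpts do not show. Example #1 builds the same kind of dataset inline, so a plausible sketch of the helper (the exact signature and sizes are assumptions; same numpy, random, and Vectors imports as Example #1):

def generate_random_data(self):
    # Two 10-dimensional Gaussian clusters with different means, labelled
    # 1.0 and 0.0, shuffled and wrapped in a (label, features) DataFrame --
    # the same construction Example #1 performs inline.
    dat = [(1.0, Vectors.dense(np.random.normal(0, 1, 10))) for _ in range(200)]
    dat.extend([(0.0, Vectors.dense(np.random.normal(2, 1, 10))) for _ in range(200)])
    random.shuffle(dat)
    return self.spark.createDataFrame(dat, ["label", "features"])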
Example #4
def test_overlapping_guassians():
    processed = generate_random_data()
    mg = build_graph(create_random_model)

    spark_model = SparkAsyncDL(inputCol='features',
                               tensorflowGraph=mg,
                               tfInput='x:0',
                               tfLabel='y:0',
                               tfOutput='outer/Sigmoid:0',
                               tfOptimizer='adam',
                               tfLearningRate=.1,
                               iters=35,
                               partitions=4,
                               predictionCol='predicted',
                               labelCol='label')
    handle_test(spark_model, processed)
Example #5
def test_auto_encoder(self):
    processed = self.generate_random_data()
    mg = build_graph(SparkFlowTests.create_autoencoder)
    spark_model = SparkAsyncDL(inputCol='features',
                               tensorflowGraph=mg,
                               tfInput='x:0',
                               tfLabel=None,
                               tfOutput='out/Sigmoid:0',
                               tfOptimizer='adam',
                               tfLearningRate=.001,
                               iters=10,
                               predictionCol='predicted',
                               partitions=4,
                               miniBatchSize=10,
                               verbose=1)
    encoded = spark_model.fit(processed).transform(processed).take(10)
    print(encoded[0]['predicted'])
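create_autoencoder is not shown either. Given tfLabel=None and the out/Sigmoid:0 output tensor, it presumably reconstructs its own input; a rough sketch under those assumptions (layer sizes invented):

def create_autoencoder():
    x = tf.placeholder(tf.float32, shape=[None, 10], name='x')
    encoded = tf.layers.dense(x, 4, activation=tf.nn.relu)
    # A dense layer named 'out' with a sigmoid activation yields the
    # 'out/Sigmoid:0' tensor requested as tfOutput.
    decoded = tf.layers.dense(encoded, 10, activation=tf.nn.sigmoid, name='out')
    # No label placeholder: with tfLabel=None the loss compares the
    # reconstruction against the input itself.
    loss = tf.losses.mean_squared_error(x, decoded)
    return loss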
Example #6
def stage_result():
    """
	Input: Dataset
	Output: Pipeline model.
	"""
    def graph_model():
        x = tf.placeholder(tf.float32, shape=[None, 784], name='x')
        y = tf.placeholder(tf.float32, shape=[None, 10], name='y')

        layer0 = tf.layers.dense(x, 128, activation=tf.nn.relu)
        layer1 = tf.layers.dense(layer0, 128, activation=tf.nn.relu)
        layer2 = tf.layers.dense(layer1, 64, activation=tf.nn.relu)
        layer3 = tf.layers.dense(layer2, 32, activation=tf.nn.relu)

        out = tf.layers.dense(layer3, 10)
        z = tf.argmax(out, 1, name='out')
        loss = tf.losses.softmax_cross_entropy(y, out)
        return loss

    params_dict_DNN = {'tfLearningRate': 0.001, 'iters': 10,
                       'predictionCol': 'predicted', 'labelCol': 'labels', 'verbose': 1}
    # Before fitting, inputCol plus the tfInput ('x:0'), tfLabel ('y:0') and
    # tfOutput ('out:0') tensor names from graph_model above would also be needed.

    params_dict_DNN["tensorflowGraph"] = build_graph(graph_model)
    print("tensorflowGraph type:", type(params_dict_DNN["tensorflowGraph"]))
    modul_DNN = SparkAsyncDL(**params_dict_DNN)

    stages_pipeline = [modul_DNN]

    pipeline = Pipeline(stages=stages_pipeline)
    #model = pipeline.fit(data_train)

    #prediction = model.transform(data_train)
    #if message['output_col'] == 'all':
    #    data_prediction = prediction
    #else:
    #    try:
    #        output_col = set(message['output_col'])
    #        prediction_col = set(prediction.columns)
    #        output_col = list(output_col.intersection(prediction_col))
    #        data_prediction = prediction.select([col for col in prediction.columns if col in output_col])
    #    except Exception as er:
    #        raise er

    return pipeline  #model, data_prediction
Example #7
def test_rmsprop():
    processed = generate_random_data()
    mg = build_graph(create_random_model)
    options = build_rmsprop_config(learning_rate=0.1, decay=0.95, momentum=0.1, centered=False)
    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='outer/Sigmoid:0',
        tfOptimizer='rmsprop',
        tfLearningRate=.1,
        iters=25,
        partitions=4,
        predictionCol='predicted',
        labelCol='label',
        optimizerOptions=options
    )
    handle_assertions(spark_model, processed)
Example #8
def test_adam_optimizer_options():
    processed = generate_random_data()
    mg = build_graph(create_random_model)
    options = build_adam_config(learning_rate=0.1, beta1=0.85, beta2=0.98, epsilon=1e-8)
    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='outer/Sigmoid:0',
        tfOptimizer='adam',
        tfLearningRate=.1,
        iters=25,
        partitions=4,
        predictionCol='predicted',
        labelCol='label',
        optimizerOptions=options
    )
    handle_assertions(spark_model, processed)
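Examples #7 and #8 call build_rmsprop_config and build_adam_config without importing them. In sparkflow these config builders live alongside build_graph; a likely import block (build_adam_config appears in sparkflow's README, and build_rmsprop_config is assumed to come from the same module) would be:

from sparkflow.graph_utils import build_graph, build_adam_config, build_rmsprop_config
from sparkflow.tensorflow_async import SparkAsyncDL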
Example #9
def test_save_model(self):
    processed = self.generate_random_data()
    mg = build_graph(SparkFlowTests.create_random_model)
    spark_model = SparkAsyncDL(inputCol='features',
                               tensorflowGraph=mg,
                               tfInput='x:0',
                               tfLabel='y:0',
                               tfOutput='outer/Sigmoid:0',
                               tfOptimizer='adam',
                               tfLearningRate=.1,
                               iters=20,
                               partitions=2,
                               predictionCol='predicted',
                               labelCol='label')
    fitted = spark_model.fit(processed)
    fitted.save('saved_model')
    model = SparkAsyncDLModel.load("saved_model")
    data = model.transform(processed).take(10)
    nb_errors = SparkFlowTests.calculate_errors(data)
    self.assertTrue(nb_errors < len(data))
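SparkFlowTests.calculate_errors is not included in these excerpts. Example #1 performs the same label-versus-prediction count inline, so a sketch of the helper (assumed to be a static method on the test class) might be:

@staticmethod
def calculate_errors(data):
    # Count rows whose first predicted value differs from the label,
    # mirroring the inline loop in Example #1.
    nb_errors = 0
    for d in data:
        if d['predicted'][0] != d['label']:
            nb_errors += 1
    return nb_errors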
Example #10
    def test_small_sparse(self):
        xor = [(0.0, Vectors.sparse(2, [0, 1], [0.0, 0.0])),
               (0.0, Vectors.sparse(2, [0, 1], [1.0, 1.0])),
               (1.0, Vectors.sparse(2, [0], [1.0])),
               (1.0, Vectors.sparse(2, [1], [1.0]))]
        processed = self.spark.createDataFrame(xor, ["label", "features"])

        mg = build_graph(SparkFlowTests.create_model)
        spark_model = SparkAsyncDL(inputCol='features',
                                   tensorflowGraph=mg,
                                   tfInput='x:0',
                                   tfLabel='y:0',
                                   tfOutput='outer/Sigmoid:0',
                                   tfOptimizer='adam',
                                   tfLearningRate=.1,
                                   iters=35,
                                   partitions=2,
                                   predictionCol='predicted',
                                   labelCol='label')
        assert spark_model.fit(processed).transform(
            processed).collect() is not None
Example #11
def test_save_pipeline(self):
    processed = self.generate_random_data()
    mg = build_graph(SparkFlowTests.create_random_model)
    spark_model = SparkAsyncDL(inputCol='features',
                               tensorflowGraph=mg,
                               tfInput='x:0',
                               tfLabel='y:0',
                               tfOutput='outer/Sigmoid:0',
                               tfOptimizer='adam',
                               tfLearningRate=.1,
                               iters=20,
                               partitions=2,
                               predictionCol='predicted',
                               labelCol='label')
    p = Pipeline(stages=[spark_model]).fit(processed)
    p.write().overwrite().save('example_pipeline')
    p = PysparkPipelineWrapper.unwrap(
        PipelineModel.load('example_pipeline'))
    data = p.transform(processed).take(10)
    nb_errors = SparkFlowTests.calculate_errors(data)
    self.assertTrue(nb_errors < len(data))
Example #12
    final_schema.append(StructField('result', IntegerType(), True))
    final_schema = StructType(final_schema)

    final_rdd = sc.parallelize(process_dic)
    final_df = sqlContext.createDataFrame(final_rdd, final_schema)

    print('== preprocess finished, final_df created ==')

    # create spark session and train with final_df
    spark = SparkSession.builder \
            .appName(task+'flow') \
            .getOrCreate()

    # sc.stop() ## stop?

    mg = build_graph(small_model)
    # Assemble the feature columns and one-hot encode the label
    va = VectorAssembler(inputCols=final_df.columns[1:151],
                         outputCol='features')
    encoded = OneHotEncoder(inputCol='result',
                            outputCol='labels',
                            dropLast=False)
    adam_config = build_adam_config(learning_rate=0.001,
                                    beta1=0.9,
                                    beta2=0.999)

    spark_model = SparkAsyncDL(inputCol='features',
                               tensorflowGraph=mg,
                               tfInput='x:0',
                               tfLabel='y:0',
                               tfOutput='out:0',
Example #13
def cnn_model():
    x = tf.placeholder(tf.float32, shape=[None, 784], name='x')
    y = tf.placeholder(tf.float32, shape=[None, 10], name='y')
    x = tf.reshape(x, shape=[-1, 28, 28, 1])
    conv1 = tf.layers.conv2d(x, 32, 5, activation=tf.nn.relu)
    conv1 = tf.layers.max_pooling2d(conv1, 2, 2)
    conv2 = tf.layers.conv2d(conv1, 64, 3, activation=tf.nn.relu)
    conv2 = tf.layers.max_pooling2d(conv2, 2, 2)
    fc1 = tf.layers.flatten(conv2)
    out = tf.layers.dense(fc1, 10)
    z = tf.argmax(out, 1, name='out')
    loss = tf.losses.softmax_cross_entropy(y, out)
    return loss

# Build the graph
mg = build_graph(cnn_model)

spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='out:0',
        tfOptimizer='adam',
        miniBatchSize=300,
        miniStochasticIters=-1,
        shufflePerIter=True,
        iters=10,
        tfLearningRate=.001,
        predictionCol='predicted',
        labelCol='labels',