示例#1
0
def udfTest():
    """
    Train a sklearn LinearRegression on a small synthetic Spark DataFrame and
    apply it row-wise through a Spark UDF.

    Builds a 3-user x 5-point dataset with features [i, i^2, i^3] and target
    y = user + i + 2*i^2 + 3*i^3, fits the model on the driver, then wraps
    model.predict in a UDF and shows the per-row predictions.
    """
    spark = createLocalSparkSession()

    def sklmodelPredict(model):
        # Close over the fitted model so each row's feature vector can be
        # scored inside the UDF.
        def f(vec):
            # vec is a pyspark ml Vector; sklearn expects a 2-D
            # (1, n_features) array for a single sample.
            p = model.predict(np.array(vec.values.data).reshape(1, -1))
            return int(p)

        return f

    df = spark.createDataFrame([(user, Vectors.dense([i, i**2, i**3]),
                                 0.0 + user + i + 2 * i**2 + 3 * i**3)
                                for user in range(3) for i in range(5)])
    df = df.toDF("key", "features", "y")
    # Collect to the driver for local training.  Named `pdf` (pandas
    # DataFrame) to avoid shadowing the conventional `pd` pandas alias.
    pdf = df.select('features', 'y').toPandas()
    dataX = np.vstack(pdf['features'].apply(lambda v: v.toArray()))
    dataY = pdf['y'].values.reshape(-1, 1)
    model = linear_model.LinearRegression()
    model.fit(dataX, dataY)

    ufun = udf(sklmodelPredict(model))

    df.withColumn("pred", ufun("features")).show()
示例#2
0
def main():
    """
        A simple pyspark + tensorflow + mnist local distributed test.

        Uses an index restful api to provide a distinct task index to each
        executor, and every executor initializes both a ps and a worker.

        A FIFO queue is used to control the parameter servers: after a ps
        initializes, it runs a dequeue operation, which blocks the process
        until an element is enqueued.  When each worker finishes its work it
        enqueues a 1 into the queue, which lets the ps's dequeue execute.
        The ps processes are therefore released once all workers are done.
    :return:
    """
    # TODO: 1. Cluster spec provider. 2. How to decide whether executor should init ps or not
    # spark = SparkSession.builder.appName("test").master("local").getOrCreate()
    # conf = SparkConf().setAppName("test").setMaster("spark://namenode01:7077") \
    #     .set("spark.shuffle.service.enabled", "false").set("spark.dynamicAllocation.enabled", "false")
        #.set('spark.executor.instances', '1').set('spark.executor.cores', '1')\
        #.set('spark.executor.memory', '512m').set('spark.driver.memory', '512m')\
    conf = SparkConf().setAppName("test").setMaster("local[2]") \
        .set("spark.shuffle.service.enabled", "false").set("spark.dynamicAllocation.enabled", "false")

    sc = SparkContext(conf=conf)
    dataX, dataY = getMnist()
    # One record per sample: features and label stacked horizontally, so each
    # RDD element is a single (1, n_features + n_labels) row.
    data = np.split(np.hstack([dataX, dataY]), dataX.shape[0], axis=0)

    rdd = sc.parallelize(data, 2)
    # tff wraps the training fn with the cluster/server and logger factories;
    # each partition runs the distributed TF job.
    rdd.foreachPartition(tff(lrmodel, getClusterAndServer, getLogger))
    print("_________ end _________")
示例#3
0
def simpleLR():
    """
    Baseline: fit a plain sklearn LogisticRegression on MNIST and print how
    many training samples it classifies correctly.
    """
    features, labels = getMnist()
    # Collapse one-hot label rows into integer class ids.
    labels = np.argmax(labels, axis=1)

    classifier = linear_model.LogisticRegression()
    classifier.fit(features, labels)
    predictions = classifier.predict(features)
    print(np.sum(predictions == labels))
示例#4
0
def dynamicrnn():
    """
    Demonstrate that a single dynamic_rnn graph can be fed sequences of
    different lengths: the same logits op is evaluated on a 50-step and a
    40-step batch built from the same MNIST data.

    :return:
    """
    dataX, dataY = getMnist()

    # Shuffle the samples so the fake sequences mix digits.
    index = np.arange(dataX.shape[0])
    np.random.shuffle(index)
    dataX = dataX[index]
    dataY = dataY[index]

    t1 = 50
    t2 = 40
    steps1 = dataX.shape[0] // t1
    steps2 = dataX.shape[0] // t2

    # Same data re-cut into sequences of two different lengths.
    fake_seq_X1 = mkFakeSeq(dataX, steps1, t1)
    fake_seq_X2 = mkFakeSeq(dataX, steps2, t2)

    outputSize = 10
    numEmbedding = 128
    # [batch, time, features]; the time dimension is left dynamic (None).
    seqX = tf.placeholder(dtype=tf.float32, shape=[None, None, 400])
    # Projection from the LSTM's numEmbedding-wide outputs to outputSize
    # class logits.  (Previously the dimensions were transposed —
    # [outputSize, numEmbedding] with a [numEmbedding] bias and a reshape to
    # [-1, outputSize] — which does not match the RNN output width.)
    softmax_w = tf.Variable(
        tf.random_uniform([numEmbedding, outputSize],
                          minval=-INIT_VAL,
                          maxval=INIT_VAL))
    softmax_b = tf.Variable(tf.constant(0.0, shape=[outputSize]))

    dysteps = tf.placeholder(dtype=tf.int32, shape=[1])
    cell = LSTMCell(numEmbedding)
    outputs, states = tf.nn.dynamic_rnn(cell, seqX, dtype=tf.float32)

    # Split along the time axis into `dysteps` slices, project each slice's
    # LSTM outputs to logits, then stack them back into one tensor.
    outputsList = tf.split(outputs, dysteps, 1)
    logits = tf.stack([
        tf.matmul(tf.reshape(t, [-1, numEmbedding]), softmax_w) + softmax_b
        for t in outputsList
    ])

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        l1 = sess.run(logits,
                      feed_dict={
                          seqX: fake_seq_X1,
                          dysteps: np.array([t1])
                      })
        l2 = sess.run(logits,
                      feed_dict={
                          seqX: fake_seq_X2,
                          dysteps: np.array([t2])
                      })
        print(l1)
        print(l2)
示例#5
0
def tfeTestTrain():
    """
        Test how to use tfe (eager execution) with tf.layers: wrap the MNIST
        images in a tf.Variable and run them through cnnModel.
    :return:
    """
    dataX, dataY = getMnist()

    # Reshape flat 400-d rows into NHWC image tensors (20x20, 1 channel).
    dataX = dataX.reshape([-1, 20, 20, 1])
    # Renamed from `input`, which shadowed the builtin input().
    images = tf.Variable(initial_value=dataX, dtype=tf.float32)
    cnn = cnnModel(images)
    print(cnn)
示例#6
0
def getdataset():
    """
        Test tf.data: build a shuffled, batched Dataset over the MNIST arrays.
    :return: a tf.data.Dataset of {'x', 'y'} dicts, shuffled with a
        1000-element buffer and batched by 128
    """
    dataX, dataY = getMnist()
    dataset = tf.data.Dataset.from_tensor_slices({'x': dataX, 'y': dataY})
    dataset = dataset.shuffle(buffer_size=1000).batch(128)
    # The one-shot iterator and get_next() op built here previously were
    # never used; callers can create their own iterator from the returned
    # dataset.

    return dataset
示例#7
0
def sparklearnTensorflowGridSearch():
    """
        Wrap tensorflow so that it can be used in spark-sklearn GridsearchCV.

        Fits DisLRModel over a small lr/iters grid with 2-fold CV, with the
        candidate fits distributed over the local Spark cluster.
    :return:
    """
    conf = SparkConf().setAppName("test").setMaster("local[2]") \
        .set("spark.shuffle.service.enabled", "false").set("spark.dynamicAllocation.enabled", "false")

    sc = SparkContext(conf=conf)
    try:
        dataX, dataY = getMnist()
        # One-hot labels -> integer class ids for the sklearn-style estimator.
        dataY = np.argmax(dataY, axis=1)

        tuned_parameters = {'lr': [1e-1, 1e-2], 'iters': [10, 20]}
        model = DisLRModel(400, 10, 0.01, 10)

        clf = spGridSearchCV(sc, model, tuned_parameters, cv=2)
        clf.fit(dataX, dataY)
    finally:
        # Release the SparkContext even if the search fails; previously it
        # was never stopped (the other entry points in this file do stop it).
        sc.stop()
示例#8
0
def main():
    """
        A simple distributed multi-model execution.

        The input rdd must be shuffled to avoid label imbalance across
        partitions.  The simple method used here was not confirmed to produce
        balanced partitions, so it is important for the user to do data
        sampling/shuffling before running the algorithm.
    :return:
    """
    # TODO: 1. Cluster spec provider. 2. How to decide whether executor should init ps or not
    conf = SparkConf().setAppName("test").setMaster("local[2]") \
        .set("spark.shuffle.service.enabled", "false").set("spark.dynamicAllocation.enabled", "false")

    sc = SparkContext(conf=conf)
    dataX, dataY = getMnist()

    # Optional pre-shuffle of the samples (currently disabled).  The
    # previously unconditional `index = np.arange(...)` was dead work since
    # its consumers are commented out.
    # index = np.arange(dataX.shape[0])
    # np.random.shuffle(index)
    # dataX = dataX[index]
    # dataY = dataY[index]

    # One record per sample: features and one-hot label stacked horizontally.
    data = np.split(np.hstack([dataX, dataY]), dataX.shape[0], axis=0)

    rdd = sc.parallelize(data, 2)
    rdd2 = sc.parallelize(dataX, 2)

    sklmodel = RDDLRSklModel(getLogger)
    sklmodel.fit(rdd)
    result = sklmodel.predict(sc, rdd2)

    # Compare the distributed predictions against the true class ids.
    dataY = np.argmax(dataY, axis=1)
    print(result)
    print(dataY)
    print(np.sum(result == dataY))
    sc.stop()
示例#9
0
def tensorflowGridSearch():
    """
        Wrap tensorflow so that it can be used in sklearn GridsearchCV.

        Runs a 2-fold grid search over learning rate and iteration count for
        DisLRModel, then checks the model survives cloudpickle serialization
        (a requirement for shipping it to executors).
    :return:
    """
    dataX, dataY = getMnist()
    dataX = dataX.astype(np.float32)
    # One-hot labels -> integer class ids, as GridSearchCV expects.
    dataY = np.argmax(dataY, axis=1).astype(np.int32)

    tuned_parameters = [{'lr': [1e-1, 1e-2], 'iters': [10, 20]}]

    model = DisLRModel(400, 10, 0.01, 10)
    # scoring was '%s_macro' % "precision" with an unused `scores` list;
    # constant-folded to the same literal value.
    clf = GridSearchCV(model,
                       param_grid=tuned_parameters,
                       cv=2,
                       scoring='precision_macro')
    clf.fit(dataX, dataY)

    # test whether the model could be serialized
    cp = CloudPickleSerializer()
    cp.dumps(model)