Example #1
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit


def pysparkLR():
    """Tune a RandomForestClassifier with TrainValidationSplit."""
    spark = createLocalSparkSession()
    df = getDatasetMinist(spark)  # project-local helper that loads the MNIST dataset

    train, test = df.randomSplit([0.9, 0.1], seed=12345)

    rf = RandomForestClassifier()

    paramGrid = ParamGridBuilder() \
        .addGrid(rf.maxDepth, [4, 5]) \
        .addGrid(rf.numTrees, [10, 20]) \
        .build()

    tvs = TrainValidationSplit(estimator=rf,
                               estimatorParamMaps=paramGrid,
                               evaluator=MulticlassClassificationEvaluator(),
                               # 80% of the data will be used for training, 20% for validation.
                               trainRatio=0.8)

    model = tvs.fit(train)

    # Make predictions on test data; model holds the parameter combination
    # that performed best on the validation split.
    model.transform(test) \
        .select("features", "label", "prediction") \
        .show(500)
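
# TrainValidationSplitModel also exposes the per-candidate validation metrics and
# the winning model, which helps decide whether the grid needs widening. A minimal
# follow-up sketch, assuming `model` and `paramGrid` from pysparkLR are in scope:
for params, metric in zip(paramGrid, model.validationMetrics):
    print({p.name: v for p, v in params.items()}, "->", metric)
print(model.bestModel)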
Example #2
from pyspark.ml.classification import DecisionTreeClassifier


def gbtr():
    spark = createLocalSparkSession()
    df = getDatasetMinist(spark)
    clf = DecisionTreeClassifier()
    model = clf.fit(df)
    # Persist the fitted model, overwriting any previous save at the same path.
    model.write().overwrite().save('file:///data/mapleleaf/work/algorithm/model/DecisionTreeClassifier_node_DecisionTreeClassifier_76017b.None/model')
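
# The saved model can be restored later with the matching model class; a minimal
# sketch reusing the path from gbtr above (assumes gbtr's df is still in scope):
from pyspark.ml.classification import DecisionTreeClassificationModel

restored = DecisionTreeClassificationModel.load(
    'file:///data/mapleleaf/work/algorithm/model/DecisionTreeClassifier_node_DecisionTreeClassifier_76017b.None/model')
restored.transform(df).select("prediction").show(5)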
Example #3
import numpy as np
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from sklearn import linear_model


def udfTest():
    spark = createLocalSparkSession()

    def sklmodelPredict(model):
        # Wrap a fitted scikit-learn model in a row-at-a-time prediction function.
        def f(vec):
            p = model.predict(vec.toArray().reshape(1, -1))
            return int(p.item())

        return f

    df = spark.createDataFrame([(user, Vectors.dense([i, i**2, i**3]),
                                 0.0 + user + i + 2 * i**2 + 3 * i**3)
                                for user in range(3) for i in range(5)])
    df = df.toDF("key", "features", "y")

    # Collect features and target to the driver and fit a scikit-learn model there.
    pdf = df.select('features', 'y').toPandas()
    dataX = np.vstack(pdf['features'].apply(lambda v: v.toArray()))
    dataY = pdf['y'].values.reshape(-1, 1)
    model = linear_model.LinearRegression()
    model.fit(dataX, dataY)

    # Declare the UDF's return type explicitly; the default would be string.
    ufun = udf(sklmodelPredict(model), IntegerType())

    df.withColumn("pred", ufun("features")).show()
Example #4
def testLr():
    spark = createLocalSparkSession()
    df = getDatasetMinist(spark)
    train, test = df.randomSplit([0.9, 0.1], seed=12345)

    lr = TFNeuralNetwork()
    # The extra argument to fit() is a ParamMap; its keys must be Param objects
    # (lr.lr, lr.maxIter), not bare values.
    model = lr.fit(train, {lr.lr: 0.01, lr.maxIter: 10})
    pred = model.transform(test)
    pred.show()
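
# pyspark estimators also accept a list of ParamMaps and return one fitted model
# per map. A sketch reusing `lr`, `train`, and `test` from testLr, assuming
# TFNeuralNetwork follows the standard pyspark Estimator contract:
paramMaps = [{lr.lr: 0.1, lr.maxIter: 10}, {lr.lr: 0.01, lr.maxIter: 10}]
models = lr.fit(train, paramMaps)  # returns a list, one model per ParamMap
for m in models:
    m.transform(test).show(5)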
Example #5
import numpy as np
from pyspark.sql import Row
from pyspark.sql.types import LongType


def dfTry():
    spark = createLocalSparkSession()
    df = getDatasetMinist(spark)

    # zip() requires both RDDs to have the same number of partitions and the same
    # number of elements per partition; the 5000 assumes df has exactly 5000 rows.
    rdd1 = spark.sparkContext.parallelize(np.arange(5000).tolist())

    # Row + Row falls back to tuple concatenation, which is fine because an
    # explicit schema is supplied below.
    rdd2 = df.rdd.zip(rdd1).map(lambda d_r: d_r[0] + Row(pred=d_r[1]))

    df2 = spark.createDataFrame(rdd2, df.schema.add("pred", LongType()))
    df2.show()
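
# When the two datasets do not share a partition layout, zip() raises; a positional
# join via zipWithIndex() is a slower but safer alternative. A sketch assuming
# `spark`, `df`, `rdd1`, and the LongType import from dfTry are in scope:
left = df.rdd.zipWithIndex().map(lambda r: (r[1], r[0]))
right = rdd1.zipWithIndex().map(lambda r: (r[1], r[0]))
merged = left.join(right).map(lambda kv: kv[1][0] + (kv[1][1],))
df2 = spark.createDataFrame(merged, df.schema.add("pred", LongType()))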
Example #6
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit


def testCvWithLr():
    spark = createLocalSparkSession()
    df = getDatasetMinist(spark)
    train, test = df.randomSplit([0.9, 0.1], seed=12345)

    # Despite the function name, this uses TrainValidationSplit (a single split),
    # not k-fold CrossValidator.
    lr = TFNeuralNetwork()
    paramGrid = ParamGridBuilder() \
        .addGrid(lr.lr, [0.1, 0.01]) \
        .addGrid(lr.maxIter, [10]) \
        .build()

    tvs = TrainValidationSplit(
        estimator=lr,
        estimatorParamMaps=paramGrid,
        evaluator=MulticlassClassificationEvaluator(),
        # 80% of the data will be used for training, 20% for validation.
        trainRatio=0.8)

    model = tvs.fit(train)
    pred = model.transform(test)
    pred.show()
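
# If genuine k-fold cross-validation is wanted (as the function name suggests),
# CrossValidator is a drop-in replacement for TrainValidationSplit. A sketch
# reusing `lr`, `paramGrid`, `train`, and `test` from testCvWithLr:
from pyspark.ml.tuning import CrossValidator

cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid,
                    evaluator=MulticlassClassificationEvaluator(),
                    numFolds=3)  # each candidate is scored on 3 train/validation folds
cvModel = cv.fit(train)
cvModel.transform(test).show()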
Example #7
"""
调参
Run with:
  bin/spark-submit --py-files='/Users/t/python/spark-learning/src/utils.zip' \
  /Users/t/python/spark-learning/src/ml/model_selection/grid_search_cv.py
"""
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
# spark_sklearn's GridSearchCV distributes the parameter grid over the cluster
# while keeping the sklearn.model_selection.GridSearchCV interface.
from spark_sklearn.util import createLocalSparkSession
from spark_sklearn.grid_search import GridSearchCV

digits = datasets.load_digits()
X, y = digits.data, digits.target

sc = createLocalSparkSession().sparkContext
param_grid = {
    "max_depth": [3, None],
    "max_features": [1, 3, 10],
    "min_samples_split": [0.1, 0.2, 0.3],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
    "n_estimators": [10, 20, 40, 80]
}

gs = GridSearchCV(sc, RandomForestClassifier(), param_grid=param_grid)
gs.fit(X, y)

# Retrieve the best parameter combination found by the grid search.
best_params_ = gs.best_params_
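
# Since the spark_sklearn wrapper keeps sklearn's result attributes, the
# cross-validated score and the refitted winner should also be available
# (assuming the default refit=True):
print("best CV score:", gs.best_score_)
print("best params:", best_params_)
print(gs.best_estimator_.predict(X[:10]))  # refit on the full data with the best params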
Example #8
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 11 01:43:41 2018

@author: Mujirin
email: [email protected]
"""
from pyspark.ml.linalg import Vectors
from spark_sklearn.util import createLocalSparkSession
from pyspark.ml.clustering import GaussianMixture
from pyspark.ml.clustering import GaussianMixtureModel
spark = createLocalSparkSession()


def hiperAdapter(hiperparameter):
    '''
    Fill in any settings missing from a user-supplied
    config with their default values.
    '''
    hiperparameter_default = {
        "featuresCol": "features",
        "predictionCol": "prediction",
        "k": 2,
        "probabilityCol": "probability",
        "tol": 0.01,
        "maxIter": 100,
        "seed": None
    }
    hiperparameter_keys = list(hiperparameter.keys())
    # Assumed completion (the source snippet is truncated here): copy the
    # default for every key the caller did not supply.
    for key in hiperparameter_default:
        if key not in hiperparameter_keys:
            hiperparameter[key] = hiperparameter_default[key]
    return hiperparameter
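
# Hypothetical usage of the adapter above, expanding the completed config into
# GaussianMixture (Vectors and spark come from this example's own imports):
config = hiperAdapter({"k": 2, "maxIter": 50})  # unspecified keys fall back to defaults
config = {key: v for key, v in config.items() if v is not None}  # drop the unset seed
gm = GaussianMixture(**config)

df = spark.createDataFrame([(Vectors.dense([0.0, 0.1]),),
                            (Vectors.dense([0.1, 0.0]),),
                            (Vectors.dense([9.0, 9.1]),),
                            (Vectors.dense([9.1, 9.0]),)], ["features"])
model = gm.fit(df)
model.gaussiansDF.show()  # one row per fitted mixture component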
Example #9
from pyspark.ml.classification import DecisionTreeClassificationModel

def load():
    # An active SparkSession must exist before a persisted model can be loaded.
    spark = createLocalSparkSession()
    obj = DecisionTreeClassificationModel.load('tmp')
Example #10
@classmethod
def setUpClass(cls):
    # Excerpted from a unittest TestCase; `setup` is an optional module-level hook.
    cls.spark = createLocalSparkSession("Unit Tests")
    if setup:
        setup()
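
# The matching teardown usually stops the session so repeated local test runs do
# not leak Spark contexts; a minimal sketch under the same assumptions:
@classmethod
def tearDownClass(cls):
    cls.spark.stop()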