Python DecisionTree.DecisionTree示例

编程语言: Python

命名空间/包名称: pyspark.mllib.tree

类/类型: DecisionTree

方法/功能: DecisionTree

hotexamples.com的示例: 4

Python DecisionTree.DecisionTree - 已找到4个示例。这些是从开源项目中提取的、评价最高的 pyspark.mllib.tree.DecisionTree.DecisionTree 真实 Python 示例。您可以为示例评分，帮助我们提高示例质量。

常用方法

显示隐藏

trainRegressor(30)

trainClassifier(30)

DecisionTree(4)

predict(1)

save(1)

toDebugString(1)

trainClassfier(1)

示例#1

显示文件

def task_C(df_train, df_test, sc, sqlContext, pipe, features_col):
    """Cross-validate a NaiveBayes and a DecisionTree estimator topic by topic.

    The distinct values of ``df_train.topic`` are visited in the order
    returned by ``np.unique``.  The first 125 topics are only counted; from
    position 125 onward the topic's training rows are concatenated with the
    test rows selected by ``getSimilar``, the ``class`` column is shifted by
    +2, the frame is converted with ``sqlContext.createDataFrame``, passed
    through ``pipe`` and finally handed to ``cross_validation_task_C`` once
    per estimator.

    Progress and running means are printed every 25 topics.

    Returns:
        A 4-tuple of ``np.mean`` values, in the order
        (NaiveBayes maem, NaiveBayes maeni, DecisionTree maem,
        DecisionTree maeni).
    """
    bayes = NaiveBayes(smoothing=1.0, modelType="multinomial")
    bayes = bayes.setFeaturesCol(features_col).setLabelCol('class')
    tree = DecisionTree()

    # One list per (estimator, metric) pair; one entry per processed topic.
    nb_maem, nb_maeni, dt_maem, dt_maeni = [], [], [], []

    topics = np.unique(df_train.topic)
    print("Numero di topic da processare:", len(topics))

    for position, topic in enumerate(topics):
        if position >= 125:
            # Not read again anywhere in this function.
            pred = pd.DataFrame(columns=['class', 'prediction'])

            similar_idx = getSimilar(
                df_train, df_test, topic, test_size=0.3)[0]

            topic_rows = df_train[df_train.topic == topic]
            similar_rows = df_test.iloc[similar_idx]
            merged = pd.concat([topic_rows, similar_rows])
            merged['class'] = merged['class'].map(lambda label: label + 2)

            spark_df = sqlContext.createDataFrame(merged)
            transformed = pipe.fit(spark_df).transform(spark_df)

            # The boolean tells the cross validator which kind of estimator
            # it receives: True for the tree, False for NaiveBayes.
            nb_m, nb_ni = cross_validation_task_C(
                transformed.toPandas(), bayes, sqlContext, False,
                features_col, sc)
            nb_maem.append(nb_m)
            nb_maeni.append(nb_ni)

            dt_m, dt_ni = cross_validation_task_C(
                transformed.toPandas(), tree, sqlContext, True,
                features_col, sc)
            dt_maem.append(dt_m)
            dt_maeni.append(dt_ni)

            # Output recorded by the original author for "categorie":
            #   Topic processati: 150
            #   2.17504068541 1.18580006402 1.31400883186 0.601955388286

        processed = position + 1

        if processed % 25 == 0:
            print("Topic processati:", processed)
            print(np.mean(nb_maem), np.mean(nb_maeni), np.mean(dt_maem),
                  np.mean(dt_maeni))

    print(
        "NaiveBayes maem, NaiveBayes maeni, DecisionTree maem, DecisionTree maeni"
    )
    return (np.mean(nb_maem), np.mean(nb_maeni), np.mean(dt_maem),
            np.mean(dt_maeni))

示例#2

显示文件

# Train with SVMWithSGD on training_data and evaluate the result with score().
# SVMWithSGD, training_data and score are all defined outside this excerpt.
algo = SVMWithSGD()
model = algo.train(training_data)
score(model)


##### Trees
#####
##### Now let's try three variants of tree-based classification.
##### The API is slightly different from the previous algorithms: the entry
##### point is trainClassifier() instead of train(), and it takes extra
##### arguments such as categoricalFeaturesInfo.
from pyspark.mllib.tree import DecisionTree

from pyspark.mllib.tree import GradientBoostedTrees

from  pyspark.mllib.tree import RandomForest

# NOTE(review): in pyspark.mllib the train* entry points are presumably
# classmethods, so the instances created below would not be required --
# confirm against the installed PySpark version.

# Single decision tree, two classes; the empty dict passes no
# categorical-feature information.
algo = DecisionTree()
model = algo.trainClassifier(training_data,numClasses=2,categoricalFeaturesInfo={})
score(model)


# Gradient-boosted trees, 10 iterations.
algo = GradientBoostedTrees()
model = algo.trainClassifier(training_data,categoricalFeaturesInfo={},numIterations=10)
score(model)

# Random forest with 16 trees, two classes.
algo = RandomForest()
model = algo.trainClassifier(training_data,numClasses=2,categoricalFeaturesInfo={},numTrees=16)
score(model)

#### Naive Bayes
#### Last but not least, let's try the Naive Bayes classifier.
from pyspark.mllib.classification import NaiveBayes

示例#3

显示文件

# Training hyper-parameters shared by the models below.
num_iterations = 10
max_tree_depth = 5

# Logistic regression trained for num_iterations iterations.
lr_model = LogisticRegressionWithLBFGS().train(data, num_iterations)
print("logistic regression model :")
print(lr_model)

# Linear SVM trained for num_iterations iterations.
svm_model = SVMWithSGD().train(data, num_iterations)
print("svm model :")
print(svm_model)

# Naive Bayes is trained on its own dataset, nb_data (built outside this
# excerpt).
nb_model = NaiveBayes().train(nb_data)
print("naive bayes model :")
print(nb_model)

# Binary decision tree with no categorical-feature information.
# max_tree_depth is passed explicitly so the constant defined above actually
# controls the tree; previously it was unused and the library default applied.
dt_model = DecisionTree().trainClassifier(data, 2, {},
                                          maxDepth=max_tree_depth)
print("decision tree model :")
print(dt_model)

# Predict the label of the first record with the logistic regression model
# and print it next to the record's true label.
data_point = data.first()
lr_prediction = lr_model.predict(data_point.features)
print("logistic model prediction :" + str(lr_prediction))
print("the true label :" + str(data_point.label))

# Column summary statistics of the feature vectors, computed through a
# RowMatrix built from the features of every record.
vectors = data.map(lambda lp: lp.features)
matrix = RowMatrix(vectors)
matrix_summary = matrix.computeColumnSummaryStatistics()
print("the col mean of matrix :")
print(matrix_summary.mean())

示例#4

显示文件

from pyspark import SparkContext, SparkConf
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

# Spark context configured with the 'local[2]' master.
conf = SparkConf().setAppName('Decision Tree Regression').setMaster('local[2]')
sc = SparkContext(conf=conf)

# Load the LIBSVM-formatted dataset.
data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt')
# Split the data into training and test sets with weights 0.7 / 0.3.
(training, testData) = data.randomSplit([0.7, 0.3])

# Train a decision tree regressor.  trainRegressor is a classmethod, so it is
# called on the class itself rather than on a throw-away instance.
model = DecisionTree.trainRegressor(training,
                                    categoricalFeaturesInfo={},
                                    impurity='variance',
                                    maxDepth=5,
                                    maxBins=32)

# Evaluate the model on the test set and compute the mean squared error.
predictions = model.predict(testData.map(lambda x: x.features))
labelAndPredictions = testData.map(lambda x: x.label).zip(predictions)
# Each element is a (label, prediction) pair.  It is indexed explicitly
# because tuple parameters in lambdas (`lambda (v, p): ...`) are a
# SyntaxError on Python 3.
testMSE = labelAndPredictions.map(
    lambda pair: (pair[0] - pair[1]) ** 2).sum() / float(testData.count())

print('test mean squared error :' + str(testMSE))
print('learned regression tree model :')
print(model.toDebugString())

# Persist the trained model (loading it back is not shown in this excerpt).
model.save(sc, '../model/myDecisionTreeRegressionModel')