Exemplo n.º 1
0
def task_C(df_train, df_test, sc, sqlContext, pipe, features_col,
           first_topic_index=125):
    """Cross-validate NaiveBayes and DecisionTree per topic and average errors.

    For every unique value of ``df_train.topic`` whose position in the sorted
    unique array is at least ``first_topic_index``, the function:

    1. asks ``getSimilar`` for positional indices into ``df_test``,
    2. concatenates the training rows of that topic with those test rows,
    3. shifts the ``class`` column by +2,
    4. converts the result to a Spark DataFrame, fits ``pipe`` on it and
       transforms it,
    5. runs ``cross_validation_task_C`` once with the NaiveBayes model and
       once with the DecisionTree model, collecting the two values each call
       returns.

    Args:
        df_train: pandas DataFrame with at least ``topic`` and ``class``
            columns.
        df_test: pandas DataFrame indexed positionally via ``iloc``.
        sc: forwarded unchanged to ``cross_validation_task_C``.
        sqlContext: used to build the Spark DataFrame; also forwarded.
        pipe: object exposing ``fit(df).transform(df)``.
        features_col: name of the features column used by the models.
        first_topic_index: position of the first topic to evaluate; earlier
            topics are only counted for the progress output. Defaults to 125,
            the value that was previously hard-coded.

    Returns:
        Tuple ``(mean maem NB, mean maeni NB, mean maem DT, mean maeni DT)``.
        If no topic is evaluated the lists are empty and ``np.mean`` yields
        ``nan`` for every entry.
    """
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial").setFeaturesCol(
        features_col).setLabelCol('class')
    dtc = DecisionTree()

    maem = []
    maeni = []
    maem_dt = []
    maeni_dt = []

    # Computed once; the same array drives both the banner and the loop.
    topics = np.unique(df_train.topic)
    print("Numero di topic da processare:", len(topics))

    for position, topic in enumerate(topics):
        if position >= first_topic_index:
            index_test = getSimilar(df_train, df_test, topic, test_size=0.3)[0]

            tt_set = pd.concat(
                [df_train[df_train.topic == topic], df_test.iloc[index_test]])
            # presumably shifts labels into a non-negative range for the
            # multinomial model -- TODO confirm against the label encoding
            tt_set['class'] = tt_set['class'].map(lambda x: x + 2)

            df_f = sqlContext.createDataFrame(tt_set)
            tr3_test = pipe.fit(df_f).transform(df_f)

            # The boolean tells the cross validator which model type it got:
            # True for the tree, False for NaiveBayes.
            # NOTE(review): toPandas() runs twice on purpose here; hoist it
            # only if cross_validation_task_C is known not to mutate its input.
            maem_aux, maeni_aux = cross_validation_task_C(
                tr3_test.toPandas(), nb, sqlContext, False, features_col, sc)
            maem.append(maem_aux)
            maeni.append(maeni_aux)

            maem_aux, maeni_aux = cross_validation_task_C(
                tr3_test.toPandas(), dtc, sqlContext, True, features_col, sc)
            maem_dt.append(maem_aux)
            maeni_dt.append(maeni_aux)

            # Output noted by the original author (labelled "categorie"),
            # after 150 topics:
            # 2.17504068541 1.18580006402 1.31400883186 0.601955388286

        # Progress is reported on the number of topics visited so far,
        # including the skipped ones.
        visited = position + 1
        if visited % 25 == 0:
            print("Topic processati:", visited)
            print(np.mean(maem), np.mean(maeni), np.mean(maem_dt),
                  np.mean(maeni_dt))

    print(
        "NaiveBayes maem, NaiveBayes maeni, DecisionTree maem, DecisionTree maeni"
    )
    return (np.mean(maem), np.mean(maeni), np.mean(maem_dt), np.mean(maeni_dt))
Exemplo n.º 2
0
# Train an SVMWithSGD model on the training data and score it.
algo = SVMWithSGD()
model = algo.train(training_data)
score(model)


# ---- Trees ----
# Next come three tree-based classifiers. They are trained through
# trainClassifier(...) rather than train(...), so the calls look slightly
# different from the algorithms above.
from pyspark.mllib.tree import (
    DecisionTree,
    GradientBoostedTrees,
    RandomForest,
)

# Single decision tree, binary classification, no categorical features.
algo = DecisionTree()
model = algo.trainClassifier(
    training_data,
    numClasses=2,
    categoricalFeaturesInfo={},
)
score(model)


# Gradient-boosted trees with 10 boosting iterations.
algo = GradientBoostedTrees()
model = algo.trainClassifier(
    training_data,
    categoricalFeaturesInfo={},
    numIterations=10,
)
score(model)

# Random forest of 16 trees, binary classification.
algo = RandomForest()
model = algo.trainClassifier(
    training_data,
    numClasses=2,
    categoricalFeaturesInfo={},
    numTrees=16,
)
score(model)

# ---- Naive Bayes ----
# Last but not least, let's try the Naive Bayes classifier.
from pyspark.mllib.classification import NaiveBayes
Exemplo n.º 3
0
# Hyper-parameters shared by the models trained below.
num_iterations = 10
max_tree_depth = 5

# Each trainer is called on the class itself; no instance is needed.
lr_model = LogisticRegressionWithLBFGS.train(data, iterations=num_iterations)
print("logistic regression model :")
print(lr_model)

svm_model = SVMWithSGD.train(data, iterations=num_iterations)
print("svm model :")
print(svm_model)

# Naive Bayes is trained on its own dataset (nb_data), not on `data`.
nb_model = NaiveBayes.train(nb_data)
print("naive bayes model :")
print(nb_model)

# Binary classification, no categorical features. max_tree_depth was
# previously defined but never passed on; it is now wired to maxDepth.
dt_model = DecisionTree.trainClassifier(
    data,
    numClasses=2,
    categoricalFeaturesInfo={},
    maxDepth=max_tree_depth,
)
print("decision tree model :")
print(dt_model)

# Predict on the first record and compare with its true label.
data_point = data.first()
lr_prediction = lr_model.predict(data_point.features)
print("logistic model prediction :" + str(lr_prediction))
print("the true label :" + str(data_point.label))

# Column-wise summary statistics over the feature vectors.
vectors = data.map(lambda lp: lp.features)
matrix = RowMatrix(vectors)
matrix_summary = matrix.computeColumnSummaryStatistics()
print("the col mean of matrix :")
print(matrix_summary.mean())
Exemplo n.º 4
0
from pyspark import SparkContext, SparkConf
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

# Spark context for this example, using the 'local[2]' master.
conf = SparkConf().setAppName('Decision Tree Regression').setMaster('local[2]')
sc = SparkContext(conf=conf)

# load data
data = MLUtils.loadLibSVMFile(sc, '../data/sample_libsvm_data.txt')
# split the data into training (70%) and test (30%) sets
(training, testData) = data.randomSplit([0.7, 0.3])

# train a decision tree regressor; trainRegressor is called on the class
# itself, no DecisionTree instance is needed
model = DecisionTree.trainRegressor(training,
                                    categoricalFeaturesInfo={},
                                    impurity='variance',
                                    maxDepth=5,
                                    maxBins=32)

# evaluate model on test instances and compute the test error
predictions = model.predict(testData.map(lambda x: x.features))
labelAndPredictions = testData.map(lambda x: x.label).zip(predictions)
# Each element is a (label, prediction) pair. It is indexed explicitly
# because tuple parameters in lambdas (`lambda (v, p): ...`) are a
# SyntaxError on Python 3.
testMSE = labelAndPredictions.map(
    lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(testData.count())

print('test mean squared error :' + str(testMSE))
print('learned regression tree model :')
print(model.toDebugString())

# save and load model
model.save(sc, '../model/myDecisionTreeRegressionModel')