Example No. 1
#-----------------------------
# 7) Performance evaluation
#-----------------------------
def score(model):
    # Predict on the held-out features and compare with the true labels.
    predictions = model.predict(test_data.map(lambda x: x.features))
    labels_and_preds = test_data.map(lambda x: x.label).zip(predictions)
    accuracy = labels_and_preds.filter(lambda x: x[0] == x[1]).count() / float(test_data.count())
    return accuracy

#----------------------
# Algorithms
#----------------------
from pyspark.mllib.classification import LogisticRegressionWithSGD

algo = LogisticRegressionWithSGD()
model = algo.train(training_data)
score(model)

spamExample = tf.transform("You have won $1,000,000. Please fly to Nigeria ASAP. This is urgent".split(" "))
hamExample = tf.transform("Spark is really good at big data processing".split(" "))

print(model.predict(spamExample))
print(model.predict(hamExample))


from pyspark.mllib.classification import LogisticRegressionWithLBFGS

algo = LogisticRegressionWithLBFGS()
model = algo.train(training_data)
score(model)
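
The snippet above assumes a HashingTF instance named tf and the training_data/test_data RDDs of LabeledPoint built earlier in the script. A minimal sketch of that setup, with hypothetical spam.txt and ham.txt input files:

# Sketch of the assumed setup; spam.txt and ham.txt are hypothetical inputs.
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF
from pyspark.mllib.regression import LabeledPoint

sc = SparkContext(appName="spam-classifier")

# Hash each message's words into a fixed-size term-frequency vector.
tf = HashingTF(numFeatures=10000)

spam = sc.textFile("spam.txt").map(
    lambda line: LabeledPoint(1, tf.transform(line.split(" "))))
ham = sc.textFile("ham.txt").map(
    lambda line: LabeledPoint(0, tf.transform(line.split(" "))))

# 80/20 split, cached because both training and scoring reuse the RDDs.
training_data, test_data = spam.union(ham).randomSplit([0.8, 0.2])
training_data.cache()
test_data.cache()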
Example No. 2
    # getting the stop words
    sw = load_stopwords()

    # getting the common words
    cm = load_common_words()

    reference_table = create_hash_table(common_words=cm, stop_words=sw)

    # tokenizing the text
    rdd = rdd.map(lambda d:
                  {
                      'tokens': tokenize(text=d['text'], common_words=cm),
                      'label': d['label']
                  }).\
        map(lambda d: LabeledPoint(0 if d['label'] == 0 else 1,
                                   compute_tf(tokens=d['tokens'],
                                              reference_table=reference_table)))
    # instantiating the logistic regression
    logistic_regression = LogisticRegressionWithSGD()
    # training the logistic regression
    trained_logistic_regression = logistic_regression.train(data=rdd)

    # storing the parameters in a json file
    trained_parameters = {
        'weights': trained_logistic_regression.weights.toArray().tolist(),
        'intercept': trained_logistic_regression.intercept
    }

    with open('model.json', 'w') as model_file:
        json.dump(trained_parameters, fp=model_file)
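
The parameters written to model.json can later be loaded back into a pyspark.mllib LogisticRegressionModel without retraining; a minimal sketch, assuming the binary labels used above (numClasses=2):

import json

from pyspark.mllib.classification import LogisticRegressionModel
from pyspark.mllib.linalg import Vectors

# Rebuild the classifier from the parameters saved by the training code above.
with open('model.json') as model_file:
    trained_parameters = json.load(model_file)

model = LogisticRegressionModel(weights=Vectors.dense(trained_parameters['weights']),
                                intercept=trained_parameters['intercept'],
                                numFeatures=len(trained_parameters['weights']),
                                numClasses=2)

# model.predict(...) then expects a vector built with the same compute_tf /
# reference_table pipeline that produced the training features.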
Example No. 3
print(fake_features.take(1))
print(real_features.take(1))

# label each element; either fake or real sentence
fake_samples = fake_features.map(lambda features: LabeledPoint(1, features))
real_samples = real_features.map(lambda features: LabeledPoint(0, features))

print(fake_samples.take(1))
print(real_samples.take(1))

samples = fake_samples.union(real_samples)
[training_data, test_data] = samples.randomSplit([0.8, 0.2])
training_data.cache()
test_data.cache()

algorithm = LogisticRegressionWithSGD()
model = algorithm.train(training_data)
print('logistic regression sgd:', score(model))

algorithm = LogisticRegressionWithLBFGS()
model = algorithm.train(training_data)
print('logistic regression with lbfgs:', score(model))

# algorithm = DecisionTree()
# model = algorithm.trainClassifier(training_data, numClasses=2,categoricalFeaturesInfo={})
# print('decision tree: ',score(model))
#
# algorithm = RandomForest()
# model = algorithm.trainClassifier(training_data,numClasses=2,categoricalFeaturesInfo={},numTrees=16)
# print('random forest: ',score(model))
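
The tree-based variants commented out above can be run via the classmethods in pyspark.mllib.tree; a short sketch, assuming the same training_data and score() helper as above:

from pyspark.mllib.tree import DecisionTree, RandomForest

# trainClassifier is a classmethod, so no instantiation is needed.
model = DecisionTree.trainClassifier(training_data, numClasses=2,
                                     categoricalFeaturesInfo={})
print('decision tree:', score(model))

model = RandomForest.trainClassifier(training_data, numClasses=2,
                                     categoricalFeaturesInfo={}, numTrees=16)
print('random forest:', score(model))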
Example No. 4
from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD

model_list = {
    "Linear-SVM": SVMWithSGD(),
    "LogisticRegression": LogisticRegressionWithSGD()
}

# ParamGridBuilder belongs to the pyspark.ml API; the pyspark.mllib estimators
# above do not expose Params, so plain dicts of train() keyword arguments are
# used as parameter grids instead.
params_list = {
    "LogisticRegression": {
        "regParam": [0.1, 0.01],
        "step": [0.1, 0.01],
        "miniBatchFraction": [0.1, 0.5, 1.0],
        "regType": ['l1', 'l2', None],
        "convergenceTol": [0.001, 0.0001]
    },
    "Linear-SVM": {
        "regParam": [0.1, 0.01],
        "step": [0.1, 0.01],
        "miniBatchFraction": [0.1, 0.5, 1.0],
        "regType": ['l1', 'l2', None],
        "convergenceTol": [0.001, 0.0001]
    }
}

if __name__ == "__main__":
    # Example usage (training_data and test_data would be prepared elsewhere):
    # model = SVMWithSGD.train(training_data, **params)
    # model.predict(test_data.map(lambda x: x.features))
    #
    # model = LogisticRegressionWithSGD.train(training_data, **params)
    # model.predict(test_data.map(lambda x: x.features))
    pass
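
pyspark.mllib has no built-in cross-validator, so one way to use these grids is a manual sweep that expands each dict into keyword arguments for train(); a sketch, assuming a training_data RDD of LabeledPoint and an accuracy helper score() like the one in Example No. 1:

from itertools import product

def grid_search(name, training_data):
    # Expand the model's grid into every combination of keyword arguments.
    grid = params_list[name]
    keys = list(grid.keys())
    best_acc, best_params = 0.0, None
    for values in product(*(grid[k] for k in keys)):
        params = dict(zip(keys, values))
        model = model_list[name].train(training_data, **params)
        acc = score(model)  # accuracy on the held-out set, as in Example No. 1
        if acc > best_acc:
            best_acc, best_params = acc, params
    return best_acc, best_params

# best_acc, best_params = grid_search("LogisticRegression", training_data)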