Example #1
def svm_second_predict(spark_session, svm_model_path, df, condition):
    """
    支持向量机二分类预测
    :param spark_session: spark 会话
    :param svm_model_path: 模型地址
    :param df: 数据
    :param condition: {"features": [12, 13, 14, 15], "label": "label"}
    特征列
    :return: 预测结果 sparkframe
    """
    feature_indexs = condition['features']
    label_index = condition['label']
    if label_index is None or label_index == "":  # no label column
        # 1. Prepare the data
        def func(x):
            features_data = []
            for feature in feature_indexs:
                features_data.append(x[feature])
            return features_data

        predict_data = df.rdd.map(lambda x: func(x))
        print(predict_data.take(10))

        # 2. Load the model
        svm_model = SVMModel.load(spark_session.sparkContext, svm_model_path)

        # 3. Predict
        from pyspark.sql.types import Row

        def f(x):
            return {"prediction_result": x}

        prediction_rdd = svm_model.predict(predict_data)
        print(prediction_rdd.take(10))
        prediction_df = prediction_rdd.map(lambda x: Row(**f(x))).toDF()
        return prediction_df
    else:  # label column present
        # 1. Prepare the data
        def func(x):
            features_data = []
            for feature in feature_indexs:
                features_data.append(x[feature])
            return LabeledPoint(label=x[label_index], features=features_data)

        predict_label_data = df.rdd.map(lambda x: func(x))
        print(predict_label_data.take(10))

        # 2. Load the model
        svm_model = SVMModel.load(spark_session.sparkContext, svm_model_path)

        # 3. Predict
        from pyspark.sql.types import Row

        def f(x):
            return {"prediction_result": x[0], label_index: x[1]}

        prediction_rdd = predict_label_data.map(
            lambda x: (svm_model.predict(x.features), x.label))
        print(prediction_rdd.take(10))
        prediction_df = prediction_rdd.map(lambda x: Row(**f(x))).toDF()
        return prediction_df
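
A minimal usage sketch for svm_second_predict above; the SparkSession (spark), DataFrame (df), and model path below are placeholders for illustration, not part of the original code:

# Hypothetical call: df is assumed to expose numeric feature columns at
# positions 12-15 and a label column named "label", as in the docstring.
condition = {"features": [12, 13, 14, 15], "label": "label"}
result_df = svm_second_predict(spark, "/tmp/models/svm_second", df, condition)
result_df.show(10)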
Example #2
    def load_parameters(self):
        self.amount_prediction_method = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL,
                                                                 file_name='amount_method')
        self.trend_prediction_method = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL,
                                                                file_name='trend_method')
        self.data_features = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='features')
        self.stock_symbol = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='symbol')
        self.data_parser = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='data_parser')
        amount_model_path = os.path.join(os.path.abspath(self.model_path), 'amount_model')
        trend_model_path = os.path.join(os.path.abspath(self.model_path), 'trend_model')

        if self.amount_prediction_method == self.RANDOM_FOREST:
            amount_model = RandomForestModel.load(sc=self.sc, path=amount_model_path)
        elif self.amount_prediction_method == self.LINEAR_REGRESSION:
            amount_model = LinearRegressionModel.load(sc=self.sc, path=amount_model_path)
        else:
            amount_model = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='amount_model')

        if self.trend_prediction_method == self.RANDOM_FOREST:
            trend_model = RandomForestModel.load(sc=self.sc, path=trend_model_path)
        elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
            trend_model = LogisticRegressionModel.load(sc=self.sc, path=trend_model_path)
        elif self.trend_prediction_method == self.NAIVE_BAYES:
            trend_model = NaiveBayesModel.load(sc=self.sc, path=trend_model_path)
        elif self.trend_prediction_method == self.SVM:
            trend_model = SVMModel.load(sc=self.sc, path=trend_model_path)
        else:
            trend_model = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='trend_model')

        return trend_model, amount_model
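
A hypothetical call site for load_parameters above; the predictor instance and feature vector are placeholders for illustration only:

# 'predictor' stands for an instance of the class that defines load_parameters,
# and 'feature_vector' for an already-prepared feature vector.
trend_model, amount_model = predictor.load_parameters()
print(trend_model.predict(feature_vector))
print(amount_model.predict(feature_vector))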
Example #3
def do_1vsall(class_all, size, num_iter, config):
    features_path = config['protocol'] + config['bucket'] + config['sep'] + config['features_key']
    print('do_1vsall ==============> Setting RDD_ALL')
    rdd_all = sc.textFile(features_path, minPartitions=4).map(lambda line: line.split(',')).persist()
    print('do_1vsall ==============> Setting RDD_TRAIN_SET')
    rdd_train_set = rdd_all.filter(lambda features: int(features[1]) <= size) \
        .map(lambda features: ['0.0' if features[0] == class_all else '1.0'] + features[2:]) \
        .map(make_labeled_point)

    print('do_1vsall ==============> Setting RDD_TEST_SET')
    rdd_test_set = rdd_all.filter(lambda features: size < int(features[1])) \
        .map(lambda features: ['0.0' if features[0] == class_all else '1.0'] + features[2:]) \
        .map(make_labeled_point)

    # Build the model
    model_dir = class_all + '_' + str(size) + '_' + str(num_iter)
    model_s3_file = config['model_key'] + config['sep'] + model_dir
    model = None
    if s3_object_exists(config['bucket'], model_s3_file):
        print('do_1vsall ==============> Loading SVM Model: {}...'.format(model_s3_file))
        model = SVMModel.load(sc, config['protocol'] + config['bucket'] + config['sep'] + model_s3_file)
    else:
        print('do_1vsall ==============> Building SVM Model')
        model = SVMWithSGD.train(rdd_train_set, iterations=num_iter)
        print('do_1vsall ==============> Saving SVM Model: {}...'.format(model_s3_file))
        model.save(sc, config['protocol'] + config['bucket'] + config['sep'] + model_s3_file)

    # Evaluate the model on the test data
    print('do_1vsall ==============> Evaluating test set')
    labels_and_preds = rdd_test_set.map(lambda p: (p.label, model.predict(p.features)))
    train_err = labels_and_preds.filter(lambda lp: lp[0] != lp[1]).count() / float(rdd_test_set.count())
    # print("Test Error = " + str(train_err))
    success = round(((1 - train_err) * 100), 2)
    print('{},{}'.format(str(size), str(success)))
    return size, success
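
make_labeled_point is referenced in do_1vsall but not shown above. A plausible sketch, assuming each mapped row carries the binary label string first and numeric feature strings after it (this helper is an assumption, not the project's code):

from pyspark.mllib.regression import LabeledPoint

def make_labeled_point(fields):
    # fields[0] is '0.0' or '1.0'; the remaining elements are feature values
    return LabeledPoint(float(fields[0]), [float(f) for f in fields[1:]])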
Example #4
 def Prediction(self, modelType):
     data_point = self.Features
     if modelType == 'RF':
         model = RandomForestModel.load(
             self.sc, self.baseDir + '/fraudModel/Model/' + modelType)
         result = np.array(
             model.predict(self.sc.parallelize(data_point)).collect())
         self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result)
     elif modelType == 'GBDT':
         model = GradientBoostedTreesModel.load(
             self.sc, self.baseDir + '/fraudModel/Model/' + modelType)
         result = np.array(
             model.predict(self.sc.parallelize(data_point)).collect())
         self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result)
     elif modelType == 'LRsgd':
         model = LogisticRegressionModel.load(
             self.sc, self.baseDir + '/fraudModel/Model/' + modelType)
         result = np.array(
             model.predict(self.sc.parallelize(data_point)).collect())
         self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result)
     elif modelType == 'LRlbfgs':
         model = LogisticRegressionModel.load(
             self.sc, self.baseDir + '/fraudModel/Model/' + modelType)
         result = np.array(
             model.predict(self.sc.parallelize(data_point)).collect())
         self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result)
     elif modelType == 'SVM':
         model = SVMModel.load(
             self.sc, self.baseDir + '/fraudModel/Model/' + modelType)
         result = np.array(
             model.predict(self.sc.parallelize(data_point)).collect())
         self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result)
     else:
         pass
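
The five branches above differ only in which MLlib class loads the model; a more compact equivalent (a sketch under the same imports as the original class, not the project's code) would dispatch through a table:

# Dispatch table mapping modelType to the matching MLlib loader class.
MODEL_LOADERS = {
    'RF': RandomForestModel,
    'GBDT': GradientBoostedTreesModel,
    'LRsgd': LogisticRegressionModel,
    'LRlbfgs': LogisticRegressionModel,
    'SVM': SVMModel,
}

def Prediction(self, modelType):
    loader = MODEL_LOADERS.get(modelType)
    if loader is None:
        return
    model = loader.load(self.sc, self.baseDir + '/fraudModel/Model/' + modelType)
    result = np.array(model.predict(self.sc.parallelize(self.Features)).collect())
    self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result)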
Example #5
 def __init__(self, path):
     conf = SparkConf() \
         .setAppName("crankshaw-pyspark") \
         .set("spark.executor.memory", "2g") \
         .set("spark.kryoserializer.buffer.mb", "128") \
         .set("master", "local")
     sc = SparkContext(conf=conf, batchSize=10)
     self.model = SVMModel.load(sc, path)
     self.path = path
     print("started spark")
Example #6
    def __init__(self, sc):
        logger.info("Starting up the Classification Engine..")
        self.sc = sc

        #load data
        logger.info("Loading up data..")
        iris_data_raw_RDD = self.sc.textFile("/home/hduser/iris-data.txt")
        self.iris_data_parsed_RDD = iris_data_raw_RDD.map(self.parsePoint)
        #self.train_model()
        self.model = SVMModel.load(self.sc,
                                   "/home/hduser/pythonSVMWithSGDModel")
Example #7
def main(sc):
    train_data='/usr/local/spark/data/mllib/sample_svm_data.txt'
    data=sc.textFile(train_data).map(parse)
    
    if os.path.exists('model'):
        model=SVMModel.load(sc, 'model')
    else:
        model=SVMWithSGD.train(data, iterations=100)
        model.save(sc, 'model')

    labelsAndPreds=data.map(lambda p: (p.label, model.predict(p.features)))

    # trainErr=labelsAndPreds.filter(lambda (v, p): v != p).count() / float(data.count())
    # print('Training Error ='  + str(trainErr))

    labelsAndPreds.map(lambda x:str(x[0])+'\t'+str(x[1])).saveAsTextFile('labelsAndPreds')
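
The parse helper used above is not shown. Since the input is data/mllib/sample_svm_data.txt (space-separated, label first, as in the later examples), a plausible sketch is:

from pyspark.mllib.regression import LabeledPoint

def parse(line):
    # label followed by space-separated feature values
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:])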
Example #8
def svm_second_evaluation(spark_session, svm_model_path, df, predict_condition,
                          condition):
    """
    svm二分类评估
    :param spark_session:
    :param svm_model_path: 模型地址
    :param df: 预测数据
    :param predict_condition: 预测算子(父算子)配置
    :param condition: 该算子配置 {"label":"标签"}
    :return:
    """

    feature_indexs = predict_condition['features']
    label = condition['label']

    # 1. Prepare the data
    def func(x):
        features_data = []
        for feature in feature_indexs:
            features_data.append(x[feature])
        return LabeledPoint(label=x[label], features=features_data)

    predict_data = df.rdd.map(lambda x: func(x))

    # Load the model
    svm_model = SVMModel.load(spark_session.sparkContext, svm_model_path)

    # Compute evaluation metrics
    svmTotalCorrect = predict_data.map(lambda r: 1 if (svm_model.predict(
        r.features) == r.label) else 0).reduce(lambda x, y: x + y)
    svmAccuracy = svmTotalCorrect / float(predict_data.count())

    # Clear the default threshold so the model returns raw scores (confidence values) instead of 0/1 labels
    svm_model.clearThreshold()
    svmPredictionAndLabels = predict_data.map(
        lambda lp: (float(svm_model.predict(lp.features)), lp.label))
    svmMetrics = BinaryClassificationMetrics(svmPredictionAndLabels)
    print("Area under PR = %s" % svmMetrics.areaUnderPR)
    print("Area under ROC = %s" % svmMetrics.areaUnderROC)

    # Return the results
    result = [("正确个数", float(svmTotalCorrect)), ("精准度", float(svmAccuracy)),
              ("Area under PR", float(svmMetrics.areaUnderPR)),
              ("Area under ROC", float(svmMetrics.areaUnderROC))]
    return spark_session.createDataFrame(result, schema=['指标', '值'])
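
A hypothetical call chain for the prediction and evaluation operators above; the SparkSession, model path, DataFrame, and column positions are placeholders:

predict_condition = {"features": [12, 13, 14, 15], "label": "label"}
metrics_df = svm_second_evaluation(spark, "/tmp/models/svm_second", df,
                                   predict_condition, {"label": "label"})
metrics_df.show()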
Example #9
def SVM_function(rdd,sc,method):
    #method
    from pyspark.mllib.classification import SVMModel
    print("rdd map")
    if method =='TimeDomain':
        output = TimeDomain(rdd)
        testData = sc.parallelize([output])

    if method =='FrequencyDomain':
        output=frequencyDomain(rdd)
        testData=sc.parallelize([output])

    
    #load model
    print("load model")
    Model = SVMModel.load(sc,"hdfs:///home/spark/Desktop/"+method+"Model")
#------------------------------------------------------------#
    #input data and prediction
    print("labelsAndPreds")
    labelsAndPreds = Model.predict(testData)
    return labelsAndPreds.collect()
Example #10
import datetime
import sys

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.mllib.classification import SVMModel

dataset_name = sys.argv[1]

sc = SparkContext("local", "Model Prediction", pyFiles=[])
sqlContext = SQLContext(sc)

# First load the model we saved in the model generation step
model = SVMModel.load(sc, "hdfs://hadoop:9000/models/noBikesAvailable.model")

# We also need the stats used to normalize the weather variables
stats_df = sqlContext.read.load("hdfs://hadoop:9000/models/weather-stats")
stats = stats_df.collect()[0]

# We want to produce output for each station
station_df = sqlContext.read.load("hdfs://hadoop:9000/station_data_schema")

print "Statistics: %s" % (stats,)

# Load the weather data

current_weather_csv = (
    sc.textFile("hdfs://hadoop:9000/current_weather/%s.csv" % dataset_name)
    .map(lambda line: line.split(","))
Example #11
# %%
lr_predictions = test.map(lambda line:
                          (line[0], line[1], float(lr_model.predict(line[3]))))
lr_predictions.coalesce(1).toDF().write.options(header="true").csv(
    "hdfs://node1:9000/user/root/exp4/predictions/lr_predictions.csv")

# %% [markdown]
# Date: 2020-12-20 14:08:52  Rank: none
# score:0.5015744
# %% [markdown]
# ## SVM

# %%
from pyspark.mllib.classification import SVMModel
svm_model = SVMModel.load(
    sc, "hdfs://node1:9000/user/root/exp4/models/SVMWithSGDModel")

# %%
svm_predictions = test.map(
    lambda line: (line[0], line[1], float(svm_model.predict(line[3]))))
svm_predictions.coalesce(1).toDF().write.options(header="true").csv(
    "hdfs://node1:9000/user/root/exp4/predictions/svm_predictions.csv")

# %% [markdown]
# Date: 2020-12-20 14:18:59  Rank: none
# score:0.5156678
# %% [markdown]
# ## Gradient Boosted Trees

# %%
from pyspark.mllib.tree import GradientBoostedTreesModel
Example #12
from pyspark import SparkContext
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

from document import DocumentSVM


sc = SparkContext(appName="SVM")
svmModel = SVMModel.load(sc, "../SVMModel")

def parsePoint(line):
    splits = line.split(':')
    doc = DocumentSVM(splits[0], splits[1])
    return doc.svmVec()

data = sc.textFile("hdfs://localhost:8020/user/manh/vectorsvmtest")

parsedData = data.map(parsePoint)

labelsAndPreds = parsedData.map(lambda p: (p.label, svmModel.predict(p.features)))

#0
for i in range(2):
	precision = labelsAndPreds.filter(lambda(v, p) : v == i and v == p).count() / float(labelsAndPreds.filter(lambda(v, p) : p == i).count())

	recall = labelsAndPreds.filter(lambda(v, p) : v == i and v == p).count() / float(labelsAndPreds.filter(lambda(v, p) : v == i).count())

	print("==================Precision c" + str(i) + " : " + str(precision))
	print("==================Recall c" + str(i) + " : " + str(recall))

accuracy = labelsAndPreds.filter(lambda (v, p): v == p).count() / float(parsedData.count())
Example #13
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2015-11-23 20:18:03
# @Author  : Your Name ([email protected])
# @Link    : http://example.org
# @Version : $Id$
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:])

data = sc.textFile("data/mllib/sample_svm_data.txt")
parsedData = data.map(parsePoint)

# Build the model
model = SVMWithSGD.train(parsedData, iterations=100)

# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))

# Save and load model
model.save(sc, "myModelPath")
sameModel = SVMModel.load(sc, "myModelPath")

Example #14
def main():
    #retrieve argument
    args = parse_arguments()
    main_directory = args.directory
    class1 = args.class1
    class2 = args.class2
    force_by_user = args.force
    if args.verbose:
        lg.basicConfig(level=lg.INFO)

    #Variables declaration
    result = []
    directory_feature = os.path.join(main_directory, "features", "*.json")
    nb_training_data_list = args.nb_training_data
    iteration_model_list = args.iteration_model

    lg.info('Features directory is %s', directory_feature)
    for iteration_model in iteration_model_list:
        for nb_training_data in nb_training_data_list:
            model_file = 'model_' + class1 + '_' + class2 + '_' + str(
                nb_training_data) + '_' + str(iteration_model)
            result_file = 'result_' + class1 + '_' + class2 + '_' + str(
                nb_training_data) + '_' + str(
                    iteration_model) + '_' + time.strftime(
                        "%Y%m%d%H%M%S") + '.json'
            model_pathname = os.path.join(main_directory, "models", model_file)

            is_model = False
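            # NOTE: is_model stays False throughout this loop, so the branch
            # below that loads a saved model is never taken and the model is
            # always retrained (compare the S3 existence check in the later
            # variant of this script).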

            start_time = time.time()
            lg.info(
                '#################### Starting pet-classification ######################'
            )
            lg.info('Class 1 is %s', class1)
            lg.info('Class 2 is %s', class2)
            lg.info('Number of training datas is %s', nb_training_data)
            lg.info('Number of iterations model is %s', iteration_model)

            #persist a common rdd which is used by both the training and testing data
            common_rdd = sc.textFile(directory_feature, minPartitions=4)\
                           .filter(lambda line: line.split(', ')[0] in (class1, class2) or class2 == 'All')\
                           .persist()

            #Loading model if exists
            if is_model and not force_by_user:
                model = SVMModel.load(sc, model_pathname)
                lg.info('Found and load recorded model %s', model_file)
            else:
                lg.info('No recorded model found')
                #create training rdd and train model if no model found or force
                train_data_rdd = common_rdd.filter(lambda line: int(line.split(',')[1]) <= nb_training_data)\
                                           .map(lambda line: Row(label=0.0, features=line.split(', ')[2:])
                                                if line.split(', ')[0] == class1
                                                else Row(label=1.0, features=line.split(', ')[2:]))\
                                           .map(lambda line: LabeledPoint(line.label, line.features))

                lg.info('%s features for training datas',
                        train_data_rdd.count())
                lg.info('Start to training model')
                model = SVMWithSGD.train(train_data_rdd,
                                         iterations=iteration_model)
                lg.info('Training model terminated')

            training_time = time.time()
            training_duration = training_time - start_time
            #Create testing rdd
            test_data_rdd = common_rdd.filter(lambda line: int(line.split(', ')[1]) > nb_training_data)\
                              .map(lambda line: Row(label=0.0, features=line.split(', ')[2:])
                                   if line.split(', ')[0] == class1
                                   else Row(label=1.0, features=line.split(', ')[2:]))\
                              .map(lambda row: LabeledPoint(row.label, row.features))
            lg.info('%s features for test datas', test_data_rdd.count())

            # Evaluate the model on the test data
            predictions = test_data_rdd.map(
                lambda row: (row.label, float(model.predict(row.features))))
            train_error = predictions.filter(lambda lp: lp[0] != lp[1]).count() \
                                             / float(predictions.count())
            lg.info('Test Error : %s', str(train_error))
            end_time = time.time()
            duration = end_time - start_time
            lg.info('Duration %s', str(duration))
            prediction_duration = end_time - training_time
            #Save and dump the result to a local JSON file
            result = {
                "class1": class1,
                "class2": class2,
                "nb_training_data": nb_training_data,
                "error": train_error,
                "iteration_model": iteration_model,
                "total_duration": duration,
                "training_duration": training_duration,
                "prediction_duration": prediction_duration
            }

            with open(result_file, 'w') as result_file:
                json.dump(result, result_file)

    lg.info(
        '#################### Ending pet-classification ######################'
    )
    input("press ctrl+c to exit")
Example #15
    values = [float(x) for x in clean_line_split]
    if values[4] == 0:
        values[4] = 1
    else:
        values[4] = 0

    return LabeledPoint(values[4], values[0:3])  # dep_delay, cancelled, diverted, carrierdelay, weather delay, NASdelay, Security delay, LateAircraftdelay

#examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect()
parsedData = raw_data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])

startTime = datetime.now()
# Build the model
trainingData.cache ()
model = SVMWithSGD.train(trainingData, iterations=1)
print ('Training Time consumed = '), (datetime.now() - startTime)
startTestTime = datetime.now()
testData.cache()
# Evaluating the model on test data
labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testData.count())
print ('Testing Time consumed = '), (datetime.now() - startTestTime)
print ('Time consumed = '), (datetime.now() - startTime)

print("Training Error = " + str(testErr))

# Save and load model
model.save(sc, "SVMnarrow95-08")
sameModel = SVMModel.load(sc, "SVMnarrow95-08")
Example #16
    spark = SparkSession.builder.appName("downsample").getOrCreate()

    lines = spark.read.csv("s3://daen-cyber/filteredSource/only5s",header=True)
    lines = lines.select([c for c in lines.columns if c not in {'ip','maxScore','minScore','avgScore','trendUp','trendDown','trueCount','dataSetCount','mostCommonCustomerHit'}])

    #for 5s test we don't care about the label, just use 1.0
    def labeledPointConverter(row):
        try:
            return LabeledPoint(1.0, row[1:])
        except ValueError:
            return LabeledPoint(50.0,[1.0])

    parsedData = lines.rdd.map(lambda x: labeledPointConverter(x))
    parsedData = parsedData.filter(lambda x: x.label != 50.0)
    parsedData.cache()
    model = SVMModel.load(sc, "s3://daen-cyber/models/no5sSvmModel0")
    preds = parsedData.map(lambda p: model.predict(p.features))
    parsedData.unpersist()
    preds.cache()

    below5 = preds.filter(lambda p: p == 0.0).count()
    above5 = preds.filter(lambda p: p == 1.0).count()

    listToOutput = []
    listToOutput = listToOutput + [("Above 5", str(above5))]
    listToOutput = listToOutput + [("Below5", str(below5))]

    listToOutputRDD = sc.parallelize(listToOutput, 1)\
         .saveAsTextFile("s3://daen-cyber/models/only5sSvmResults0")

    #model.save(sc, "s3://daen-cyber/modelsb/no5sSvmModel0")
Example #17
File: SVM.py  Project: bmewing/spark_vs_r
    if values[7] == 0:
        values[7] = 1
    else:
        values[7] = 0

    return LabeledPoint(values[7], values[0:7])  # dep_delay, cancelled, diverted, carrierdelay, weather delay, NASdelay, Security delay, LateAircraftdelay

#examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect()
parsedData = raw_data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])
startTime = datetime.now()

# Build the model
trainingData.cache ()
model = SVMWithSGD.train(trainingData, iterations=1)
print ('Training Time consumed = '), (datetime.now() - startTime)
startTestTime = datetime.now()

# Evaluating the model on test data
labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testData.count())
print ('Testing Time consumed = '), (datetime.now() - startTestTime)
print ('Time consumed = '), (datetime.now() - startTime)

print("Training Error = " + str(testErr))


# Save and load model
model.save(sc, "SVMwide00-08train")
sameModel = SVMModel.load(sc, "SVMwide00-08train")

Example #18
def classify_with_model(input_data_path, model_file_path):
    input_parsed = sc.textFile(input_data_path).map(parse_point)
    model = SVMModel.load(sc, model_file_path)
    labels = input_parsed.map(lambda p: model.predict(p.features))
    labels.saveAsTextFile("predictions")
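
parse_point is referenced above but not defined here. A sketch, assuming space-separated values with a leading label (the actual input format is an assumption):

from pyspark.mllib.regression import LabeledPoint

def parse_point(line):
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:])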
Example #19
def main():
    #retrieve argument
    args = parse_arguments()
    main_directory = args.directory
    class1 = args.class1
    class2 = args.class2
    force_by_user = args.force
    if args.verbose:
        lg.basicConfig(level=lg.INFO)

    #Variables declaration
    result = []
    directory_feature = os.path.join(main_directory, "features", "*.json")
    nb_training_data = args.nb_training_data
    iteration_model = args.iteration_model
    min_partition = args.min_partition
    s3 = boto3.resource('s3')
    bucket = s3.Bucket('oc-calculdistribues-sberton')
    result_file = class1 + '_' + class2 + '_' + time.strftime(
        "%Y%m%d%H%M%S") + '.json'
    model_file = 'model_' + class1 + '_' + class2 + '_' + str(
        nb_training_data) + '_' + str(iteration_model)
    model_pathname = os.path.join(main_directory, "models", model_file)

    #Searching existing model and store existence in is_model boolean
    key = 'distributed_learning/models/' + model_file
    objs = list(bucket.objects.filter(Prefix=key))
    is_model = len(objs) > 0 and objs[0].key.startswith(key + '/')

    start_time = time.time()
    lg.info(
        '#################### Starting pet-classification ######################'
    )
    lg.info('Class 1 is %s', class1)
    lg.info('Class 2 is %s', class2)
    lg.info('Number of training datas is %s', nb_training_data)
    lg.info('Number of iterations model is %s', iteration_model)

    #persist a common rdd which is used by both the training and testing data
    common_rdd = sc.textFile(directory_feature, minPartitions=min_partition)\
                   .filter(lambda line: line.split(', ')[0] in (class1, class2) or class2 == 'All')\
                   .persist()

    #Loading model if exists
    if is_model and not force_by_user:
        model = SVMModel.load(sc, model_pathname)
        lg.info('Found and load recorded model %s', model_file)
    else:
        lg.info('No recorded model found')
        #create training rdd and train model if no model found or force
        train_data_rdd = common_rdd.filter(lambda line: int(line.split(',')[1]) <= nb_training_data)\
                                   .map(lambda line: Row(label=0.0, features=line.split(', ')[2:])
                                        if line.split(', ')[0] == class1
                                        else Row(label=1.0, features=line.split(', ')[2:]))\
                                   .map(lambda line: LabeledPoint(line.label, line.features))

        lg.info('%s features for training datas', train_data_rdd.count())
        lg.info('Start to training model')
        model = SVMWithSGD.train(train_data_rdd, iterations=iteration_model)
        lg.info('Training model terminated')

    training_time = time.time()
    training_duration = training_time - start_time
    #Create testing rdd
    test_data_rdd = common_rdd.filter(lambda line: int(line.split(', ')[1]) > nb_training_data)\
                      .map(lambda line: Row(label=0.0, features=line.split(', ')[2:])
                                           if line.split(', ')[0] == class1
                                           else Row(label=1.0, features=line.split(', ')[2:]))\
                      .map(lambda row: LabeledPoint(row.label, row.features))
    lg.info('%s features for test datas', test_data_rdd.count())

    # Evaluate the model on the test data
    predictions = test_data_rdd.map(
        lambda row: (row.label, float(model.predict(row.features))))
    train_error = predictions.filter(lambda lp: lp[0] != lp[1]).count() \
                                     / float(predictions.count())
    lg.info('Test Error : %s', str(train_error))
    end_time = time.time()
    duration = end_time - start_time
    lg.info('Duration %s', str(duration))
    prediction_duration = end_time - training_time
    # #Save and dump result on S3
    result.append({
        "class1": class1,
        "class2": class2,
        "iteration_model": iteration_model,
        "nb_training_data": nb_training_data,
        "total_duration": duration,
        "train_duration": training_duration,
        "predict_duration": prediction_duration,
        "error": train_error
    })

    s3object = s3.Object('oc-calculdistribues-sberton', result_file)
    s3object.put(Body=(bytes(json.dumps(result, indent=2).encode('UTF-8'))))

    #Save model if not exists
    if not is_model:
        lg.info('Saving model at %s', model_file)
        model.save(sc, model_pathname)

    lg.info(
        '#################### Ending pet-classification ######################'
    )
Example #20
    parsed_data = MLUtils\
        .loadLibSVMFile(spark_context, "data/classificationdata.txt")\
        .cache()

    print("Parsed data size: " + str(parsed_data.count()))

    # Split initial RDD into two... [60% training data, 40% testing data]
    training, test = parsed_data\
        .randomSplit([0.6, 0.4], seed=3)

    print("Training points size: " + str(training.count()))
    print("Test points size    : " + str(test.count()))

    # Build the model
    model = SVMWithSGD.train(training, iterations=100)

    score_and_labels = test.map(lambda point: score_function(point, model))

    #for score, label in score_and_labels.collect():
    #    print("Score: %d, label: %f" % (score, label))

    # Get evaluation metrics
    metrics = BinaryClassificationMetrics(score_and_labels)
    auROC = metrics.areaUnderROC
    print("Area under ROC: %f" % auROC)

    # Save and load model
    model.save(spark_context, "SVMModel3")
    sameModel = SVMModel.load(spark_context, "SVMModel3")

    spark_context.stop()
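
score_function is used above but not included. BinaryClassificationMetrics expects (score, label) pairs, so a plausible sketch (an assumption, not the project's code) is:

def score_function(point, model):
    # Return (prediction score, true label) for one LabeledPoint.
    return (float(model.predict(point.features)), point.label)

Calling model.clearThreshold() beforehand, as in Example #8, would turn the 0/1 predictions into raw margins and give a more informative ROC curve.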
Example #21
    #     ct = get_contingency_table(binarySvm, test_bm25_doc_index, section)
    #     contingency_tables["bm25"][section] = ct

    test_rf_postings = test_tf_postings.mapValues(get_rf_postings)
    test_rf_doc_index = create_doc_index(test_rf_postings, term_dictionary)
    ct = get_contingency_table(binarySvm, test_rf_doc_index, section)
    contingency_tables["rf"][section] = ct

    test_tf_rf_postings = test_tf_postings.mapValues(get_tf_rf_postings)
    test_tf_rf_doc_index = create_doc_index(test_tf_rf_postings, term_dictionary)
    ct = get_contingency_table(binarySvm, test_tf_rf_doc_index, section)
    contingency_tables["tf-rf"][section] = ct
import cPickle as pickle

for clss in classes:
    binarySvm = SVMModel.load(sc, model_output + "tf" + "_" + clss + "_model.svm")

    ct = get_contingency_table(binarySvm, test_tf_doc_index, clss)
    contingency_tables["tf"][clss] = ct
    ct = get_contingency_table(binarySvm, test_tf_id_doc_index, clss)
    contingency_tables["tf-idf"][clss] = ct
    ct = get_contingency_table(binarySvm, test_bm25_doc_index, clss)
    contingency_tables["bm25"][clss] = ct

    test_rf_postings = test_tf_postings.mapValues(get_rf_postings)
    test_rf_doc_index = create_doc_index(test_rf_postings, term_dictionary)
    ct = get_contingency_table(binarySvm, test_rf_doc_index, clss)
    contingency_tables["rf"][clss] = ct

    test_tf_rf_postings = test_tf_postings.mapValues(get_tf_rf_postings)
    test_tf_rf_doc_index = create_doc_index(test_tf_rf_postings, term_dictionary)
Example #22
File: SVM.py  Project: bmewing/spark_vs_r
    if values[4] == 0:
        values[4] = 1
    else:
        values[4] = 0

    return LabeledPoint(values[4], values[0:4])  # dep_delay, cancelled, diverted, carrierdelay, weather delay, NASdelay, Security delay, LateAircraftdelay

#examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect()
parsedData = raw_data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])
startTime = datetime.now()

# Build the model
trainingData.cache ()
model = SVMWithSGD.train(trainingData, iterations=1)
print ('Training Time consumed = '), (datetime.now() - startTime)
startTestTime = datetime.now()

# Evaluating the model on test data
labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testData.count())
print ('Testing Time consumed = '), (datetime.now() - startTestTime)
print ('Time consumed = '), (datetime.now() - startTime)

print("Training Error = " + str(testErr))


# Save and load model
model.save(sc, "SVMNarrow95-08train")
sameModel = SVMModel.load(sc, "SVMNarrow95-08train")

Example #23
File: svm.py  Project: manhcompany/reishi
from pyspark import SparkContext
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint


def parsePoint(line):
    parsedData = [0 for i in range(45000)]
    splits = line.split(":")
    vectorSplit = splits[1].split(";")
    for vs in vectorSplit:
        vSplit = vs.split(" ")
        parsedData[int(vSplit[0])] = float(vSplit[1])
    return LabeledPoint(float(splits[0]), parsedData)


sc = SparkContext(appName="PythonSVMWithSGDExample")
data = sc.textFile("hdfs://localhost:8020/pyspark/vectorsvm")
parsedData = data.map(parsePoint)

model = SVMWithSGD.train(parsedData, iterations=100)

labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))

model.save(sc, "pythonSVMWithSGDModel")
sameModel = SVMModel.load(sc, "pythonSVMWithSGDModel")
Example #24
# $example on$
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonSVMWithSGDExample")

    # $example on$
    # Load and parse the data
    def parsePoint(line):
        values = [float(x) for x in line.split(" ")]
        return LabeledPoint(values[0], values[1:])

    data = sc.textFile("data/mllib/sample_svm_data.txt")
    parsedData = data.map(parsePoint)

    # Build the model
    model = SVMWithSGD.train(parsedData, iterations=100)

    # Evaluating the model on training data
    labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))

    # Save and load model
    model.save(sc, "target/tmp/pythonSVMWithSGDModel")
    sameModel = SVMModel.load(sc, "target/tmp/pythonSVMWithSGDModel")
    # $example off$
Example #25
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

if __name__ == "__main__":
    sc = SparkContext(appName="SVMTicTac")

    # Parse the data and create LabeledPoints
    def parsePoint(line):
        values = [float(x) for x in line.split(' ')]
        # The last value is the target; the preceding values are the
        # attributes for the classifier
        return LabeledPoint(values[9], values[0:8])

    # Load the data
    data = sc.textFile("data/mllib/sample_traindata_tic_tac.txt")
    parsedData = data.map(parsePoint)

    # Build the model using SVM with SGD
    model = SVMWithSGD.train(parsedData, iterations=100)

    # Evaluating the model on training data
    predict_model = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = predict_model.filter(lambda (v, p): v != p).count() / float(parsedData.count())
    
    # Print the training error
    print("Training Error = " + str(trainErr))

    # Save and load model
    model.save(sc, "target/tmp/pythonTicTacSGD")
    sameModel = SVMModel.load(sc, "target/tmp/pythonTicTacSGD")
Example #26
def main(sc, sqlContext):

    #start = timer()

    #print '---Fetching user, posts, tokens and categories from MongoDB---'
    #start_i = timer()
    user = findUserById(iduser)
    posts = findPosts(user) 
    
    tokens, category, categoryAndSubcategory = getTokensAndCategories()
    postsRDD = (sc.parallelize(posts).map(lambda s: (s[0], word_tokenize(s[1].lower()), s[2], s[3]))
                    .map(lambda p: (p[0], [x for x in p[1] if x in tokens] ,p[2], p[3]))
                    .cache())

    

    #print '####took %d seconds' % (timer() - start_i)

    #print '---Fetching products from MongoDB---'
    #start_i = timer()

    #print '####took %d seconds' % (timer() - start_i)

    #print '---Creating corpusRDD---'
    #start_i = timer()
    stpwrds = stopwords.words('portuguese')
    corpusRDD = (postsRDD.map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3]))
                         .filter(lambda x: len(x[1]) >= 20 or (x[2] == u'Post' and len(x[1])>0))
                         .cache())
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Computing TF-IDF---'
    #start_i = timer()
    wordsData = corpusRDD.map(lambda s: Row(label=int(s[0]), words=s[1], type=s[2]))
    wordsDataDF = sqlContext.createDataFrame(wordsData).unionAll(sqlContext.read.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"))


    numTokens = len(tokens)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)

    idfModel = idf.fit(featurizedData)
    tfIDF = idfModel.transform(featurizedData).cache()

    postTFIDF = (tfIDF
                    .filter(tfIDF.type==u'Post')
                    #.map(lambda s: Row(label=s[0], type=s[1], words=s[2], rawFeatures=s[3], features=s[4], sentiment=SVM.predict(s[4])))
                    .cache())

    #postTFIDF = postTFIDF.filter(lambda p: p.sentiment == 1)
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Loading model---'
    #start_i = timer()
    NB = NaiveBayesModel.load(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    SVM = SVMModel.load(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")
    #print '####took %d seconds' % (timer() - start_i)

    #print '---Using the model---'
    #start_i = timer()
    predictions = (postTFIDF
                        .map(lambda p: (NB.predict(p.features), p[0], SVM.predict(p.features)))
                        .filter(lambda p: p[2]==1)
                        .map(lambda p: (p[0], p[1]))
                        .groupByKey()
                        .mapValues(list)
                        .collect())

    #print '####took %d seconds' % (timer() - start_i)
    #print '---Computing similarities---'
    #start_i = timer()
    suggestions = []

    for prediction in predictions:
        category_to_use = category[int(prediction[0])]
        #print ' Computing similarities for category: {}'.format(category_to_use)
        tf = tfIDF.filter(tfIDF.type==category_to_use).cache()
        for post in prediction[1]:
            postVector = postTFIDF.filter(postTFIDF.label == post).map(lambda x: x.features).collect()[0]
            sim = (tf
                    .map(lambda x: (post, x.label, cossine(x.features, postVector)))
                    .filter(lambda x: x[2]>=threshold)
                    .collect())
            if len(sim) > 0:
                suggestions.append(sim)

    #print '####took %d seconds' % (timer() - start_i)

    if len(suggestions) > 0:
        #print '---Inserting recommendations into MongoDB---'
        #start_i = timer()
        insertSuggestions(suggestions, iduser, posts)
Example #27
console.setFormatter(formatter)
# add the handler to the root logger
logging.getLogger('').addHandler(console)
# define two more loggers
logger_server = logging.getLogger('Server')




# load Model.
## LogisticRegressionModel
LR_First_Model = LogisticRegressionModel.load(sc, LR_Layer1)
LR_Second_Model = LogisticRegressionModel.load(sc, LR_Layer2)
LR_Third_Model = LogisticRegressionModel.load(sc, LR_Layer3)

# Mold_LR_First_Model= LogisticRegressionModel.load(sc, Mold_LR_Layer1)
# Mold_LR_Second_Model = LogisticRegressionModel.load(sc, Mold_LR_Layer2)
# Mold_LR_Third_Model = LogisticRegressionModel.load(sc, Mold_LR_Layer3)
## SVMModel
SVM_First_Model = SVMModel.load(sc, SVM_Layer1)
SVM_Second_Model = SVMModel.load(sc, SVM_Layer2)
SVM_Third_Model = SVMModel.load(sc, SVM_Layer3)

Mold_SVM_First_Model = SVMModel.load(sc, Mold_SVM_Layer1)
# Mold_SVM_Second_Model = SVMModel.load(sc, Mold_SVM_Layer2)
# Mold_SVM_Third_Model = SVMModel.load(sc, Mold_SVM_Layer3)

## Random forset
Random_Forest_Model = RandomForestModel.load(sc,Random_Forest)
Mold_Random_Forest_Model = RandomForestModel.load(sc,Mold_Random_Forest)