Python SVMModel.load примеры, pyspark.mllib.classification.SVMModel.load Python примеры использования

Пример #1

0

Показать файл

def svm_second_predict(spark_session, svm_model_path, df, condition):
    """
    支持向量机二分类预测
    :param spark_session: spark 会话
    :param svm_model_path: 模型地址
    :param df: 数据
    :param condition: {"features": [12, 13, 14, 15], "label": "label"}
    特征列
    :return: 预测结果 sparkframe
    """
    feature_indexs = condition['features']
    label_index = condition['label']
    if label_index is None or label_index == "":  # 无标签列
        # 1. 准备数据
        def func(x):
            features_data = []
            for feature in feature_indexs:
                features_data.append(x[feature])
            return features_data

        predict_data = df.rdd.map(lambda x: func(x))
        print(predict_data.take(10))

        # 2.加载模型
        svm_model = SVMModel.load(spark_session.sparkContext, svm_model_path)

        # 3.预测
        def f(x):
            return {"prediction_result": x}

        prediction_rdd = svm_model.predict(predict_data)
        print(prediction_rdd.take(10))
        prediction_df = prediction_rdd.map(lambda x: Row(**f(x))).toDF()
        return prediction_df
    else:  # 有标签列
        # 1. 准备数据
        def func(x):
            features_data = []
            for feature in feature_indexs:
                features_data.append(x[feature])
            return LabeledPoint(label=x[label_index], features=features_data)

        predict_label_data = df.rdd.map(lambda x: func(x))
        print(predict_label_data.take(10))

        # 2.加载模型
        svm_model = SVMModel.load(spark_session.sparkContext, svm_model_path)

        # 3.预测
        from pyspark.sql.types import Row

        def f(x):
            return {"prediction_result": x[0], label_index: x[1]}

        prediction_rdd = predict_label_data.map(
            lambda x: (svm_model.predict(x.features), x.label))
        print(prediction_rdd.take(10))
        prediction_df = prediction_rdd.map(lambda x: Row(**f(x))).toDF()
        return prediction_df

Пример #2

0

Показать файл

Файл: composition_prediction_system.py Проект: WarnWang/Dissertation

    def load_parameters(self):
        self.amount_prediction_method = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL,
                                                                 file_name='amount_method')
        self.trend_prediction_method = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL,
                                                                file_name='trend_method')
        self.data_features = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='features')
        self.stock_symbol = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='symbol')
        self.data_parser = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='data_parser')
        amount_model_path = os.path.join(os.path.abspath(self.model_path), 'amount_model')
        trend_model_path = os.path.join(os.path.abspath(self.model_path), 'trend_model')

        if self.amount_prediction_method == self.RANDOM_FOREST:
            amount_model = RandomForestModel.load(sc=self.sc, path=amount_model_path)
        elif self.amount_prediction_method == self.LINEAR_REGRESSION:
            amount_model = LinearRegressionModel.load(sc=self.sc, path=amount_model_path)
        else:
            amount_model = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='amount_model')

        if self.trend_prediction_method == self.RANDOM_FOREST:
            trend_model = RandomForestModel.load(sc=self.sc, path=trend_model_path)
        elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
            trend_model = LogisticRegressionModel.load(sc=self.sc, path=trend_model_path)
        elif self.trend_prediction_method == self.NAIVE_BAYES:
            trend_model = NaiveBayesModel.load(sc=self.sc, path=trend_model_path)
        elif self.trend_prediction_method == self.SVM:
            trend_model = SVMModel.load(sc=self.sc, path=trend_model_path)
        else:
            trend_model = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='trend_model')

        return trend_model, amount_model

Пример #3

0

Показать файл

def test_all(config):
    test_feature_path = config['root_directory'] + config[
        'feature_directory'] + config['test_all_feature_filename']
    model_path = config['root_directory'] + config['one_vs_all_model_directory']
    label_path = config['root_directory'] + config['label_directory'] + config[
        'one_vs_all_label_filename']

    if not os.path.exists(test_feature_path):
        print('No feature for all test')
        sys.exit(-1)

    if not os.path.exists(model_path):
        print('No model for all test')
        sys.exit(-1)

    if not os.path.exists(label_path):
        print('No label for all test')
        sys.exit(-1)

    # Load test data
    data = sc.textFile(test_feature_path)
    parsed_data = data.map(make_labeled_point)
    # Load the model
    model = SVMModel.load(sc, model_path)
    # Try the model against test data
    labels_and_preds = parsed_data.map(lambda p:
                                       (p.label, model.predict(p.features)))
    test_err = labels_and_preds.filter(
        lambda lp: lp[0] != lp[1]).count() / float(parsed_data.count())
    print("Test Error = " + str(test_err * 100) + "%")

Пример #4

0

Показать файл

Файл: animals_identification_emr.py Проект: plawson/distributed_learning

def do_1vsall(class_all, size, num_iter, config):
    features_path = config['protocol'] + config['bucket'] + config['sep'] + config['features_key']
    print('do_1vsall ==============> Setting RDD_ALL')
    rdd_all = sc.textFile(features_path, minPartitions=4).map(lambda line: line.split(',')).persist()
    print('do_1vsall ==============> Setting RDD_TRAIN_SET')
    rdd_train_set = rdd_all.filter(lambda features: int(features[1]) <= size) \
        .map(lambda features: ['0.0' if features[0] == class_all else '1.0'] + features[2:]) \
        .map(make_labeled_point)

    print('do_1vsall ==============> Setting RDD_TEST_SET')
    rdd_test_set = rdd_all.filter(lambda features: size < int(features[1])) \
        .map(lambda features: ['0.0' if features[0] == class_all else '1.0'] + features[2:]) \
        .map(make_labeled_point)

    # Build the model
    model_dir = class_all + '_' + str(size) + '_' + str(num_iter)
    model_s3_file = config['model_key'] + config['sep'] + model_dir
    model = None
    if s3_object_exists(config['bucket'], model_s3_file):
        print('do_1vsall ==============> Loading SVM Model: {}...'.format(model_s3_file))
        model = SVMModel.load(sc, config['protocol'] + config['bucket'] + config['sep'] + model_s3_file)
    else:
        print('do_1vsall ==============> Building SVM Model')
        model = SVMWithSGD.train(rdd_train_set, iterations=num_iter)
        print('do_1vsall ==============> Saving SVM Model: {}...'.format(model_s3_file))
        model.save(sc, config['protocol'] + config['bucket'] + config['sep'] + model_s3_file)

    # Evaluate the model on th test data
    print('do_1vsall ==============> Evaluating test set')
    labels_and_preds = rdd_test_set.map(lambda p: (p.label, model.predict(p.features)))
    train_err = labels_and_preds.filter(lambda lp: lp[0] != lp[1]).count() / float(rdd_test_set.count())
    # print("Test Error = " + str(train_err))
    success = round(((1 - train_err) * 100), 2)
    print('{},{}'.format(str(size), str(success)))
    return size, success

Пример #5

0

Показать файл

Файл: fraud.py Проект: NeroJ/Kaggle-Competition

 def Prediction(self, modelType):
     data_point = self.Features
     if modelType == 'RF':
         model = RandomForestModel.load(
             self.sc, self.baseDir + '/fraudModel/Model/' + modelType)
         result = np.array(
             model.predict(self.sc.parallelize(data_point)).collect())
         self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result)
     elif modelType == 'GBDT':
         model = GradientBoostedTreesModel.load(
             self.sc, self.baseDir + '/fraudModel/Model/' + modelType)
         result = np.array(
             model.predict(self.sc.parallelize(data_point)).collect())
         self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result)
     elif modelType == 'LRsgd':
         model = LogisticRegressionModel.load(
             self.sc, self.baseDir + '/fraudModel/Model/' + modelType)
         result = np.array(
             model.predict(self.sc.parallelize(data_point)).collect())
         self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result)
     elif modelType == 'LRlbfgs':
         model = LogisticRegressionModel.load(
             self.sc, self.baseDir + '/fraudModel/Model/' + modelType)
         result = np.array(
             model.predict(self.sc.parallelize(data_point)).collect())
         self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result)
     elif modelType == 'SVM':
         model = SVMModel.load(
             self.sc, self.baseDir + '/fraudModel/Model/' + modelType)
         result = np.array(
             model.predict(self.sc.parallelize(data_point)).collect())
         self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result)
     else:
         pass

Пример #6

0

Показать файл

Файл: rpc_feature_server.py Проект: amplab/clipper-v0

 def __init__(self, path):
     conf = SparkConf() \
         .setAppName("crankshaw-pyspark") \
         .set("spark.executor.memory", "2g") \
         .set("spark.kryoserializer.buffer.mb", "128") \
         .set("master", "local")
     sc = SparkContext(conf=conf, batchSize=10)
     self.model = SVMModel.load(sc, path)
     self.path = path
     print("started spark")

Пример #7

0

Показать файл

Файл: engine.py Проект: sukrit-uba/spark-mllib-deployment-iris-example

    def __init__(self, sc):
        logger.info("Starting up the Classification Engine..")
        self.sc = sc

        #load data
        logger.info("Loading up data..")
        iris_data_raw_RDD = self.sc.textFile("/home/hduser/iris-data.txt")
        self.iris_data_parsed_RDD = iris_data_raw_RDD.map(self.parsePoint)
        #self.train_model()
        self.model = SVMModel.load(self.sc,
                                   "/home/hduser/pythonSVMWithSGDModel")

Пример #8

0

Показать файл

Файл: predictAPI.py Проект: ANTPHAM/Image_classification_AWS_Spark

def load_model():
    # load the pre-trained Keras model (here we are using a model
    # pre-trained on ImageNet and provided by Keras, but you can
    # substitute in your own networks just as easily)
    global Keras_model
    base_model = VGG16(weights='imagenet')
    global graph
    graph = tf.get_default_graph()
    # Model will produce the output of the 'fc2'layer which is the penultimate neural network layer
    Keras_model = Model(inputs=base_model.input,
                        outputs=base_model.get_layer('fc2').output)
    global Svm_model
    Svm_model = SVMModel.load(sc, Svm_model_path)

Пример #9

0

Показать файл

Файл: create_train_n_test_sets.py Проект: plawson/distributed_learning

def main():
    i = 0
    args = sys.argv[1:]
    if len(args) != 4 and len(args) != 1:
        usage(sys.argv[0])
        sys.exit(-1)

    input_dir = ""
    size = 0
    test_data = False
    while i < len(args):
        if args[i] == "--dir":
            i += 1
            input_dir = args[i]
            if len(input_dir) == 0:
                print("Input directory name is required")
                sys.exit(-1)
        elif args[i] == "--size":
            i += 1
            if not args[i].isdigit():
                print("Train size by class ({}) must be a number".format(
                    args[i]))
                sys.exit(-1)
            else:
                size = int(args[i])
        elif args[i] == "--test":
            test_data = True

        i += 1

    if size > 0:

        train_set, test_set = split_sets(input_dir, "*.json", size)

        create_features(train_set, test_set, input_dir)

        create_model()

    if test_data:
        # Load test data
        data = sc.textFile('./features/test_features.txt')
        parsed_data = data.map(make_labeled_point)
        # Load the model
        model = SVMModel.load(sc, './model/ImageRecognitionModel')
        # Try the model against test data
        labels_and_preds = parsed_data.map(
            lambda p: (p.label, model.predict(p.features)))
        test_err = labels_and_preds.filter(
            lambda lp: lp[0] != lp[1]).count() / float(parsed_data.count())
        print("Test Error = " + str(test_err * 100) + "%")

Пример #10

0

Показать файл

Файл: svm_test.py Проект: feng1008/spark

def main(sc):
    train_data='/usr/local/spark/data/mllib/sample_svm_data.txt'
    data=sc.textFile(train_data).map(parse)
    
    if os.path.exists('model'):
        model=SVMModel.load(sc, 'model')
    else:
        model=SVMWithSGD.train(data, iterations=100)
        model.save(sc, 'model')

    labelsAndPreds=data.map(lambda p: (p.label, model.predict(p.features)))

    # trainErr=labelsAndPreds.filter(lambda (v, p): v != p).count() / float(data.count())
    # print('Training Error ='  + str(trainErr))

    labelsAndPreds.map(lambda x:str(x[0])+'\t'+str(x[1])).saveAsTextFile('labelsAndPreds')

Пример #11

0

Показать файл

def svm_second_evaluation(spark_session, svm_model_path, df, predict_condition,
                          condition):
    """
    svm二分类评估
    :param spark_session:
    :param svm_model_path: 模型地址
    :param df: 预测数据
    :param predict_condition: 预测算子（父算子）配置
    :param condition: 该算子配置 {"label":"标签"}
    :return:
    """

    feature_indexs = predict_condition['features']
    label = condition['label']

    # 1. 准备数据
    def func(x):
        features_data = []
        for feature in feature_indexs:
            features_data.append(x[feature])
        return LabeledPoint(label=x[label], features=features_data)

    predict_data = df.rdd.map(lambda x: func(x))

    # 加载模型
    svm_model = SVMModel.load(spark_session.sparkContext, svm_model_path)

    # 计算评估指标
    svmTotalCorrect = predict_data.map(lambda r: 1 if (svm_model.predict(
        r.features) == r.label) else 0).reduce(lambda x, y: x + y)
    svmAccuracy = svmTotalCorrect / float(predict_data.count())

    # 清除默认阈值，这样会输出原始的预测评分，即带有确信度的结果
    svm_model.clearThreshold()
    svmPredictionAndLabels = predict_data.map(
        lambda lp: (float(svm_model.predict(lp.features)), lp.label))
    svmMetrics = BinaryClassificationMetrics(svmPredictionAndLabels)
    print("Area under PR = %s" % svmMetrics.areaUnderPR)
    print("Area under ROC = %s" % svmMetrics.areaUnderROC)

    # 返回数据
    result = [("正确个数", float(svmTotalCorrect)), ("精准度", float(svmAccuracy)),
              ("Area under PR", float(svmMetrics.areaUnderPR)),
              ("Area under ROC", float(svmMetrics.areaUnderROC))]
    return spark_session.createDataFrame(result, schema=['指标', '值'])

Пример #12

0

Показать файл

def test_one_vs_one(config, class1, class2):
    test_feature_path = config['root_directory'] + config[
        'feature_directory'] + config['test_one_feature_filename']
    model_path = config['root_directory'] + config['one_vs_one_model_directory']
    label_path = config['root_directory'] + config['label_directory'] + config[
        'one_vs_one_label_filename']

    if not os.path.exists(test_feature_path):
        print('No feature for all test')
        sys.exit(-1)

    if not os.path.exists(model_path):
        print('No model for all test')
        sys.exit(-1)

    if not os.path.exists(label_path):
        print('No label for all test')
        sys.exit(-1)

    with open(label_path, "r") as label_file:
        labels = dict(json.load(label_file))

    label_values = labels.values()

    if not any(x == class1
               for x in label_values) or not any(x == class2
                                                 for x in label_values):
        print("No labels for {} and {}.".format(class1, class2))
        sys.exit(-1)

    # Load test data
    data = sc.textFile(test_feature_path)
    parsed_data = data.map(make_labeled_point)
    # Load the model
    model = SVMModel.load(sc, model_path)
    # Try the model against test data
    labels_and_preds = parsed_data.map(lambda p:
                                       (p.label, model.predict(p.features)))
    test_err = labels_and_preds.filter(
        lambda lp: lp[0] != lp[1]).count() / float(parsed_data.count())
    print("Test Error = " + str(test_err * 100) + "%")

Пример #13

0

Показать файл

Файл: SVM.py Проект: Neil399399/Spark-2017CPS

def SVM_function(rdd,sc,method):
    #method
    from pyspark.mllib.classification import SVMModel
    print("rdd map")
    if method =='TimeDomain':
        output = TimeDomain(rdd)
        testData = sc.parallelize([output])

    if method =='FrequencyDomain':
        output=frequencyDomain(rdd)
        testData=sc.parallelize([output])

    
    #load model
    print("load model")
    Model = SVMModel.load(sc,"hdfs:///home/spark/Desktop/"+method+"Model")
#------------------------------------------------------------#
    #input data and prediction
    print("labelsAndPreds")
    labelsAndPreds = Model.predict(testData)
    return labelsAndPreds.collect()

Пример #14

0

Показать файл

def check_image(config, json_file):
    model_path = config['root_directory'] + config['one_vs_all_model_directory']
    label_path = config['root_directory'] + config['label_directory'] + config[
        'one_vs_all_label_filename']

    if not os.path.exists(model_path):
        print('No model with the full set')
        sys.exit(-1)

    if not os.path.exists(label_path):
        print('No label for with the full set')
        sys.exit(-1)

    with open(label_path, "r") as label_file:
        labels = dict(json.load(label_file))

    # inv_labels = {v: k for k, v in labels.items()}
    """
    rdd = sc.textFile(json_file)\
        .map(lambda line: ((line[1:])[:-1]))\
        .flatMap(make_float_list)

    model = SVMModel.load(sc, model_path)

    label = model.predict(rdd)
    """

    with open(json_file) as features_file:
        features = json.load(features_file)

    lp = make_label_point_from_list(",".join(map(str, features)))
    model = SVMModel.load(sc, model_path)
    label = model.predict(lp.features)

    print("=" * 60)
    print("Image features file: {}".format(json_file))
    print("Label: {}".format(label))
    print("Prediction: {}".format(labels[str(label)]))
    print("=" * 60)

Пример #15

0

Показать файл

Файл: train_svm.py Проект: pfattahi/AIRLINE_RANKINGS

def load_svm(sc):

    model= SVMModel.load(sc, "target1/tmp/pythonSVMWithSGDModel")
    return model

Пример #16

0

Показать файл

Файл: spark_generate_index.py Проект: marawanokasha/thesis-benchmark

    #     ct = get_contingency_table(binarySvm, test_bm25_doc_index, section)
    #     contingency_tables["bm25"][section] = ct

    test_rf_postings = test_tf_postings.mapValues(get_rf_postings)
    test_rf_doc_index = create_doc_index(test_rf_postings, term_dictionary)
    ct = get_contingency_table(binarySvm, test_rf_doc_index, section)
    contingency_tables["rf"][section] = ct

    test_tf_rf_postings = test_tf_postings.mapValues(get_tf_rf_postings)
    test_tf_rf_doc_index = create_doc_index(test_tf_rf_postings, term_dictionary)
    ct = get_contingency_table(binarySvm, test_tf_rf_doc_index, section)
    contingency_tables["tf-rf"][section] = ct
import cPickle as pickle

for clss in classes:
    binarySvm = SVMModel.load(sc, model_output + "tf" + "_" + clss + "_model.svm")

    ct = get_contingency_table(binarySvm, test_tf_doc_index, clss)
    contingency_tables["tf"][clss] = ct
    ct = get_contingency_table(binarySvm, test_tf_id_doc_index, clss)
    contingency_tables["tf-idf"][clss] = ct
    ct = get_contingency_table(binarySvm, test_bm25_doc_index, clss)
    contingency_tables["bm25"][clss] = ct

    test_rf_postings = test_tf_postings.mapValues(get_rf_postings)
    test_rf_doc_index = create_doc_index(test_rf_postings, term_dictionary)
    ct = get_contingency_table(binarySvm, test_rf_doc_index, clss)
    contingency_tables["rf"][clss] = ct

    test_tf_rf_postings = test_tf_postings.mapValues(get_tf_rf_postings)
    test_tf_rf_doc_index = create_doc_index(test_tf_rf_postings, term_dictionary)

Пример #17

0

Показать файл

Файл: tic_tac_solution_exercise.py Проект: CodeChix-OpenSource/CodeChix-Technical-Curriculums

from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

if __name__ == "__main__":
    sc = SparkContext(appName="SVMTicTac")

    # Parse the data and create LabeledPoints
    def parsePoint(line):
        values = [(x) for x in line.split(' ')]
        # Last row contains the target data and rest of
        # the rows define the attributes for linear regression
        return LabeledPoint(values[9], values[0:8])

    # Load the data
    data = sc.textFile("data/mllib/sample_traindata_tic_tac.txt")
    parsedData = data.map(parsePoint)

    # Build the model using SVD
    model = SVMWithSGD.train(parsedData, iterations=100)

    # Evaluating the model on training data
    predict_model = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = predict_model.filter(lambda (v, p): v != p).count() / float(parsedData.count())
    
    # Print Mean Squared Error
    print("Training Error = " + str(trainErr))

    # Save and load model
    model.save(sc, "target/tmp/pythonTicTacSGD")
    sameModel = SVMModel.load(sc, "target/tmp/pythonTicTacSGD")

Пример #18

0

Показать файл

Файл: svm_spark.py Проект: cmantas/asap.cslab

def classify_with_model(input_data_path, model_file_path):
    input_parsed = sc.textFile(input_data_path).map(parse_point)
    model = SVMModel.load(sc, model_file_path)
    labels = input_parsed.map(lambda p: model.predict(p.features))
    labels.saveAsTextFile("predictions")

Пример #19

0

Показать файл

    spark = SparkSession.builder.appName("downsample").getOrCreate()

    lines = spark.read.csv("s3://daen-cyber/filteredSource/only5s",header=True)
    lines = lines.select([c for c in lines.columns if c not in {'ip','maxScore','minScore','avgScore','trendUp','trendDown','trueCount','dataSetCount','mostCommonCustomerHit'}])

    #for 5s test we dont care about label, just use 1.0
    def labeledPointConverter(row):
        try:
            return LabeledPoint(1.0, row[1:])
        except ValueError:
            return LabeledPoint(50.0,[1.0])

    parsedData = lines.rdd.map(lambda x: labeledPointConverter(x))
    parsedData = parsedData.filter(lambda x: x.label != 50.0)
    parsedData.cache()
    model = SVMModel.load(sc, "s3://daen-cyber/models/no5sSvmModel0")
    preds = parsedData.map(lambda p: model.predict(p.features))
    parsedData.unpersist()
    preds.cache()

    below5 = preds.filter(lambda p: p == 0.0).count()
    above5 = preds.filter(lambda p: p == 1.0).count()

    listToOutput = []
    listToOutput = listToOutput + [("Above 5", str(above5))]
    listToOutput = listToOutput + [("Below5", str(below5))]

    listToOutputRDD = sc.parallelize(listToOutput, 1)\
         .saveAsTextFile("s3://daen-cyber/models/only5sSvmResults0")

    #model.save(sc, "s3://daen-cyber/modelsb/no5sSvmModel0")

Пример #20

0

Показать файл

Файл: tic_tac_solution_exercise.py Проект: arangas27/CodeChix-Technical-Curriculums

if __name__ == "__main__":
    sc = SparkContext(appName="SVMTicTac")

    # Parse the data and create LabeledPoints
    def parsePoint(line):
        values = [(x) for x in line.split(' ')]
        # Last row contains the target data and rest of
        # the rows define the attributes for linear regression
        return LabeledPoint(values[9], values[0:8])

    # Load the data
    data = sc.textFile("data/mllib/sample_traindata_tic_tac.txt")
    parsedData = data.map(parsePoint)

    # Build the model using SVD
    model = SVMWithSGD.train(parsedData, iterations=100)

    # Evaluating the model on training data
    predict_model = parsedData.map(lambda p:
                                   (p.label, model.predict(p.features)))
    trainErr = predict_model.filter(lambda (v, p): v != p).count() / float(
        parsedData.count())

    # Print Mean Squared Error
    print("Training Error = " + str(trainErr))

    # Save and load model
    model.save(sc, "target/tmp/pythonTicTacSGD")
    sameModel = SVMModel.load(sc, "target/tmp/pythonTicTacSGD")

Пример #21

0

Показать файл

from pyspark.mllib.regression import LabeledPoint

conf = SparkConf().setAppName('Linear Support Vector Machines').setMaster(
    'local[2]')
sc = SparkContext(conf=conf)


# load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:])


data = sc.textFile('../data/sample_svm_data.txt')
parseData = data.map(parsePoint)

# build the model
model = SVMWithSGD.train(parseData, iterations=100)

# evaluating the model on training data
labelsAndPreds = parseData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(
    parseData.count())

print('training error :' + str(trainErr))

# save and load model
model.save(sc, '../model/pythonSVMWithSGDModel')
sameModel = SVMModel.load(sc, '../model/pythonSVMWithSGDModel')

sc.stop()

Пример #22

0

Показать файл

Файл: SVM.py Проект: bmewing/spark_vs_r

                        values[7]=1;
        else:
                        values[7]=0;

	return LabeledPoint(values[7], values[0:7]) #dep_delay, cancelled, diverted, carrierdelay, weather delay, NASdelay, Security delay, LateAircraftdelay

#examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect()
parsedData = raw_data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])
startTime = datetime.now()

# Build the model
trainingData.cache ()
model = SVMWithSGD.train(trainingData, iterations=1)
print ('Training Time consumed = '), (datetime.now() - startTime)
startTestTime = datetime.now()

# Evaluating the model on test data
labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testData.count())
print ('Testing Time consumed = '), (datetime.now() - startTestTime)
print ('Time consumed = '), (datetime.now() - startTime)

print("Training Error = " + str(testErr))


# Save and load model
model.save(sc, "SVMwide00-08train")
sameModel = SVMModel.load(sc, "SVMwide00-08train")

Пример #23

0

Показать файл

# %%
lr_predictions = test.map(lambda line:
                          (line[0], line[1], float(lr_model.predict(line[3]))))
lr_predictions.coalesce(1).toDF().write.options(header="true").csv(
    "hdfs://node1:9000/user/root/exp4/predictions/lr_predictions.csv")

# %% [markdown]
# 日期:2020-12-20 14:08:52 排名: 无
# score:0.5015744
# %% [markdown]
# ## SVM

# %%
from pyspark.mllib.classification import SVMModel
svm_model = SVMModel.load(
    sc, "hdfs://node1:9000/user/root/exp4/models/SVMWithSGDModel")

# %%
svm_predictions = test.map(
    lambda line: (line[0], line[1], float(svm_model.predict(line[3]))))
svm_predictions.coalesce(1).toDF().write.options(header="true").csv(
    "hdfs://node1:9000/user/root/exp4/predictions/svm_predictions.csv")

# %% [markdown]
# 日期:2020-12-20 14:18:59 排名: 无
# score:0.5156678
# %% [markdown]
# ## Gradient Boosted Trees

# %%
from pyspark.mllib.tree import GradientBoostedTreesModel

Пример #24

0

Показать файл

Файл: accuracysvm.py Проект: manhcompany/manhdoi

from pyspark import SparkContext
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

from document import DocumentSVM


sc = SparkContext(appName="SVM")
svmModel = SVMModel.load(sc, "../SVMModel")

def parsePoint(line):
    splits = line.split(':')
    doc = DocumentSVM(splits[0], splits[1])
    return doc.svmVec()

data = sc.textFile("hdfs://localhost:8020/user/manh/vectorsvmtest")

parsedData = data.map(parsePoint)

labelsAndPreds = parsedData.map(lambda p: (p.label, svmModel.predict(p.features)))

#0
for i in range(2):
	precision = labelsAndPreds.filter(lambda(v, p) : v == i and v == p).count() / float(labelsAndPreds.filter(lambda(v, p) : p == i).count())

	recall = labelsAndPreds.filter(lambda(v, p) : v == i and v == p).count() / float(labelsAndPreds.filter(lambda(v, p) : v == i).count())

	print("==================Precision c" + str(i) + " : " + str(precision))
	print("==================Recall c" + str(i) + " : " + str(recall))

accuracy = labelsAndPreds.filter(lambda (v, p): v == p).count() / float(parsedData.count())

Пример #25

0

Показать файл

Файл: spark-predict.py Проект: tomyedwab/mids-w205-project-tx

import datetime
import sys

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.mllib.classification import SVMModel

dataset_name = sys.argv[1]

sc = SparkContext("local", "Model Prediction", pyFiles=[])
sqlContext = SQLContext(sc)

# First load the model we saved in the model generation step
model = SVMModel.load(sc, "hdfs://hadoop:9000/models/noBikesAvailable.model")

# We also need the stats used to normalize the weather variables
stats_df = sqlContext.read.load("hdfs://hadoop:9000/models/weather-stats")
stats = stats_df.collect()[0]

# We want to produce output for each station
station_df = sqlContext.read.load("hdfs://hadoop:9000/station_data_schema")

print "Statistics: %s" % (stats,)

# Load the weather data

current_weather_csv = (
    sc.textFile("hdfs://hadoop:9000/current_weather/%s.csv" % dataset_name)
    .map(lambda line: line.split(","))

Пример #26

0

Показать файл

def main():
    #retrieve argument
    args = parse_arguments()
    main_directory = args.directory
    class1 = args.class1
    class2 = args.class2
    force_by_user = args.force
    if args.verbose:
        lg.basicConfig(level=lg.INFO)

    #Variables declaration
    result = []
    directory_feature = os.path.join(main_directory, "features", "*.json")
    nb_training_data_list = args.nb_training_data
    iteration_model_list = args.iteration_model

    lg.info('Features directory is %s', directory_feature)
    for iteration_model in iteration_model_list:
        for nb_training_data in nb_training_data_list:
            model_file = 'model_' + class1 + '_' + class2 + '_' + str(
                nb_training_data) + '_' + str(iteration_model)
            result_file = 'result_' + class1 + '_' + class2 + '_' + str(
                nb_training_data) + '_' + str(
                    iteration_model) + '_' + time.strftime(
                        "%Y%m%d%H%M%S") + '.json'
            model_pathname = os.path.join(main_directory, "models", model_file)

            is_model = False

            start_time = time.time()
            lg.info(
                '#################### Starting pet-classification ######################'
            )
            lg.info('Class 1 is %s', class1)
            lg.info('Class 2 is %s', class2)
            lg.info('Number of training datas is %s', nb_training_data)
            lg.info('Number of iterations model is %s', iteration_model)

            #persist a common rdd which is using by both training and testing datas
            common_rdd = sc.textFile(directory_feature, minPartitions=4)\
                           .filter(lambda line: line.split(', ')[0] in (class1, class2) or class2 == 'All')\
                           .persist()

            #Loading model if exists
            if is_model and not force_by_user:
                model = SVMModel.load(sc, model_pathname)
                lg.info('Found and load recorded model %s', model_file)
            else:
                lg.info('No recorded model found')
                #create training rdd and train model if no model found or force
                train_data_rdd = common_rdd.filter(lambda line: int(line.split(',')[1]) <= nb_training_data)\
                                           .map(lambda line: Row(label=0.0, features=line.split(', ')[2:])
                                                if line.split(', ')[0] == class1
                                                else Row(label=1.0, features=line.split(', ')[2:]))\
                                           .map(lambda line: LabeledPoint(line.label, line.features))

                lg.info('%s features for training datas',
                        train_data_rdd.count())
                lg.info('Start to training model')
                model = SVMWithSGD.train(train_data_rdd,
                                         iterations=iteration_model)
                lg.info('Training model terminated')

            training_time = time.time()
            training_duration = training_time - start_time
            #Create testing rdd
            test_data_rdd = common_rdd.filter(lambda line: int(line.split(', ')[1]) > nb_training_data)\
                              .map(lambda line: Row(label=0.0, features=line.split(', ')[2:])
                                   if line.split(', ')[0] == class1
                                   else Row(label=1.0, features=line.split(', ')[2:]))\
                              .map(lambda row: LabeledPoint(row.label, row.features))
            lg.info('%s features for test datas', test_data_rdd.count())

            # Evaluating the model on training data
            predictions = test_data_rdd.map(
                lambda row: (row.label, float(model.predict(row.features))))
            train_error = predictions.filter(lambda lp: lp[0] != lp[1]).count() \
                                             / float(predictions.count())
            lg.info('Test Error : %s', str(train_error))
            end_time = time.time()
            duration = end_time - start_time
            lg.info('Duration %s', str(duration))
            prediction_duration = end_time - training_time
            # #Save and dump result on S3
            result = {
                "class1": class1,
                "class2": class2,
                "nb_training_data": nb_training_data,
                "error": train_error,
                "iteration_model": iteration_model,
                "total_duration": duration,
                "training_duration": training_duration,
                "prediction_duration": prediction_duration
            }

            with open(result_file, 'w') as result_file:
                json.dump(result, result_file)

    lg.info(
        '#################### Ending pet-classification ######################'
    )
    input("press ctrl+c to exit")

Пример #27

0

Показать файл

Файл: SVM-narrow.py Проект: bmewing/spark_vs_r

	values = [float(x) for x in clean_line_split]
	if values[4] == 0:
                        values[4]=1;
        else:
                        values[4]=0;

	return LabeledPoint(values[4], values[0:3]) #dep_delay, cancelled, diverted, carrierdelay, weather delay, NASdelay, Security delay, LateAircraftdelay

#examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect()
parsedData = raw_data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])

startTime = datetime.now()
# Build the model
trainingData.cache ()
model = SVMWithSGD.train(trainingData, iterations=1)
print ('Training Time consumed = '), (datetime.now() - startTime)
startTestTime = datetime.now()
testData.cache()
# Evaluating the model on test data
labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testData.count())
print ('Testing Time consumed = '), (datetime.now() - startTestTime)
print ('Time consumed = '), (datetime.now() - startTime)

print("Training Error = " + str(testErr))

# Save and load model
model.save(sc, "SVMnarrow95-08")
sameModel = SVMModel.load(sc, "SVMnarrow95-08")

Пример #28

0

Показать файл

Файл: SVM.py Проект: bmewing/spark_vs_r

                        values[4]=1;
        else:
                        values[4]=0;

	return LabeledPoint(values[4], values[0:4]) #dep_delay, cancelled, diverted, carrierdelay, weather delay, NASdelay, Security delay, LateAircraftdelay

#examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect()
parsedData = raw_data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])
startTime = datetime.now()

# Build the model
trainingData.cache ()
model = SVMWithSGD.train(trainingData, iterations=1)
print ('Training Time consumed = '), (datetime.now() - startTime)
startTestTime = datetime.now()

# Evaluating the model on test data
labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testData.count())
print ('Testing Time consumed = '), (datetime.now() - startTestTime)
print ('Time consumed = '), (datetime.now() - startTime)

print("Training Error = " + str(testErr))


# Save and load model
model.save(sc, "SVMNarrow95-08train")
sameModel = SVMModel.load(sc, "SVMNarrow95-08train")

Пример #29

0

Показать файл

Файл: predict_single_file_generic.py Проект: superuser5/Resilient-ML-Research-Platform

        from pyspark.mllib.evaluation import BinaryClassificationMetrics
        from pyspark.mllib.tree import DecisionTree
        from pyspark.mllib.clustering import KMeans, KMeansModel, GaussianMixture, GaussianMixtureModel
        from pyspark.mllib.linalg import Vectors 
        
        SparkContext.setSystemProperty('spark.rdd.compress', config.get('spark', 'spark_rdd_compress'))
        SparkContext.setSystemProperty('spark.driver.maxResultSize', config.get('spark', 'spark_driver_maxResultSize'))
        SparkContext.setSystemProperty('spark.executor.memory', args.exe_memory)
        SparkContext.setSystemProperty('spark.cores.max', args.core_max)

        sc = SparkContext(args.sp_master, 'single_predict:'+str(args.row_id))
        flag_model = ml_opts['learning_algorithm']        
        save_dir = config.get('app', 'HADOOP_MASTER')+config.get('app', 'HDFS_MODEL_DIR')+'/'+row_id_str

        if flag_model == "linear_svm_with_sgd":
            mllib_model = SVMModel.load(sc, save_dir)
            col_num = len(mllib_model.weights)
        elif flag_model == "logistic_regression_with_lbfgs" or flag_model == "logistic_regression_with_sgd":
            mllib_model = LogisticRegressionModel.load(sc, save_dir)
            col_num = mllib_model.numFeatures # len(mllib_model.weights) return 3x value
        elif flag_model == "kmeans":
            mllib_model = KMeansModel.load(sc, save_dir)
            col_num =len(mllib_model.clusterCenters[0])
        else:
            print "ERROR: Training model selection error: no valid ML model selected!"
            return
        # get the model dimension
        #col_num = len(mllib_model.weights)
        print "INFO: total feature # in mllib model=",col_num

        # calculate hypothesis value ================

Пример #30

0

Показать файл

Файл: SVM.py Проект: bsangee/spark_vs_r

                        values[4]=1;
        else:
                        values[4]=0;

	return LabeledPoint(values[4], values[0:4]) #dep_delay, cancelled, diverted, carrierdelay, weather delay, NASdelay, Security delay, LateAircraftdelay

#examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect()
parsedData = raw_data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])
startTime = datetime.now()

# Build the model
trainingData.cache ()
model = SVMWithSGD.train(trainingData, iterations=1)
print ('Training Time consumed = '), (datetime.now() - startTime)
startTestTime = datetime.now()

# Evaluating the model on test data
labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testData.count())
print ('Testing Time consumed = '), (datetime.now() - startTestTime)
print ('Time consumed = '), (datetime.now() - startTime)

print("Training Error = " + str(testErr))


# Save and load model
model.save(sc, "SVMNarrow00-08train")
sameModel = SVMModel.load(sc, "SVMNarrow00-08train")

Пример #31

0

Показать файл

Файл: pet-classification.py Проект: sberton/distributed_learning

def main():
    #retrieve argument
    args = parse_arguments()
    main_directory = args.directory
    class1 = args.class1
    class2 = args.class2
    force_by_user = args.force
    if args.verbose:
        lg.basicConfig(level=lg.INFO)

    #Variables declaration
    result = []
    directory_feature = os.path.join(main_directory, "features", "*.json")
    nb_training_data = args.nb_training_data
    iteration_model = args.iteration_model
    min_partition = args.min_partition
    s3 = boto3.resource('s3')
    bucket = s3.Bucket('oc-calculdistribues-sberton')
    result_file = class1 + '_' + class2 + '_' + time.strftime(
        "%Y%m%d%H%M%S") + '.json'
    model_file = 'model_' + class1 + '_' + class2 + '_' + str(
        nb_training_data) + '_' + str(iteration_model)
    model_pathname = os.path.join(main_directory, "models", model_file)

    #Searching existing model and store existence in is_model boolean
    key = 'distributed_learning/models/' + model_file
    objs = list(bucket.objects.filter(Prefix=key))
    is_model = len(objs) > 0 and objs[0].key.startswith(key + '/')

    start_time = time.time()
    lg.info(
        '#################### Starting pet-classification ######################'
    )
    lg.info('Class 1 is %s', class1)
    lg.info('Class 2 is %s', class2)
    lg.info('Number of training datas is %s', nb_training_data)
    lg.info('Number of iterations model is %s', iteration_model)

    #persist a common rdd which is using by both training and testing datas
    common_rdd = sc.textFile(directory_feature, minPartitions=min_partition)\
                   .filter(lambda line: line.split(', ')[0] in (class1, class2) or class2 == 'All')\
                   .persist()

    #Loading model if exists
    if is_model and not force_by_user:
        model = SVMModel.load(sc, model_pathname)
        lg.info('Found and load recorded model %s', model_file)
    else:
        lg.info('No recorded model found')
        #create training rdd and train model if no model found or force
        train_data_rdd = common_rdd.filter(lambda line: int(line.split(',')[1]) <= nb_training_data)\
                                   .map(lambda line: Row(label=0.0, features=line.split(', ')[2:])
                                        if line.split(', ')[0] == class1
                                        else Row(label=1.0, features=line.split(', ')[2:]))\
                                   .map(lambda line: LabeledPoint(line.label, line.features))

        lg.info('%s features for training datas', train_data_rdd.count())
        lg.info('Start to training model')
        model = SVMWithSGD.train(train_data_rdd, iterations=iteration_model)
        lg.info('Training model terminated')

    training_time = time.time()
    training_duration = training_time - start_time
    #Create testing rdd
    test_data_rdd = common_rdd.filter(lambda line: int(line.split(', ')[1]) > nb_training_data)\
                      .map(lambda line: Row(label=0.0, features=line.split(', ')[2:])
                                           if line.split(', ')[0] == class1
                                           else Row(label=1.0, features=line.split(', ')[2:]))\
                      .map(lambda row: LabeledPoint(row.label, row.features))
    lg.info('%s features for test datas', test_data_rdd.count())

    # Evaluating the model on training data
    predictions = test_data_rdd.map(
        lambda row: (row.label, float(model.predict(row.features))))
    train_error = predictions.filter(lambda lp: lp[0] != lp[1]).count() \
                                     / float(predictions.count())
    lg.info('Test Error : %s', str(train_error))
    end_time = time.time()
    duration = end_time - start_time
    lg.info('Duration %s', str(duration))
    prediction_duration = end_time - training_time
    # #Save and dump result on S3
    result.append({
        "class1": class1,
        "class2": class2,
        "iteration_model": iteration_model,
        "nb_training_data": nb_training_data,
        "total_duration": duration,
        "train_duration": training_duration,
        "predict_duration": prediction_duration,
        "error": train_error
    })

    s3object = s3.Object('oc-calculdistribues-sberton', result_file)
    s3object.put(Body=(bytes(json.dumps(result, indent=2).encode('UTF-8'))))

    #Save model if not exists
    if not is_model:
        lg.info('Saving model at %s', model_file)
        model.save(sc, model_pathname)

    lg.info(
        '#################### Ending pet-classification ######################'
    )

Пример #32

0

Показать файл

Файл: svmBySpark.py Проект: xiaokugua250/PythonProject

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2015-11-23 20:18:03
# @Author  : Your Name ([email protected])
# @Link    : http://example.org
# @Version : $Id$
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:])

data = sc.textFile("data/mllib/sample_svm_data.txt")
parsedData = data.map(parsePoint)

# Build the model
model = SVMWithSGD.train(parsedData, iterations=100)

# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))

# Save and load model
model.save(sc, "myModelPath")
sameModel = SVMModel.load(sc, "myModelPath")

Пример #33

0

Показать файл

    #     contingency_tables["bm25"][section] = ct

    test_rf_postings = test_tf_postings.mapValues(get_rf_postings)
    test_rf_doc_index = create_doc_index(test_rf_postings, term_dictionary)
    ct = get_contingency_table(binarySvm, test_rf_doc_index, section)
    contingency_tables["rf"][section] = ct

    test_tf_rf_postings = test_tf_postings.mapValues(get_tf_rf_postings)
    test_tf_rf_doc_index = create_doc_index(test_tf_rf_postings,
                                            term_dictionary)
    ct = get_contingency_table(binarySvm, test_tf_rf_doc_index, section)
    contingency_tables["tf-rf"][section] = ct
import cPickle as pickle

for clss in classes:
    binarySvm = SVMModel.load(sc,
                              model_output + "tf" + "_" + clss + "_model.svm")

    ct = get_contingency_table(binarySvm, test_tf_doc_index, clss)
    contingency_tables["tf"][clss] = ct
    ct = get_contingency_table(binarySvm, test_tf_id_doc_index, clss)
    contingency_tables["tf-idf"][clss] = ct
    ct = get_contingency_table(binarySvm, test_bm25_doc_index, clss)
    contingency_tables["bm25"][clss] = ct

    test_rf_postings = test_tf_postings.mapValues(get_rf_postings)
    test_rf_doc_index = create_doc_index(test_rf_postings, term_dictionary)
    ct = get_contingency_table(binarySvm, test_rf_doc_index, clss)
    contingency_tables["rf"][clss] = ct

    test_tf_rf_postings = test_tf_postings.mapValues(get_tf_rf_postings)
    test_tf_rf_doc_index = create_doc_index(test_tf_rf_postings,

Пример #34

0

Показать файл

console.setFormatter(formatter)
# 加入 hander 到 root logger
logging.getLogger('').addHandler(console)
# 定義另兩個 logger
logger_server = logging.getLogger('Server')




# load Model.
## LogisticRegressionModel
LR_First_Model = LogisticRegressionModel.load(sc, LR_Layer1)
LR_Second_Model = LogisticRegressionModel.load(sc, LR_Layer2)
LR_Third_Model = LogisticRegressionModel.load(sc, LR_Layer3)

# Mold_LR_First_Model= LogisticRegressionModel.load(sc, Mold_LR_Layer1)
# Mold_LR_Second_Model = LogisticRegressionModel.load(sc, Mold_LR_Layer2)
# Mold_LR_Third_Model = LogisticRegressionModel.load(sc, Mold_LR_Layer3)
## SVMModel
SVM_First_Model = SVMModel.load(sc, SVM_Layer1)
SVM_Second_Model = SVMModel.load(sc, SVM_Layer2)
SVM_Third_Model = SVMModel.load(sc, SVM_Layer3)

Mold_SVM_First_Model = SVMModel.load(sc, Mold_SVM_Layer1)
# Mold_SVM_Second_Model = SVMModel.load(sc, Mold_SVM_Layer2)
# Mold_SVM_Third_Model = SVMModel.load(sc, Mold_SVM_Layer3)

## Random forset
Random_Forest_Model = RandomForestModel.load(sc,Random_Forest)
Mold_Random_Forest_Model = RandomForestModel.load(sc,Mold_Random_Forest)

Пример #35

0

Показать файл

Файл: svm_with_sgd_example.py Проект: littlebear-xbz/hadoop

from pyspark import SparkContext
# $example on$
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonSVMWithSGDExample")

    # $example on$
    # Load and parse the data
    def parsePoint(line):
        values = [float(x) for x in line.split(' ')]
        return LabeledPoint(values[0], values[1:])

    data = sc.textFile("/user/huting/testSet_SVM.txt")
    parsedData = data.map(parsePoint)

    # Build the model
    model = SVMWithSGD.train(parsedData, iterations=100)

    # Evaluating the model on training data
    labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))

    # Save and load model
    model.save(sc, "target/tmp/pythonSVMWithSGDModel")
    sameModel = SVMModel.load(sc, "target/tmp/pythonSVMWithSGDModel")
    # $example off$

Пример #36

0

Показать файл

ssc = StreamingContext(sc, 10)

kafka_configuration_params = {
    "topic": ["BigData"],
    "connectionstring": "localhost:9092"
}

from pyspark.streaming.kafka import KafkaUtils
directKafkaStream = KafkaUtils.createDirectStream(
    ssc, kafka_configuration_params["topic"],
    {"metadata.broker.list": kafka_configuration_params["connectionstring"]})

from pyspark.mllib.classification import SVMModel, LogisticRegressionModel, NaiveBayesModel

LR_model = LogisticRegressionModel.load(sc, "../../notebooks/LR_model")
SVM_model = SVMModel.load(sc, "../../notebooks/SVM_model")
NB_model = NaiveBayesModel.load(sc, "../../notebooks/NB_model")

import nltk
import random
from nltk.tokenize import word_tokenize

allowed_word_types = ["JJ"]

rdd_all_words = sc.textFile("../../notebooks/all_words/part-00000")
rdd_broadcast_all_words = sc.broadcast(rdd_all_words.collect())


def convert_tweet_to_instance(tweets):

    rdd_tweets = tweets.map( \

Пример #37

0

Показать файл

Файл: SVM.py Проект: bsangee/spark_vs_r

                        values[7]=1;
        else:
                        values[7]=0;

	return LabeledPoint(values[7], values[0:7]) #dep_delay, cancelled, diverted, carrierdelay, weather delay, NASdelay, Security delay, LateAircraftdelay

#examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect()
parsedData = raw_data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])
startTime = datetime.now()

# Build the model
trainingData.cache ()
model = SVMWithSGD.train(trainingData, iterations=1)
print ('Training Time consumed = '), (datetime.now() - startTime)
startTestTime = datetime.now()

# Evaluating the model on test data
labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testData.count())
print ('Testing Time consumed = '), (datetime.now() - startTestTime)
print ('Time consumed = '), (datetime.now() - startTime)

print("Training Error = " + str(testErr))


# Save and load model
model.save(sc, "SVMwide2008train")
sameModel = SVMModel.load(sc, "SVMwide2008train")

Пример #38

0

Показать файл

    parsed_data = MLUtils\
        .loadLibSVMFile(spark_context, "data/classificationdata.txt")\
        .cache()

    print("Parsed data size: " + str(parsed_data.count()))

    # Split initial RDD into two... [60% training data, 40% testing data]
    training, test = parsed_data\
        .randomSplit([0.6, 0.4], seed=3)

    print("Training points size: " + str(training.count()))
    print("Test points size    : " + str(test.count()))

    # Build the model
    model = SVMWithSGD.train(training, iterations=100)

    score_and_labels = test.map(lambda point: score_function(point, model))

    #for score, label in score_and_labels.collect():
    #    print("Score: %d, label: %f" % (score, label))

    # Get evaluation metrics
    metrics = BinaryClassificationMetrics(score_and_labels)
    auROC = metrics.areaUnderROC
    print("Area under ROC: %f" % auROC)

    # Save and load model
    model.save(spark_context, "SVMModel3")
    sameModel = SVMModel.load(spark_context, "SVMModel2")

    spark_context.stop()

Пример #39

0

Показать файл

Файл: svm_with_sgd_example.py Проект: ChaiBapchya/spark

# $example on$
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonSVMWithSGDExample")

    # $example on$
    # Load and parse the data
    def parsePoint(line):
        values = [float(x) for x in line.split(" ")]
        return LabeledPoint(values[0], values[1:])

    data = sc.textFile("data/mllib/sample_svm_data.txt")
    parsedData = data.map(parsePoint)

    # Build the model
    model = SVMWithSGD.train(parsedData, iterations=100)

    # Evaluating the model on training data
    labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))

    # Save and load model
    model.save(sc, "target/tmp/pythonSVMWithSGDModel")
    sameModel = SVMModel.load(sc, "target/tmp/pythonSVMWithSGDModel")
    # $example off$

Пример #40

0

Показать файл

Файл: svm.py Проект: manhcompany/reishi

from pyspark import SparkContext
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint


def parsePoint(line):
    parsedData = [0 for i in range(45000)]
    splits = line.split(":")
    vectorSplit = splits[1].split(";")
    for vs in vectorSplit:
        vSplit = vs.split(" ")
        parsedData[int(vSplit[0])] = float(vSplit[1])
    return LabeledPoint(float(splits[0]), parsedData)


sc = SparkContext(appName="PythonSVMWithSGDExample")
data = sc.textFile("hdfs://localhost:8020/pyspark/vectorsvm")
parsedData = data.map(parsePoint)

model = SVMWithSGD.train(parsedData, iterations=100)

labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))

model.save(sc, "pythonSVMWithSGDModel")
sameModel = SVMModel.load(sc, "pythonSVMWithSGDModel")

Пример #41

0

Показать файл

Файл: make_prediction.py Проект: janes/recsys-tcc-ml

def main(sc, sqlContext):

    #start = timer()

    #print '---Pegando usuario, posts, tokens e categorias do MongoDB---'
    #start_i = timer()
    user = findUserById(iduser)
    posts = findPosts(user)

    tokens, category, categoryAndSubcategory = getTokensAndCategories()
    postsRDD = (sc.parallelize(posts).map(lambda s: (s[
        0], word_tokenize(s[1].lower()), s[2], s[3])).map(lambda p: (p[
            0], [x for x in p[1] if x in tokens], p[2], p[3])).cache())

    #print '####levou %d segundos' % (timer() - start_i)

    #print '---Pegando produtos do MongoDB---'
    #start_i = timer()

    #print '####levou %d segundos' % (timer() - start_i)

    #print '---Criando corpusRDD---'
    #start_i = timer()
    stpwrds = stopwords.words('portuguese')
    corpusRDD = (postsRDD.map(lambda s: (s[0], [
        PorterStemmer().stem(x) for x in s[1] if x not in stpwrds
    ], s[2], s[3])).filter(lambda x: len(x[1]) >= 20 or
                           (x[2] == u'Post' and len(x[1]) > 0)).cache())
    #print '####levou %d segundos' % (timer() - start_i)

    #print '---Calculando TF-IDF---'
    #start_i = timer()
    wordsData = corpusRDD.map(
        lambda s: Row(label=int(s[0]), words=s[1], type=s[2]))
    wordsDataDF = sqlContext.createDataFrame(wordsData).unionAll(
        sqlContext.read.parquet(
            "/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"))

    numTokens = len(tokens)
    hashingTF = HashingTF(inputCol="words",
                          outputCol="rawFeatures",
                          numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)

    idfModel = idf.fit(featurizedData)
    tfIDF = idfModel.transform(featurizedData).cache()

    postTFIDF = (
        tfIDF.filter(tfIDF.type == u'Post')
        #.map(lambda s: Row(label=s[0], type=s[1], words=s[2], rawFeatures=s[3], features=s[4], sentiment=SVM.predict(s[4])))
        .cache())

    #postTFIDF = postTFIDF.filter(lambda p: p.sentiment == 1)
    #print '####levou %d segundos' % (timer() - start_i)

    #print '---Carregando modelo---'
    #start_i = timer()
    NB = NaiveBayesModel.load(
        sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    SVM = SVMModel.load(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")
    #print '####levou %d segundos' % (timer() - start_i)

    #print '---Usando o modelo---'
    #start_i = timer()
    predictions = (postTFIDF.map(lambda p: (NB.predict(p.features), p[
        0], SVM.predict(p.features))).filter(lambda p: p[2] == 1).map(
            lambda p: (p[0], p[1])).groupByKey().mapValues(list).collect())

    #print '####levou %d segundos' % (timer() - start_i)
    #print '---Calculando similaridades---'
    #start_i = timer()
    suggestions = []

    for prediction in predictions:
        category_to_use = category[int(prediction[0])]
        #print ' Calculando similaridades para a categoria: {}'.format(category_to_use)
        tf = tfIDF.filter(tfIDF.type == category_to_use).cache()
        for post in prediction[1]:
            postVector = postTFIDF.filter(
                postTFIDF.label == post).map(lambda x: x.features).collect()[0]
            sim = (tf.map(lambda x: (
                post, x.label, cossine(x.features, postVector))).filter(
                    lambda x: x[2] >= threshold).collect())
            if len(sim) > 0:
                suggestions.append(sim)

    #print '####levou %d segundos' % (timer() - start_i)

    if len(suggestions) > 0:
        #print '---Inserindo recomendacoes no MongoDB---'
        #start_i = timer()
        insertSuggestions(suggestions, iduser, posts)

Пример #42

0

Показать файл

Файл: svm.py Проект: psh5487/BigData_SVM

print("Confusion Matrix: ")
print("TP = " + str(a))
print("FN = " + str(b))
print("FP = " + str(c))
print("TN = " + str(d))
print("\n")

#Calculation
a = np.float(a)
b = np.float(b)
c = np.float(c)
d = np.float(d)

accuracy = (a+d) / (a+b+c+d)
precision = a / (a+c)
recall = a / (a+b)
f1 = 2*a / (2*a+b+c)

print('Accuracy: %f' %accuracy)
print('Precision: %f' %precision)
print('Recall: %f' %recall)
print('F1: %f' %f1)

#save and load model
model.save(sc, "/user/cloudera/hw2/results/2015310884_SVM")
sameModel = SVMModel.load(sc, "/user/cloudera/hw2/results/2015310884_SVM")

Пример #43

0

Показать файл

Файл: make_prediction.py Проект: felipecontra3/recsys-tcc-ml

def main(sc, sqlContext):

    #start = timer()

    #print '---Pegando usuario, posts, tokens e categorias do MongoDB---'
    #start_i = timer()
    user = findUserById(iduser)
    posts = findPosts(user) 
    
    tokens, category, categoryAndSubcategory = getTokensAndCategories()
    postsRDD = (sc.parallelize(posts).map(lambda s: (s[0], word_tokenize(s[1].lower()), s[2], s[3]))
                    .map(lambda p: (p[0], [x for x in p[1] if x in tokens] ,p[2], p[3]))
                    .cache())

    

    #print '####levou %d segundos' % (timer() - start_i)

    #print '---Pegando produtos do MongoDB---'
    #start_i = timer()

    #print '####levou %d segundos' % (timer() - start_i)
    
    #print '---Criando corpusRDD---'
    #start_i = timer()
    stpwrds = stopwords.words('portuguese')
    corpusRDD = (postsRDD.map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3]))
                         .filter(lambda x: len(x[1]) >= 20 or (x[2] == u'Post' and len(x[1])>0))
                         .cache())
    #print '####levou %d segundos' % (timer() - start_i)

    #print '---Calculando TF-IDF---'
    #start_i = timer()
    wordsData = corpusRDD.map(lambda s: Row(label=int(s[0]), words=s[1], type=s[2]))
    wordsDataDF = sqlContext.createDataFrame(wordsData).unionAll(sqlContext.read.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet"))


    numTokens = len(tokens)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens)
    idf = IDF(inputCol="rawFeatures", outputCol="features")

    featurizedData = hashingTF.transform(wordsDataDF)

    idfModel = idf.fit(featurizedData)
    tfIDF = idfModel.transform(featurizedData).cache()

    postTFIDF = (tfIDF
                    .filter(tfIDF.type==u'Post')
                    #.map(lambda s: Row(label=s[0], type=s[1], words=s[2], rawFeatures=s[3], features=s[4], sentiment=SVM.predict(s[4])))
                    .cache())

    #postTFIDF = postTFIDF.filter(lambda p: p.sentiment == 1)
    #print '####levou %d segundos' % (timer() - start_i)

    #print '---Carregando modelo---'
    #start_i = timer()
    NB = NaiveBayesModel.load(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria')
    SVM = SVMModel.load(sc, "/home/ubuntu/recsys-tcc-ml/models/svm")
    #print '####levou %d segundos' % (timer() - start_i)

    #print '---Usando o modelo---'
    #start_i = timer()
    predictions = (postTFIDF
                        .map(lambda p: (NB.predict(p.features), p[0], SVM.predict(p.features)))
                        .filter(lambda p: p[2]==1)
                        .map(lambda p: (p[0], p[1]))
                        .groupByKey()
                        .mapValues(list)
                        .collect())

    #print '####levou %d segundos' % (timer() - start_i)
    #print '---Calculando similaridades---'
    #start_i = timer()
    suggestions = []

    for prediction in predictions:
        category_to_use = category[int(prediction[0])]
        #print ' Calculando similaridades para a categoria: {}'.format(category_to_use)
        tf = tfIDF.filter(tfIDF.type==category_to_use).cache()
        for post in prediction[1]:
            postVector = postTFIDF.filter(postTFIDF.label == post).map(lambda x: x.features).collect()[0]
            sim = (tf
                    .map(lambda x: (post, x.label, cossine(x.features, postVector)))
                    .filter(lambda x: x[2]>=threshold)
                    .collect())
            if len(sim) > 0:
                suggestions.append(sim)

    #print '####levou %d segundos' % (timer() - start_i)

    if len(suggestions) > 0:
        #print '---Inserindo recomendacoes no MongoDB---'
        #start_i = timer()
        insertSuggestions(suggestions, iduser, posts)

Пример #44

0

Показать файл

    return LabeledPoint(newValue[0], newValue[1:])


# Start
SparkContextHandler._master_ip = "10.14.24.101"
sc = SparkContextHandler.get_spark_sc()
#------------------------------------------------------------#
startTime = time()
#------------------------------------------------------------#\
print("load testdata")
test = sc.textFile(
    "file:/home/spark/Documents/neil-git/dataset/oneBolt_rag/Test.txt")
testData = test.map(FrequencyDomain)
#------------------------------------------------------------#
print("load model")
Model = SVMModel.load(sc, "hdfs:///home/spark/Desktop/FNOModel")
print("First Prediction (Normal or unNormal)")
# labelsAndPreds = Model.predict(testData)
# labelsAndPreds = testData.map(lambda p: (Model.predict(p),p))
labelsAndPreds = testData.map(lambda p:
                              (p.label, Model.predict(p.features), p.features))
TotalAmount = float(testData.count())
temp = labelsAndPreds.filter(lambda p: p[1] == 0)
oneBoltAmount = labelsAndPreds.filter(lambda p: p[0] == 1).count()
ragAmount = labelsAndPreds.filter(lambda p: p[0] == 0).count()
print("Normal or unNormal:", temp.count() / TotalAmount)

print("Second Prediction (oneBolt or rag)")
Model2 = SVMModel.load(sc, "hdfs:///home/spark/Desktop/FORModel")
temp2 = temp.map(lambda p: (p[0], Model2.predict(p[2])))
oneBoltResult = temp2.filter(lambda p: p[0] == p[1] and p[1] == 1)

Python SVMModel.load примеры использования