# imports required by this snippet
from pyspark.mllib.classification import SVMModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql import Row


def svm_second_predict(spark_session, svm_model_path, df, condition):
    """
    SVM binary-classification prediction.
    :param spark_session: Spark session
    :param svm_model_path: model path
    :param df: data
    :param condition: {"features": [12, 13, 14, 15], "label": "label"} feature columns
    :return: prediction result as a Spark DataFrame
    """
    feature_indexs = condition['features']
    label_index = condition['label']

    if label_index is None or label_index == "":  # no label column
        # 1. Prepare the data
        def func(x):
            features_data = []
            for feature in feature_indexs:
                features_data.append(x[feature])
            return features_data

        predict_data = df.rdd.map(lambda x: func(x))
        print(predict_data.take(10))

        # 2. Load the model
        svm_model = SVMModel.load(spark_session.sparkContext, svm_model_path)

        # 3. Predict
        def f(x):
            return {"prediction_result": x}

        prediction_rdd = svm_model.predict(predict_data)
        print(prediction_rdd.take(10))
        prediction_df = prediction_rdd.map(lambda x: Row(**f(x))).toDF()
        return prediction_df
    else:  # with a label column
        # 1. Prepare the data
        def func(x):
            features_data = []
            for feature in feature_indexs:
                features_data.append(x[feature])
            return LabeledPoint(label=x[label_index], features=features_data)

        predict_label_data = df.rdd.map(lambda x: func(x))
        print(predict_label_data.take(10))

        # 2. Load the model
        svm_model = SVMModel.load(spark_session.sparkContext, svm_model_path)

        # 3. Predict
        def f(x):
            return {"prediction_result": x[0], label_index: x[1]}

        prediction_rdd = predict_label_data.map(
            lambda x: (svm_model.predict(x.features), x.label))
        print(prediction_rdd.take(10))
        prediction_df = prediction_rdd.map(lambda x: Row(**f(x))).toDF()
        return prediction_df
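# A hypothetical usage sketch for svm_second_predict (not from the original
# source): the SparkSession, DataFrame columns, and model path below are
# illustrative assumptions. It shows the expected shape of `condition`, where
# "features" holds positional column indexes and "label" names the label column.
from pyspark.sql import SparkSession

def _svm_second_predict_example():
    spark = SparkSession.builder.appName("svm_second_predict_example").getOrCreate()
    df = spark.createDataFrame(
        [(1.2, 0.4, 3.1, 0.0, 0.0), (0.1, 2.2, 0.7, 1.5, 1.0)],
        ["c0", "c1", "c2", "c3", "label"])
    condition = {"features": [0, 1, 2, 3], "label": "label"}
    # Pass an empty string as "label" to get label-free predictions instead.
    result_df = svm_second_predict(spark, "/tmp/models/svm_model", df, condition)
    result_df.show()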
def load_parameters(self): self.amount_prediction_method = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='amount_method') self.trend_prediction_method = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='trend_method') self.data_features = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='features') self.stock_symbol = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='symbol') self.data_parser = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='data_parser') amount_model_path = os.path.join(os.path.abspath(self.model_path), 'amount_model') trend_model_path = os.path.join(os.path.abspath(self.model_path), 'trend_model') if self.amount_prediction_method == self.RANDOM_FOREST: amount_model = RandomForestModel.load(sc=self.sc, path=amount_model_path) elif self.amount_prediction_method == self.LINEAR_REGRESSION: amount_model = LinearRegressionModel.load(sc=self.sc, path=amount_model_path) else: amount_model = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='amount_model') if self.trend_prediction_method == self.RANDOM_FOREST: trend_model = RandomForestModel.load(sc=self.sc, path=trend_model_path) elif self.trend_prediction_method == self.LOGISTIC_REGRESSION: trend_model = LogisticRegressionModel.load(sc=self.sc, path=trend_model_path) elif self.trend_prediction_method == self.NAIVE_BAYES: trend_model = NaiveBayesModel.load(sc=self.sc, path=trend_model_path) elif self.trend_prediction_method == self.SVM: trend_model = SVMModel.load(sc=self.sc, path=trend_model_path) else: trend_model = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='trend_model') return trend_model, amount_model
def test_all(config): test_feature_path = config['root_directory'] + config[ 'feature_directory'] + config['test_all_feature_filename'] model_path = config['root_directory'] + config['one_vs_all_model_directory'] label_path = config['root_directory'] + config['label_directory'] + config[ 'one_vs_all_label_filename'] if not os.path.exists(test_feature_path): print('No feature for all test') sys.exit(-1) if not os.path.exists(model_path): print('No model for all test') sys.exit(-1) if not os.path.exists(label_path): print('No label for all test') sys.exit(-1) # Load test data data = sc.textFile(test_feature_path) parsed_data = data.map(make_labeled_point) # Load the model model = SVMModel.load(sc, model_path) # Try the model against test data labels_and_preds = parsed_data.map(lambda p: (p.label, model.predict(p.features))) test_err = labels_and_preds.filter( lambda lp: lp[0] != lp[1]).count() / float(parsed_data.count()) print("Test Error = " + str(test_err * 100) + "%")
def do_1vsall(class_all, size, num_iter, config):
    features_path = config['protocol'] + config['bucket'] + config['sep'] + config['features_key']

    print('do_1vsall ==============> Setting RDD_ALL')
    rdd_all = sc.textFile(features_path, minPartitions=4).map(lambda line: line.split(',')).persist()

    print('do_1vsall ==============> Setting RDD_TRAIN_SET')
    rdd_train_set = rdd_all.filter(lambda features: int(features[1]) <= size) \
        .map(lambda features: ['0.0' if features[0] == class_all else '1.0'] + features[2:]) \
        .map(make_labeled_point)

    print('do_1vsall ==============> Setting RDD_TEST_SET')
    rdd_test_set = rdd_all.filter(lambda features: size < int(features[1])) \
        .map(lambda features: ['0.0' if features[0] == class_all else '1.0'] + features[2:]) \
        .map(make_labeled_point)

    # Build the model
    model_dir = class_all + '_' + str(size) + '_' + str(num_iter)
    model_s3_file = config['model_key'] + config['sep'] + model_dir
    model = None
    if s3_object_exists(config['bucket'], model_s3_file):
        print('do_1vsall ==============> Loading SVM Model: {}...'.format(model_s3_file))
        model = SVMModel.load(sc, config['protocol'] + config['bucket'] + config['sep'] + model_s3_file)
    else:
        print('do_1vsall ==============> Building SVM Model')
        model = SVMWithSGD.train(rdd_train_set, iterations=num_iter)
        print('do_1vsall ==============> Saving SVM Model: {}...'.format(model_s3_file))
        model.save(sc, config['protocol'] + config['bucket'] + config['sep'] + model_s3_file)

    # Evaluate the model on the test data
    print('do_1vsall ==============> Evaluating test set')
    labels_and_preds = rdd_test_set.map(lambda p: (p.label, model.predict(p.features)))
    train_err = labels_and_preds.filter(lambda lp: lp[0] != lp[1]).count() / float(rdd_test_set.count())
    # print("Test Error = " + str(train_err))
    success = round(((1 - train_err) * 100), 2)
    print('{},{}'.format(str(size), str(success)))
    return size, success
def Prediction(self, modelType): data_point = self.Features if modelType == 'RF': model = RandomForestModel.load( self.sc, self.baseDir + '/fraudModel/Model/' + modelType) result = np.array( model.predict(self.sc.parallelize(data_point)).collect()) self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result) elif modelType == 'GBDT': model = GradientBoostedTreesModel.load( self.sc, self.baseDir + '/fraudModel/Model/' + modelType) result = np.array( model.predict(self.sc.parallelize(data_point)).collect()) self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result) elif modelType == 'LRsgd': model = LogisticRegressionModel.load( self.sc, self.baseDir + '/fraudModel/Model/' + modelType) result = np.array( model.predict(self.sc.parallelize(data_point)).collect()) self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result) elif modelType == 'LRlbfgs': model = LogisticRegressionModel.load( self.sc, self.baseDir + '/fraudModel/Model/' + modelType) result = np.array( model.predict(self.sc.parallelize(data_point)).collect()) self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result) elif modelType == 'SVM': model = SVMModel.load( self.sc, self.baseDir + '/fraudModel/Model/' + modelType) result = np.array( model.predict(self.sc.parallelize(data_point)).collect()) self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result) else: pass
def __init__(self, path):
    conf = SparkConf() \
        .setAppName("crankshaw-pyspark") \
        .setMaster("local") \
        .set("spark.executor.memory", "2g") \
        .set("spark.kryoserializer.buffer.mb", "128")
    sc = SparkContext(conf=conf, batchSize=10)
    self.model = SVMModel.load(sc, path)
    self.path = path
    print("started spark")
def __init__(self, sc): logger.info("Starting up the Classification Engine..") self.sc = sc #load data logger.info("Loading up data..") iris_data_raw_RDD = self.sc.textFile("/home/hduser/iris-data.txt") self.iris_data_parsed_RDD = iris_data_raw_RDD.map(self.parsePoint) #self.train_model() self.model = SVMModel.load(self.sc, "/home/hduser/pythonSVMWithSGDModel")
def load_model(): # load the pre-trained Keras model (here we are using a model # pre-trained on ImageNet and provided by Keras, but you can # substitute in your own networks just as easily) global Keras_model base_model = VGG16(weights='imagenet') global graph graph = tf.get_default_graph() # Model will produce the output of the 'fc2'layer which is the penultimate neural network layer Keras_model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output) global Svm_model Svm_model = SVMModel.load(sc, Svm_model_path)
def main(): i = 0 args = sys.argv[1:] if len(args) != 4 and len(args) != 1: usage(sys.argv[0]) sys.exit(-1) input_dir = "" size = 0 test_data = False while i < len(args): if args[i] == "--dir": i += 1 input_dir = args[i] if len(input_dir) == 0: print("Input directory name is required") sys.exit(-1) elif args[i] == "--size": i += 1 if not args[i].isdigit(): print("Train size by class ({}) must be a number".format( args[i])) sys.exit(-1) else: size = int(args[i]) elif args[i] == "--test": test_data = True i += 1 if size > 0: train_set, test_set = split_sets(input_dir, "*.json", size) create_features(train_set, test_set, input_dir) create_model() if test_data: # Load test data data = sc.textFile('./features/test_features.txt') parsed_data = data.map(make_labeled_point) # Load the model model = SVMModel.load(sc, './model/ImageRecognitionModel') # Try the model against test data labels_and_preds = parsed_data.map( lambda p: (p.label, model.predict(p.features))) test_err = labels_and_preds.filter( lambda lp: lp[0] != lp[1]).count() / float(parsed_data.count()) print("Test Error = " + str(test_err * 100) + "%")
def main(sc): train_data='/usr/local/spark/data/mllib/sample_svm_data.txt' data=sc.textFile(train_data).map(parse) if os.path.exists('model'): model=SVMModel.load(sc, 'model') else: model=SVMWithSGD.train(data, iterations=100) model.save(sc, 'model') labelsAndPreds=data.map(lambda p: (p.label, model.predict(p.features))) # trainErr=labelsAndPreds.filter(lambda (v, p): v != p).count() / float(data.count()) # print('Training Error =' + str(trainErr)) labelsAndPreds.map(lambda x:str(x[0])+'\t'+str(x[1])).saveAsTextFile('labelsAndPreds')
# imports required by this snippet
from pyspark.mllib.classification import SVMModel
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.regression import LabeledPoint


def svm_second_evaluation(spark_session, svm_model_path, df, predict_condition, condition):
    """
    SVM binary-classification evaluation.
    :param spark_session: Spark session
    :param svm_model_path: model path
    :param df: prediction data
    :param predict_condition: configuration of the prediction (parent) operator
    :param condition: configuration of this operator, {"label": "label column name"}
    :return: DataFrame of evaluation metrics
    """
    feature_indexs = predict_condition['features']
    label = condition['label']

    # 1. Prepare the data
    def func(x):
        features_data = []
        for feature in feature_indexs:
            features_data.append(x[feature])
        return LabeledPoint(label=x[label], features=features_data)

    predict_data = df.rdd.map(lambda x: func(x))

    # Load the model
    svm_model = SVMModel.load(spark_session.sparkContext, svm_model_path)

    # Compute evaluation metrics
    svmTotalCorrect = predict_data.map(
        lambda r: 1 if (svm_model.predict(r.features) == r.label) else 0).reduce(lambda x, y: x + y)
    svmAccuracy = svmTotalCorrect / float(predict_data.count())

    # Clear the default threshold so predict() returns raw scores (confidence values)
    svm_model.clearThreshold()
    svmPredictionAndLabels = predict_data.map(
        lambda lp: (float(svm_model.predict(lp.features)), lp.label))
    svmMetrics = BinaryClassificationMetrics(svmPredictionAndLabels)
    print("Area under PR = %s" % svmMetrics.areaUnderPR)
    print("Area under ROC = %s" % svmMetrics.areaUnderROC)

    # Return the results
    result = [("correct count", float(svmTotalCorrect)),
              ("accuracy", float(svmAccuracy)),
              ("Area under PR", float(svmMetrics.areaUnderPR)),
              ("Area under ROC", float(svmMetrics.areaUnderROC))]
    return spark_session.createDataFrame(result, schema=['metric', 'value'])
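# A hypothetical usage sketch for svm_second_evaluation (illustrative names and
# model path, not from the original source). `predict_condition` carries the
# feature indexes configured on the upstream predict operator, while `condition`
# names the label column of the evaluation DataFrame.
def _svm_second_evaluation_example(spark, labeled_df):
    predict_condition = {"features": [0, 1, 2, 3]}
    condition = {"label": "label"}
    metrics_df = svm_second_evaluation(
        spark, "/tmp/models/svm_model", labeled_df, predict_condition, condition)
    metrics_df.show()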
def test_one_vs_one(config, class1, class2): test_feature_path = config['root_directory'] + config[ 'feature_directory'] + config['test_one_feature_filename'] model_path = config['root_directory'] + config['one_vs_one_model_directory'] label_path = config['root_directory'] + config['label_directory'] + config[ 'one_vs_one_label_filename'] if not os.path.exists(test_feature_path): print('No feature for all test') sys.exit(-1) if not os.path.exists(model_path): print('No model for all test') sys.exit(-1) if not os.path.exists(label_path): print('No label for all test') sys.exit(-1) with open(label_path, "r") as label_file: labels = dict(json.load(label_file)) label_values = labels.values() if not any(x == class1 for x in label_values) or not any(x == class2 for x in label_values): print("No labels for {} and {}.".format(class1, class2)) sys.exit(-1) # Load test data data = sc.textFile(test_feature_path) parsed_data = data.map(make_labeled_point) # Load the model model = SVMModel.load(sc, model_path) # Try the model against test data labels_and_preds = parsed_data.map(lambda p: (p.label, model.predict(p.features))) test_err = labels_and_preds.filter( lambda lp: lp[0] != lp[1]).count() / float(parsed_data.count()) print("Test Error = " + str(test_err * 100) + "%")
def SVM_function(rdd,sc,method): #method from pyspark.mllib.classification import SVMModel print("rdd map") if method =='TimeDomain': output = TimeDomain(rdd) testData = sc.parallelize([output]) if method =='FrequencyDomain': output=frequencyDomain(rdd) testData=sc.parallelize([output]) #load model print("load model") Model = SVMModel.load(sc,"hdfs:///home/spark/Desktop/"+method+"Model") #------------------------------------------------------------# #input data and prediction print("labelsAndPreds") labelsAndPreds = Model.predict(testData) return labelsAndPreds.collect()
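# A hypothetical call sketch for SVM_function (assumed names, not from the
# original source: `sc` is an active SparkContext and `signal_rdd` holds one
# raw signal window). The `method` string both selects the feature extractor
# (TimeDomain / FrequencyDomain) and completes the HDFS path of the saved
# SVMModel, e.g. ".../TimeDomainModel".
def _svm_function_example(sc, signal_rdd):
    predictions = SVM_function(signal_rdd, sc, 'TimeDomain')
    print(predictions)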
def check_image(config, json_file): model_path = config['root_directory'] + config['one_vs_all_model_directory'] label_path = config['root_directory'] + config['label_directory'] + config[ 'one_vs_all_label_filename'] if not os.path.exists(model_path): print('No model with the full set') sys.exit(-1) if not os.path.exists(label_path): print('No label for with the full set') sys.exit(-1) with open(label_path, "r") as label_file: labels = dict(json.load(label_file)) # inv_labels = {v: k for k, v in labels.items()} """ rdd = sc.textFile(json_file)\ .map(lambda line: ((line[1:])[:-1]))\ .flatMap(make_float_list) model = SVMModel.load(sc, model_path) label = model.predict(rdd) """ with open(json_file) as features_file: features = json.load(features_file) lp = make_label_point_from_list(",".join(map(str, features))) model = SVMModel.load(sc, model_path) label = model.predict(lp.features) print("=" * 60) print("Image features file: {}".format(json_file)) print("Label: {}".format(label)) print("Prediction: {}".format(labels[str(label)])) print("=" * 60)
def load_svm(sc): model= SVMModel.load(sc, "target1/tmp/pythonSVMWithSGDModel") return model
# ct = get_contingency_table(binarySvm, test_bm25_doc_index, section) # contingency_tables["bm25"][section] = ct test_rf_postings = test_tf_postings.mapValues(get_rf_postings) test_rf_doc_index = create_doc_index(test_rf_postings, term_dictionary) ct = get_contingency_table(binarySvm, test_rf_doc_index, section) contingency_tables["rf"][section] = ct test_tf_rf_postings = test_tf_postings.mapValues(get_tf_rf_postings) test_tf_rf_doc_index = create_doc_index(test_tf_rf_postings, term_dictionary) ct = get_contingency_table(binarySvm, test_tf_rf_doc_index, section) contingency_tables["tf-rf"][section] = ct import cPickle as pickle for clss in classes: binarySvm = SVMModel.load(sc, model_output + "tf" + "_" + clss + "_model.svm") ct = get_contingency_table(binarySvm, test_tf_doc_index, clss) contingency_tables["tf"][clss] = ct ct = get_contingency_table(binarySvm, test_tf_id_doc_index, clss) contingency_tables["tf-idf"][clss] = ct ct = get_contingency_table(binarySvm, test_bm25_doc_index, clss) contingency_tables["bm25"][clss] = ct test_rf_postings = test_tf_postings.mapValues(get_rf_postings) test_rf_doc_index = create_doc_index(test_rf_postings, term_dictionary) ct = get_contingency_table(binarySvm, test_rf_doc_index, clss) contingency_tables["rf"][clss] = ct test_tf_rf_postings = test_tf_postings.mapValues(get_tf_rf_postings) test_tf_rf_doc_index = create_doc_index(test_tf_rf_postings, term_dictionary)
from pyspark import SparkContext
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

if __name__ == "__main__":
    sc = SparkContext(appName="SVMTicTac")

    # Parse the data and create LabeledPoints
    def parsePoint(line):
        values = [float(x) for x in line.split(' ')]
        # The last field contains the target label and the
        # remaining fields are the attributes for classification
        return LabeledPoint(values[9], values[0:8])

    # Load the data
    data = sc.textFile("data/mllib/sample_traindata_tic_tac.txt")
    parsedData = data.map(parsePoint)

    # Build the model using SVM with SGD
    model = SVMWithSGD.train(parsedData, iterations=100)

    # Evaluating the model on training data
    predict_model = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = predict_model.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())

    # Print the training error
    print("Training Error = " + str(trainErr))

    # Save and load model
    model.save(sc, "target/tmp/pythonTicTacSGD")
    sameModel = SVMModel.load(sc, "target/tmp/pythonTicTacSGD")
def classify_with_model(input_data_path, model_file_path): input_parsed = sc.textFile(input_data_path).map(parse_point) model = SVMModel.load(sc, model_file_path) labels = input_parsed.map(lambda p: model.predict(p.features)) labels.saveAsTextFile("predictions")
spark = SparkSession.builder.appName("downsample").getOrCreate() lines = spark.read.csv("s3://daen-cyber/filteredSource/only5s",header=True) lines = lines.select([c for c in lines.columns if c not in {'ip','maxScore','minScore','avgScore','trendUp','trendDown','trueCount','dataSetCount','mostCommonCustomerHit'}]) #for 5s test we dont care about label, just use 1.0 def labeledPointConverter(row): try: return LabeledPoint(1.0, row[1:]) except ValueError: return LabeledPoint(50.0,[1.0]) parsedData = lines.rdd.map(lambda x: labeledPointConverter(x)) parsedData = parsedData.filter(lambda x: x.label != 50.0) parsedData.cache() model = SVMModel.load(sc, "s3://daen-cyber/models/no5sSvmModel0") preds = parsedData.map(lambda p: model.predict(p.features)) parsedData.unpersist() preds.cache() below5 = preds.filter(lambda p: p == 0.0).count() above5 = preds.filter(lambda p: p == 1.0).count() listToOutput = [] listToOutput = listToOutput + [("Above 5", str(above5))] listToOutput = listToOutput + [("Below5", str(below5))] listToOutputRDD = sc.parallelize(listToOutput, 1)\ .saveAsTextFile("s3://daen-cyber/models/only5sSvmResults0") #model.save(sc, "s3://daen-cyber/modelsb/no5sSvmModel0")
if __name__ == "__main__": sc = SparkContext(appName="SVMTicTac") # Parse the data and create LabeledPoints def parsePoint(line): values = [(x) for x in line.split(' ')] # Last row contains the target data and rest of # the rows define the attributes for linear regression return LabeledPoint(values[9], values[0:8]) # Load the data data = sc.textFile("data/mllib/sample_traindata_tic_tac.txt") parsedData = data.map(parsePoint) # Build the model using SVD model = SVMWithSGD.train(parsedData, iterations=100) # Evaluating the model on training data predict_model = parsedData.map(lambda p: (p.label, model.predict(p.features))) trainErr = predict_model.filter(lambda (v, p): v != p).count() / float( parsedData.count()) # Print Mean Squared Error print("Training Error = " + str(trainErr)) # Save and load model model.save(sc, "target/tmp/pythonTicTacSGD") sameModel = SVMModel.load(sc, "target/tmp/pythonTicTacSGD")
from pyspark import SparkConf, SparkContext
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

conf = SparkConf().setAppName('Linear Support Vector Machines').setMaster(
    'local[2]')
sc = SparkContext(conf=conf)

# load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:])

data = sc.textFile('../data/sample_svm_data.txt')
parseData = data.map(parsePoint)

# build the model
model = SVMWithSGD.train(parseData, iterations=100)

# evaluating the model on training data
labelsAndPreds = parseData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(
    parseData.count())
print('training error :' + str(trainErr))

# save and load model
model.save(sc, '../model/pythonSVMWithSGDModel')
sameModel = SVMModel.load(sc, '../model/pythonSVMWithSGDModel')
sc.stop()
        values[7] = 1
    else:
        values[7] = 0
    return LabeledPoint(values[7], values[0:7])

#dep_delay, cancelled, diverted, carrierdelay, weather delay, NASdelay, Security delay, LateAircraftdelay
#examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect()
parsedData = raw_data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])

startTime = datetime.now()
# Build the model
trainingData.cache()
model = SVMWithSGD.train(trainingData, iterations=1)
print('Training Time consumed = ', datetime.now() - startTime)

startTestTime = datetime.now()
# Evaluating the model on test data
labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
testErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(testData.count())
print('Testing Time consumed = ', datetime.now() - startTestTime)
print('Time consumed = ', datetime.now() - startTime)
print("Test Error = " + str(testErr))

# Save and load model
model.save(sc, "SVMwide00-08train")
sameModel = SVMModel.load(sc, "SVMwide00-08train")
# %%
lr_predictions = test.map(lambda line: (line[0], line[1], float(lr_model.predict(line[3]))))
lr_predictions.coalesce(1).toDF().write.options(header="true").csv(
    "hdfs://node1:9000/user/root/exp4/predictions/lr_predictions.csv")

# %% [markdown]
# Date: 2020-12-20 14:08:52  Rank: none
# score: 0.5015744

# %% [markdown]
# ## SVM

# %%
from pyspark.mllib.classification import SVMModel

svm_model = SVMModel.load(
    sc, "hdfs://node1:9000/user/root/exp4/models/SVMWithSGDModel")

# %%
svm_predictions = test.map(
    lambda line: (line[0], line[1], float(svm_model.predict(line[3]))))
svm_predictions.coalesce(1).toDF().write.options(header="true").csv(
    "hdfs://node1:9000/user/root/exp4/predictions/svm_predictions.csv")

# %% [markdown]
# Date: 2020-12-20 14:18:59  Rank: none
# score: 0.5156678

# %% [markdown]
# ## Gradient Boosted Trees

# %%
from pyspark.mllib.tree import GradientBoostedTreesModel
from pyspark import SparkContext
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint
from document import DocumentSVM

sc = SparkContext(appName="SVM")
svmModel = SVMModel.load(sc, "../SVMModel")

def parsePoint(line):
    splits = line.split(':')
    doc = DocumentSVM(splits[0], splits[1])
    return doc.svmVec()

data = sc.textFile("hdfs://localhost:8020/user/manh/vectorsvmtest")
parsedData = data.map(parsePoint)
labelsAndPreds = parsedData.map(lambda p: (p.label, svmModel.predict(p.features)))

# Per-class precision and recall for the two classes (0 and 1);
# each pair is (label, prediction)
for i in range(2):
    precision = labelsAndPreds.filter(lambda lp: lp[0] == i and lp[0] == lp[1]).count() / float(
        labelsAndPreds.filter(lambda lp: lp[1] == i).count())
    recall = labelsAndPreds.filter(lambda lp: lp[0] == i and lp[0] == lp[1]).count() / float(
        labelsAndPreds.filter(lambda lp: lp[0] == i).count())
    print("==================Precision c" + str(i) + " : " + str(precision))
    print("==================Recall c" + str(i) + " : " + str(recall))

accuracy = labelsAndPreds.filter(lambda lp: lp[0] == lp[1]).count() / float(parsedData.count())
import datetime
import sys

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.mllib.classification import SVMModel

dataset_name = sys.argv[1]

sc = SparkContext("local", "Model Prediction", pyFiles=[])
sqlContext = SQLContext(sc)

# First load the model we saved in the model generation step
model = SVMModel.load(sc, "hdfs://hadoop:9000/models/noBikesAvailable.model")

# We also need the stats used to normalize the weather variables
stats_df = sqlContext.read.load("hdfs://hadoop:9000/models/weather-stats")
stats = stats_df.collect()[0]

# We want to produce output for each station
station_df = sqlContext.read.load("hdfs://hadoop:9000/station_data_schema")

print("Statistics: %s" % (stats,))

# Load the weather data
current_weather_csv = (
    sc.textFile("hdfs://hadoop:9000/current_weather/%s.csv" % dataset_name)
    .map(lambda line: line.split(","))
def main(): #retrieve argument args = parse_arguments() main_directory = args.directory class1 = args.class1 class2 = args.class2 force_by_user = args.force if args.verbose: lg.basicConfig(level=lg.INFO) #Variables declaration result = [] directory_feature = os.path.join(main_directory, "features", "*.json") nb_training_data_list = args.nb_training_data iteration_model_list = args.iteration_model lg.info('Features directory is %s', directory_feature) for iteration_model in iteration_model_list: for nb_training_data in nb_training_data_list: model_file = 'model_' + class1 + '_' + class2 + '_' + str( nb_training_data) + '_' + str(iteration_model) result_file = 'result_' + class1 + '_' + class2 + '_' + str( nb_training_data) + '_' + str( iteration_model) + '_' + time.strftime( "%Y%m%d%H%M%S") + '.json' model_pathname = os.path.join(main_directory, "models", model_file) is_model = False start_time = time.time() lg.info( '#################### Starting pet-classification ######################' ) lg.info('Class 1 is %s', class1) lg.info('Class 2 is %s', class2) lg.info('Number of training datas is %s', nb_training_data) lg.info('Number of iterations model is %s', iteration_model) #persist a common rdd which is using by both training and testing datas common_rdd = sc.textFile(directory_feature, minPartitions=4)\ .filter(lambda line: line.split(', ')[0] in (class1, class2) or class2 == 'All')\ .persist() #Loading model if exists if is_model and not force_by_user: model = SVMModel.load(sc, model_pathname) lg.info('Found and load recorded model %s', model_file) else: lg.info('No recorded model found') #create training rdd and train model if no model found or force train_data_rdd = common_rdd.filter(lambda line: int(line.split(',')[1]) <= nb_training_data)\ .map(lambda line: Row(label=0.0, features=line.split(', ')[2:]) if line.split(', ')[0] == class1 else Row(label=1.0, features=line.split(', ')[2:]))\ .map(lambda line: LabeledPoint(line.label, line.features)) lg.info('%s features for training datas', train_data_rdd.count()) lg.info('Start to training model') model = SVMWithSGD.train(train_data_rdd, iterations=iteration_model) lg.info('Training model terminated') training_time = time.time() training_duration = training_time - start_time #Create testing rdd test_data_rdd = common_rdd.filter(lambda line: int(line.split(', ')[1]) > nb_training_data)\ .map(lambda line: Row(label=0.0, features=line.split(', ')[2:]) if line.split(', ')[0] == class1 else Row(label=1.0, features=line.split(', ')[2:]))\ .map(lambda row: LabeledPoint(row.label, row.features)) lg.info('%s features for test datas', test_data_rdd.count()) # Evaluating the model on training data predictions = test_data_rdd.map( lambda row: (row.label, float(model.predict(row.features)))) train_error = predictions.filter(lambda lp: lp[0] != lp[1]).count() \ / float(predictions.count()) lg.info('Test Error : %s', str(train_error)) end_time = time.time() duration = end_time - start_time lg.info('Duration %s', str(duration)) prediction_duration = end_time - training_time # #Save and dump result on S3 result = { "class1": class1, "class2": class2, "nb_training_data": nb_training_data, "error": train_error, "iteration_model": iteration_model, "total_duration": duration, "training_duration": training_duration, "prediction_duration": prediction_duration } with open(result_file, 'w') as result_file: json.dump(result, result_file) lg.info( '#################### Ending pet-classification ######################' ) input("press ctrl+c to exit")
    values = [float(x) for x in clean_line_split]
    if values[4] == 0:
        values[4] = 1
    else:
        values[4] = 0
    return LabeledPoint(values[4], values[0:3])

#dep_delay, cancelled, diverted, carrierdelay, weather delay, NASdelay, Security delay, LateAircraftdelay
#examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect()
parsedData = raw_data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])

startTime = datetime.now()
# Build the model
trainingData.cache()
model = SVMWithSGD.train(trainingData, iterations=1)
print('Training Time consumed = ', datetime.now() - startTime)

startTestTime = datetime.now()
testData.cache()
# Evaluating the model on test data
labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
testErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(testData.count())
print('Testing Time consumed = ', datetime.now() - startTestTime)
print('Time consumed = ', datetime.now() - startTime)
print("Test Error = " + str(testErr))

# Save and load model
model.save(sc, "SVMnarrow95-08")
sameModel = SVMModel.load(sc, "SVMnarrow95-08")
        values[4] = 1
    else:
        values[4] = 0
    return LabeledPoint(values[4], values[0:4])

#dep_delay, cancelled, diverted, carrierdelay, weather delay, NASdelay, Security delay, LateAircraftdelay
#examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect()
parsedData = raw_data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])

startTime = datetime.now()
# Build the model
trainingData.cache()
model = SVMWithSGD.train(trainingData, iterations=1)
print('Training Time consumed = ', datetime.now() - startTime)

startTestTime = datetime.now()
# Evaluating the model on test data
labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
testErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(testData.count())
print('Testing Time consumed = ', datetime.now() - startTestTime)
print('Time consumed = ', datetime.now() - startTime)
print("Test Error = " + str(testErr))

# Save and load model
model.save(sc, "SVMNarrow95-08train")
sameModel = SVMModel.load(sc, "SVMNarrow95-08train")
from pyspark.mllib.evaluation import BinaryClassificationMetrics from pyspark.mllib.tree import DecisionTree from pyspark.mllib.clustering import KMeans, KMeansModel, GaussianMixture, GaussianMixtureModel from pyspark.mllib.linalg import Vectors SparkContext.setSystemProperty('spark.rdd.compress', config.get('spark', 'spark_rdd_compress')) SparkContext.setSystemProperty('spark.driver.maxResultSize', config.get('spark', 'spark_driver_maxResultSize')) SparkContext.setSystemProperty('spark.executor.memory', args.exe_memory) SparkContext.setSystemProperty('spark.cores.max', args.core_max) sc = SparkContext(args.sp_master, 'single_predict:'+str(args.row_id)) flag_model = ml_opts['learning_algorithm'] save_dir = config.get('app', 'HADOOP_MASTER')+config.get('app', 'HDFS_MODEL_DIR')+'/'+row_id_str if flag_model == "linear_svm_with_sgd": mllib_model = SVMModel.load(sc, save_dir) col_num = len(mllib_model.weights) elif flag_model == "logistic_regression_with_lbfgs" or flag_model == "logistic_regression_with_sgd": mllib_model = LogisticRegressionModel.load(sc, save_dir) col_num = mllib_model.numFeatures # len(mllib_model.weights) return 3x value elif flag_model == "kmeans": mllib_model = KMeansModel.load(sc, save_dir) col_num =len(mllib_model.clusterCenters[0]) else: print "ERROR: Training model selection error: no valid ML model selected!" return # get the model dimension #col_num = len(mllib_model.weights) print "INFO: total feature # in mllib model=",col_num # calculate hypothesis value ================
        values[4] = 1
    else:
        values[4] = 0
    return LabeledPoint(values[4], values[0:4])

#dep_delay, cancelled, diverted, carrierdelay, weather delay, NASdelay, Security delay, LateAircraftdelay
#examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect()
parsedData = raw_data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])

startTime = datetime.now()
# Build the model
trainingData.cache()
model = SVMWithSGD.train(trainingData, iterations=1)
print('Training Time consumed = ', datetime.now() - startTime)

startTestTime = datetime.now()
# Evaluating the model on test data
labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
testErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(testData.count())
print('Testing Time consumed = ', datetime.now() - startTestTime)
print('Time consumed = ', datetime.now() - startTime)
print("Test Error = " + str(testErr))

# Save and load model
model.save(sc, "SVMNarrow00-08train")
sameModel = SVMModel.load(sc, "SVMNarrow00-08train")
def main(): #retrieve argument args = parse_arguments() main_directory = args.directory class1 = args.class1 class2 = args.class2 force_by_user = args.force if args.verbose: lg.basicConfig(level=lg.INFO) #Variables declaration result = [] directory_feature = os.path.join(main_directory, "features", "*.json") nb_training_data = args.nb_training_data iteration_model = args.iteration_model min_partition = args.min_partition s3 = boto3.resource('s3') bucket = s3.Bucket('oc-calculdistribues-sberton') result_file = class1 + '_' + class2 + '_' + time.strftime( "%Y%m%d%H%M%S") + '.json' model_file = 'model_' + class1 + '_' + class2 + '_' + str( nb_training_data) + '_' + str(iteration_model) model_pathname = os.path.join(main_directory, "models", model_file) #Searching existing model and store existence in is_model boolean key = 'distributed_learning/models/' + model_file objs = list(bucket.objects.filter(Prefix=key)) is_model = len(objs) > 0 and objs[0].key.startswith(key + '/') start_time = time.time() lg.info( '#################### Starting pet-classification ######################' ) lg.info('Class 1 is %s', class1) lg.info('Class 2 is %s', class2) lg.info('Number of training datas is %s', nb_training_data) lg.info('Number of iterations model is %s', iteration_model) #persist a common rdd which is using by both training and testing datas common_rdd = sc.textFile(directory_feature, minPartitions=min_partition)\ .filter(lambda line: line.split(', ')[0] in (class1, class2) or class2 == 'All')\ .persist() #Loading model if exists if is_model and not force_by_user: model = SVMModel.load(sc, model_pathname) lg.info('Found and load recorded model %s', model_file) else: lg.info('No recorded model found') #create training rdd and train model if no model found or force train_data_rdd = common_rdd.filter(lambda line: int(line.split(',')[1]) <= nb_training_data)\ .map(lambda line: Row(label=0.0, features=line.split(', ')[2:]) if line.split(', ')[0] == class1 else Row(label=1.0, features=line.split(', ')[2:]))\ .map(lambda line: LabeledPoint(line.label, line.features)) lg.info('%s features for training datas', train_data_rdd.count()) lg.info('Start to training model') model = SVMWithSGD.train(train_data_rdd, iterations=iteration_model) lg.info('Training model terminated') training_time = time.time() training_duration = training_time - start_time #Create testing rdd test_data_rdd = common_rdd.filter(lambda line: int(line.split(', ')[1]) > nb_training_data)\ .map(lambda line: Row(label=0.0, features=line.split(', ')[2:]) if line.split(', ')[0] == class1 else Row(label=1.0, features=line.split(', ')[2:]))\ .map(lambda row: LabeledPoint(row.label, row.features)) lg.info('%s features for test datas', test_data_rdd.count()) # Evaluating the model on training data predictions = test_data_rdd.map( lambda row: (row.label, float(model.predict(row.features)))) train_error = predictions.filter(lambda lp: lp[0] != lp[1]).count() \ / float(predictions.count()) lg.info('Test Error : %s', str(train_error)) end_time = time.time() duration = end_time - start_time lg.info('Duration %s', str(duration)) prediction_duration = end_time - training_time # #Save and dump result on S3 result.append({ "class1": class1, "class2": class2, "iteration_model": iteration_model, "nb_training_data": nb_training_data, "total_duration": duration, "train_duration": training_duration, "predict_duration": prediction_duration, "error": train_error }) s3object = s3.Object('oc-calculdistribues-sberton', result_file) 
s3object.put(Body=(bytes(json.dumps(result, indent=2).encode('UTF-8')))) #Save model if not exists if not is_model: lg.info('Saving model at %s', model_file) model.save(sc, model_pathname) lg.info( '#################### Ending pet-classification ######################' )
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2015-11-23 20:18:03
# @Author  : Your Name ([email protected])
# @Link    : http://example.org
# @Version : $Id$

from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

# assumes `sc` is provided by the pyspark shell (or created elsewhere as a SparkContext)

# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:])

data = sc.textFile("data/mllib/sample_svm_data.txt")
parsedData = data.map(parsePoint)

# Build the model
model = SVMWithSGD.train(parsedData, iterations=100)

# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))

# Save and load model
model.save(sc, "myModelPath")
sameModel = SVMModel.load(sc, "myModelPath")
# contingency_tables["bm25"][section] = ct test_rf_postings = test_tf_postings.mapValues(get_rf_postings) test_rf_doc_index = create_doc_index(test_rf_postings, term_dictionary) ct = get_contingency_table(binarySvm, test_rf_doc_index, section) contingency_tables["rf"][section] = ct test_tf_rf_postings = test_tf_postings.mapValues(get_tf_rf_postings) test_tf_rf_doc_index = create_doc_index(test_tf_rf_postings, term_dictionary) ct = get_contingency_table(binarySvm, test_tf_rf_doc_index, section) contingency_tables["tf-rf"][section] = ct import cPickle as pickle for clss in classes: binarySvm = SVMModel.load(sc, model_output + "tf" + "_" + clss + "_model.svm") ct = get_contingency_table(binarySvm, test_tf_doc_index, clss) contingency_tables["tf"][clss] = ct ct = get_contingency_table(binarySvm, test_tf_id_doc_index, clss) contingency_tables["tf-idf"][clss] = ct ct = get_contingency_table(binarySvm, test_bm25_doc_index, clss) contingency_tables["bm25"][clss] = ct test_rf_postings = test_tf_postings.mapValues(get_rf_postings) test_rf_doc_index = create_doc_index(test_rf_postings, term_dictionary) ct = get_contingency_table(binarySvm, test_rf_doc_index, clss) contingency_tables["rf"][clss] = ct test_tf_rf_postings = test_tf_postings.mapValues(get_tf_rf_postings) test_tf_rf_doc_index = create_doc_index(test_tf_rf_postings,
console.setFormatter(formatter)
# Attach the handler to the root logger
logging.getLogger('').addHandler(console)
# Define the other two loggers
logger_server = logging.getLogger('Server')

# Load models.
## LogisticRegressionModel
LR_First_Model = LogisticRegressionModel.load(sc, LR_Layer1)
LR_Second_Model = LogisticRegressionModel.load(sc, LR_Layer2)
LR_Third_Model = LogisticRegressionModel.load(sc, LR_Layer3)
# Mold_LR_First_Model = LogisticRegressionModel.load(sc, Mold_LR_Layer1)
# Mold_LR_Second_Model = LogisticRegressionModel.load(sc, Mold_LR_Layer2)
# Mold_LR_Third_Model = LogisticRegressionModel.load(sc, Mold_LR_Layer3)

## SVMModel
SVM_First_Model = SVMModel.load(sc, SVM_Layer1)
SVM_Second_Model = SVMModel.load(sc, SVM_Layer2)
SVM_Third_Model = SVMModel.load(sc, SVM_Layer3)
Mold_SVM_First_Model = SVMModel.load(sc, Mold_SVM_Layer1)
# Mold_SVM_Second_Model = SVMModel.load(sc, Mold_SVM_Layer2)
# Mold_SVM_Third_Model = SVMModel.load(sc, Mold_SVM_Layer3)

## Random forest
Random_Forest_Model = RandomForestModel.load(sc, Random_Forest)
Mold_Random_Forest_Model = RandomForestModel.load(sc, Mold_Random_Forest)
from pyspark import SparkContext
# $example on$
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonSVMWithSGDExample")

    # $example on$
    # Load and parse the data
    def parsePoint(line):
        values = [float(x) for x in line.split(' ')]
        return LabeledPoint(values[0], values[1:])

    data = sc.textFile("/user/huting/testSet_SVM.txt")
    parsedData = data.map(parsePoint)

    # Build the model
    model = SVMWithSGD.train(parsedData, iterations=100)

    # Evaluating the model on training data
    labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))

    # Save and load model
    model.save(sc, "target/tmp/pythonSVMWithSGDModel")
    sameModel = SVMModel.load(sc, "target/tmp/pythonSVMWithSGDModel")
    # $example off$
ssc = StreamingContext(sc, 10) kafka_configuration_params = { "topic": ["BigData"], "connectionstring": "localhost:9092" } from pyspark.streaming.kafka import KafkaUtils directKafkaStream = KafkaUtils.createDirectStream( ssc, kafka_configuration_params["topic"], {"metadata.broker.list": kafka_configuration_params["connectionstring"]}) from pyspark.mllib.classification import SVMModel, LogisticRegressionModel, NaiveBayesModel LR_model = LogisticRegressionModel.load(sc, "../../notebooks/LR_model") SVM_model = SVMModel.load(sc, "../../notebooks/SVM_model") NB_model = NaiveBayesModel.load(sc, "../../notebooks/NB_model") import nltk import random from nltk.tokenize import word_tokenize allowed_word_types = ["JJ"] rdd_all_words = sc.textFile("../../notebooks/all_words/part-00000") rdd_broadcast_all_words = sc.broadcast(rdd_all_words.collect()) def convert_tweet_to_instance(tweets): rdd_tweets = tweets.map( \
        values[7] = 1
    else:
        values[7] = 0
    return LabeledPoint(values[7], values[0:7])

#dep_delay, cancelled, diverted, carrierdelay, weather delay, NASdelay, Security delay, LateAircraftdelay
#examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect()
parsedData = raw_data.map(parsePoint)
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3])

startTime = datetime.now()
# Build the model
trainingData.cache()
model = SVMWithSGD.train(trainingData, iterations=1)
print('Training Time consumed = ', datetime.now() - startTime)

startTestTime = datetime.now()
# Evaluating the model on test data
labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
testErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(testData.count())
print('Testing Time consumed = ', datetime.now() - startTestTime)
print('Time consumed = ', datetime.now() - startTime)
print("Test Error = " + str(testErr))

# Save and load model
model.save(sc, "SVMwide2008train")
sameModel = SVMModel.load(sc, "SVMwide2008train")
parsed_data = MLUtils\ .loadLibSVMFile(spark_context, "data/classificationdata.txt")\ .cache() print("Parsed data size: " + str(parsed_data.count())) # Split initial RDD into two... [60% training data, 40% testing data] training, test = parsed_data\ .randomSplit([0.6, 0.4], seed=3) print("Training points size: " + str(training.count())) print("Test points size : " + str(test.count())) # Build the model model = SVMWithSGD.train(training, iterations=100) score_and_labels = test.map(lambda point: score_function(point, model)) #for score, label in score_and_labels.collect(): # print("Score: %d, label: %f" % (score, label)) # Get evaluation metrics metrics = BinaryClassificationMetrics(score_and_labels) auROC = metrics.areaUnderROC print("Area under ROC: %f" % auROC) # Save and load model model.save(spark_context, "SVMModel3") sameModel = SVMModel.load(spark_context, "SVMModel2") spark_context.stop()
from pyspark import SparkContext
# $example on$
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonSVMWithSGDExample")

    # $example on$
    # Load and parse the data
    def parsePoint(line):
        values = [float(x) for x in line.split(" ")]
        return LabeledPoint(values[0], values[1:])

    data = sc.textFile("data/mllib/sample_svm_data.txt")
    parsedData = data.map(parsePoint)

    # Build the model
    model = SVMWithSGD.train(parsedData, iterations=100)

    # Evaluating the model on training data
    labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))

    # Save and load model
    model.save(sc, "target/tmp/pythonSVMWithSGDModel")
    sameModel = SVMModel.load(sc, "target/tmp/pythonSVMWithSGDModel")
    # $example off$
from pyspark import SparkContext
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

def parsePoint(line):
    # Sparse input format: "<label>:<index> <value>;<index> <value>;..."
    parsedData = [0 for i in range(45000)]
    splits = line.split(":")
    vectorSplit = splits[1].split(";")
    for vs in vectorSplit:
        vSplit = vs.split(" ")
        parsedData[int(vSplit[0])] = float(vSplit[1])
    return LabeledPoint(float(splits[0]), parsedData)

sc = SparkContext(appName="PythonSVMWithSGDExample")
data = sc.textFile("hdfs://localhost:8020/pyspark/vectorsvm")
parsedData = data.map(parsePoint)

model = SVMWithSGD.train(parsedData, iterations=100)
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))

model.save(sc, "pythonSVMWithSGDModel")
sameModel = SVMModel.load(sc, "pythonSVMWithSGDModel")
def main(sc, sqlContext): #start = timer() #print '---Pegando usuario, posts, tokens e categorias do MongoDB---' #start_i = timer() user = findUserById(iduser) posts = findPosts(user) tokens, category, categoryAndSubcategory = getTokensAndCategories() postsRDD = (sc.parallelize(posts).map(lambda s: (s[ 0], word_tokenize(s[1].lower()), s[2], s[3])).map(lambda p: (p[ 0], [x for x in p[1] if x in tokens], p[2], p[3])).cache()) #print '####levou %d segundos' % (timer() - start_i) #print '---Pegando produtos do MongoDB---' #start_i = timer() #print '####levou %d segundos' % (timer() - start_i) #print '---Criando corpusRDD---' #start_i = timer() stpwrds = stopwords.words('portuguese') corpusRDD = (postsRDD.map(lambda s: (s[0], [ PorterStemmer().stem(x) for x in s[1] if x not in stpwrds ], s[2], s[3])).filter(lambda x: len(x[1]) >= 20 or (x[2] == u'Post' and len(x[1]) > 0)).cache()) #print '####levou %d segundos' % (timer() - start_i) #print '---Calculando TF-IDF---' #start_i = timer() wordsData = corpusRDD.map( lambda s: Row(label=int(s[0]), words=s[1], type=s[2])) wordsDataDF = sqlContext.createDataFrame(wordsData).unionAll( sqlContext.read.parquet( "/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet")) numTokens = len(tokens) hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens) idf = IDF(inputCol="rawFeatures", outputCol="features") featurizedData = hashingTF.transform(wordsDataDF) idfModel = idf.fit(featurizedData) tfIDF = idfModel.transform(featurizedData).cache() postTFIDF = ( tfIDF.filter(tfIDF.type == u'Post') #.map(lambda s: Row(label=s[0], type=s[1], words=s[2], rawFeatures=s[3], features=s[4], sentiment=SVM.predict(s[4]))) .cache()) #postTFIDF = postTFIDF.filter(lambda p: p.sentiment == 1) #print '####levou %d segundos' % (timer() - start_i) #print '---Carregando modelo---' #start_i = timer() NB = NaiveBayesModel.load( sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria') SVM = SVMModel.load(sc, "/home/ubuntu/recsys-tcc-ml/models/svm") #print '####levou %d segundos' % (timer() - start_i) #print '---Usando o modelo---' #start_i = timer() predictions = (postTFIDF.map(lambda p: (NB.predict(p.features), p[ 0], SVM.predict(p.features))).filter(lambda p: p[2] == 1).map( lambda p: (p[0], p[1])).groupByKey().mapValues(list).collect()) #print '####levou %d segundos' % (timer() - start_i) #print '---Calculando similaridades---' #start_i = timer() suggestions = [] for prediction in predictions: category_to_use = category[int(prediction[0])] #print ' Calculando similaridades para a categoria: {}'.format(category_to_use) tf = tfIDF.filter(tfIDF.type == category_to_use).cache() for post in prediction[1]: postVector = postTFIDF.filter( postTFIDF.label == post).map(lambda x: x.features).collect()[0] sim = (tf.map(lambda x: ( post, x.label, cossine(x.features, postVector))).filter( lambda x: x[2] >= threshold).collect()) if len(sim) > 0: suggestions.append(sim) #print '####levou %d segundos' % (timer() - start_i) if len(suggestions) > 0: #print '---Inserindo recomendacoes no MongoDB---' #start_i = timer() insertSuggestions(suggestions, iduser, posts)
print("Confusion Matrix: ") print("TP = " + str(a)) print("FN = " + str(b)) print("FP = " + str(c)) print("TN = " + str(d)) print("\n") #Calculation a = np.float(a) b = np.float(b) c = np.float(c) d = np.float(d) accuracy = (a+d) / (a+b+c+d) precision = a / (a+c) recall = a / (a+b) f1 = 2*a / (2*a+b+c) print('Accuracy: %f' %accuracy) print('Precision: %f' %precision) print('Recall: %f' %recall) print('F1: %f' %f1) #save and load model model.save(sc, "/user/cloudera/hw2/results/2015310884_SVM") sameModel = SVMModel.load(sc, "/user/cloudera/hw2/results/2015310884_SVM")
def main(sc, sqlContext): #start = timer() #print '---Pegando usuario, posts, tokens e categorias do MongoDB---' #start_i = timer() user = findUserById(iduser) posts = findPosts(user) tokens, category, categoryAndSubcategory = getTokensAndCategories() postsRDD = (sc.parallelize(posts).map(lambda s: (s[0], word_tokenize(s[1].lower()), s[2], s[3])) .map(lambda p: (p[0], [x for x in p[1] if x in tokens] ,p[2], p[3])) .cache()) #print '####levou %d segundos' % (timer() - start_i) #print '---Pegando produtos do MongoDB---' #start_i = timer() #print '####levou %d segundos' % (timer() - start_i) #print '---Criando corpusRDD---' #start_i = timer() stpwrds = stopwords.words('portuguese') corpusRDD = (postsRDD.map(lambda s: (s[0], [PorterStemmer().stem(x) for x in s[1] if x not in stpwrds], s[2], s[3])) .filter(lambda x: len(x[1]) >= 20 or (x[2] == u'Post' and len(x[1])>0)) .cache()) #print '####levou %d segundos' % (timer() - start_i) #print '---Calculando TF-IDF---' #start_i = timer() wordsData = corpusRDD.map(lambda s: Row(label=int(s[0]), words=s[1], type=s[2])) wordsDataDF = sqlContext.createDataFrame(wordsData).unionAll(sqlContext.read.parquet("/home/ubuntu/recsys-tcc-ml/parquet/wordsDataDF.parquet")) numTokens = len(tokens) hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numTokens) idf = IDF(inputCol="rawFeatures", outputCol="features") featurizedData = hashingTF.transform(wordsDataDF) idfModel = idf.fit(featurizedData) tfIDF = idfModel.transform(featurizedData).cache() postTFIDF = (tfIDF .filter(tfIDF.type==u'Post') #.map(lambda s: Row(label=s[0], type=s[1], words=s[2], rawFeatures=s[3], features=s[4], sentiment=SVM.predict(s[4]))) .cache()) #postTFIDF = postTFIDF.filter(lambda p: p.sentiment == 1) #print '####levou %d segundos' % (timer() - start_i) #print '---Carregando modelo---' #start_i = timer() NB = NaiveBayesModel.load(sc, '/home/ubuntu/recsys-tcc-ml/models/naivebayes/modelo_categoria') SVM = SVMModel.load(sc, "/home/ubuntu/recsys-tcc-ml/models/svm") #print '####levou %d segundos' % (timer() - start_i) #print '---Usando o modelo---' #start_i = timer() predictions = (postTFIDF .map(lambda p: (NB.predict(p.features), p[0], SVM.predict(p.features))) .filter(lambda p: p[2]==1) .map(lambda p: (p[0], p[1])) .groupByKey() .mapValues(list) .collect()) #print '####levou %d segundos' % (timer() - start_i) #print '---Calculando similaridades---' #start_i = timer() suggestions = [] for prediction in predictions: category_to_use = category[int(prediction[0])] #print ' Calculando similaridades para a categoria: {}'.format(category_to_use) tf = tfIDF.filter(tfIDF.type==category_to_use).cache() for post in prediction[1]: postVector = postTFIDF.filter(postTFIDF.label == post).map(lambda x: x.features).collect()[0] sim = (tf .map(lambda x: (post, x.label, cossine(x.features, postVector))) .filter(lambda x: x[2]>=threshold) .collect()) if len(sim) > 0: suggestions.append(sim) #print '####levou %d segundos' % (timer() - start_i) if len(suggestions) > 0: #print '---Inserindo recomendacoes no MongoDB---' #start_i = timer() insertSuggestions(suggestions, iduser, posts)
    return LabeledPoint(newValue[0], newValue[1:])

# Start Spark
SparkContextHandler._master_ip = "10.14.24.101"
sc = SparkContextHandler.get_spark_sc()
#------------------------------------------------------------#
startTime = time()
#------------------------------------------------------------#
print("load testdata")
test = sc.textFile(
    "file:/home/spark/Documents/neil-git/dataset/oneBolt_rag/Test.txt")
testData = test.map(FrequencyDomain)
#------------------------------------------------------------#
print("load model")
Model = SVMModel.load(sc, "hdfs:///home/spark/Desktop/FNOModel")

print("First Prediction (Normal or unNormal)")
# labelsAndPreds = Model.predict(testData)
# labelsAndPreds = testData.map(lambda p: (Model.predict(p),p))
labelsAndPreds = testData.map(lambda p: (p.label, Model.predict(p.features), p.features))
TotalAmount = float(testData.count())
temp = labelsAndPreds.filter(lambda p: p[1] == 0)
oneBoltAmount = labelsAndPreds.filter(lambda p: p[0] == 1).count()
ragAmount = labelsAndPreds.filter(lambda p: p[0] == 0).count()
print("Normal or unNormal:", temp.count() / TotalAmount)

print("Second Prediction (oneBolt or rag)")
Model2 = SVMModel.load(sc, "hdfs:///home/spark/Desktop/FORModel")
temp2 = temp.map(lambda p: (p[0], Model2.predict(p[2])))
oneBoltResult = temp2.filter(lambda p: p[0] == p[1] and p[1] == 1)