def Prediction(self, modelType):
    data_point = self.Features
    model_path = self.baseDir + '/fraudModel/Model/' + modelType
    if modelType == 'RF':
        model = RandomForestModel.load(self.sc, model_path)
    elif modelType == 'GBDT':
        model = GradientBoostedTreesModel.load(self.sc, model_path)
    elif modelType in ('LRsgd', 'LRlbfgs'):
        model = LogisticRegressionModel.load(self.sc, model_path)
    elif modelType == 'SVM':
        model = SVMModel.load(self.sc, model_path)
    else:
        return
    result = np.array(model.predict(self.sc.parallelize(data_point)).collect())
    self.df_PD.insert(len(list(self.df_PD.columns)), 'result', result)
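# A minimal alternative sketch of the same dispatch (not from the original code),
# assuming the attributes used above (self.sc, self.baseDir, self.Features,
# self.df_PD) and the standard pyspark.mllib loader classes. It replaces the
# if/elif chain with a lookup table; `predict_with` is a hypothetical name.
import numpy as np
from pyspark.mllib.tree import RandomForestModel, GradientBoostedTreesModel
from pyspark.mllib.classification import LogisticRegressionModel, SVMModel

_LOADERS = {
    'RF': RandomForestModel,
    'GBDT': GradientBoostedTreesModel,
    'LRsgd': LogisticRegressionModel,
    'LRlbfgs': LogisticRegressionModel,
    'SVM': SVMModel,
}

def predict_with(scorer, modelType):
    loader = _LOADERS.get(modelType)
    if loader is None:
        return  # unknown model type: keep the original no-op behaviour
    model = loader.load(scorer.sc, scorer.baseDir + '/fraudModel/Model/' + modelType)
    result = np.array(model.predict(scorer.sc.parallelize(scorer.Features)).collect())
    scorer.df_PD.insert(len(scorer.df_PD.columns), 'result', result)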
def Load_Regression_Model():
    model_1 = LogisticRegressionModel.load(sc, "./logistc_1.model")
    model_2 = LogisticRegressionModel.load(sc, "./logistc_2.model")
    model_3 = LogisticRegressionModel.load(sc, "./logistc_3.model")
    model_list = [model_1, model_2, model_3]
    return model_list
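# Illustrative (hypothetical) use of the three loaded models as a simple
# majority-vote ensemble; `features` is a placeholder dense feature vector whose
# length must match what the models were trained on.
def ensemble_predict(model_list, features):
    votes = sum(m.predict(features) for m in model_list)
    return 1 if votes >= 2 else 0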
def load_parameters(self):
    self.amount_prediction_method = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='amount_method')
    self.trend_prediction_method = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='trend_method')
    self.data_features = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='features')
    self.stock_symbol = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='symbol')
    self.data_parser = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='data_parser')
    amount_model_path = os.path.join(os.path.abspath(self.model_path), 'amount_model')
    trend_model_path = os.path.join(os.path.abspath(self.model_path), 'trend_model')

    if self.amount_prediction_method == self.RANDOM_FOREST:
        amount_model = RandomForestModel.load(sc=self.sc, path=amount_model_path)
    elif self.amount_prediction_method == self.LINEAR_REGRESSION:
        amount_model = LinearRegressionModel.load(sc=self.sc, path=amount_model_path)
    else:
        amount_model = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='amount_model')

    if self.trend_prediction_method == self.RANDOM_FOREST:
        trend_model = RandomForestModel.load(sc=self.sc, path=trend_model_path)
    elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
        trend_model = LogisticRegressionModel.load(sc=self.sc, path=trend_model_path)
    elif self.trend_prediction_method == self.NAIVE_BAYES:
        trend_model = NaiveBayesModel.load(sc=self.sc, path=trend_model_path)
    elif self.trend_prediction_method == self.SVM:
        trend_model = SVMModel.load(sc=self.sc, path=trend_model_path)
    else:
        trend_model = self.load_data_from_file(data_type=self.SAVE_TYPE_MODEL, file_name='trend_model')

    return trend_model, amount_model
def model_instream(sc, **params):
    fs = sc._jvm.org.apache.hadoop.fs.FileSystem.get(sc._jsc.hadoopConfiguration())
    model_path = HDFS_PATH + str(g_cache.user) + '/model/' + params['path']
    if not fs.exists(sc._jvm.org.apache.hadoop.fs.Path(model_path)):
        raise Exception("Invalid file path, path does not exist!")
    if params['type'] == 'kmeans':
        model = KMeansModel.load(sc, model_path)
    elif params['type'] == 'fpgrowth':
        model = FPGrowthModel.load(sc, model_path)
    elif params['type'] == 'logistic-regression':
        model = LogisticRegressionModel.load(sc, model_path)
    elif params['type'] == 'word2vec':
        model = Word2VecModel.load(sc, model_path)
    elif params['type'] == 'decision-tree':
        model = DecisionTreeModel.load(sc, model_path)
    else:
        raise Exception("Invalid model type!")
    return True, model
def prediction(model_directory, libsvm_file, outputfile):
    sc = SparkContext(appName="PythonLinearRegressionWithSGDExample")
    model = LogisticRegressionModel.load(sc, model_directory)
    # print "numfeature", model.numFeatures
    vectors = MLUtils.loadLibSVMFile(sc, libsvm_file, numFeatures=model.numFeatures)
    vectors.cache()
    model.clearThreshold()
    # vector = vectors.collect()
    # for v in vector:
    #     features = v.features
    #     print features
    #     print "bbbb", len(features), model.predict(Vectors.dense(features))
    # exit()
    scores = vectors.map(lambda p: model.predict(Vectors.dense(p.features)))
    # lambda p: (p.label, model.predict(p.features))
    scores_list = scores.collect()
    file_out_obj = open(outputfile, 'w')
    for score in scores_list:
        # print '----->', score
        file_out_obj.write(str(score) + '\n')
    file_out_obj.close()
def main():
    parser = argparse.ArgumentParser(description='Park or Bird Prediction Engine')
    parser.add_argument('--i', '--input', type=str, required=True, default=None,
                        help='Input file or directory of jpg images')
    parser.add_argument('--m', '--method', type=int, required=True, default=None,
                        help='Model method, 1 or 2')
    args = parser.parse_args()

    outfile = '/gpfs/gpfsfpo/prediction/predict_me.txt.gz'
    os.system('rm -f ' + outfile)

    sc = SparkContext(appName="Park Bird Prediction Model 1")

    # --method is parsed as an int so these comparisons work; default to method 2
    args.m = args.m if args.m in [1, 2] else 2
    model_path = '/gpfs/gpfsfpo/shared/model_1_LBFGS' if args.m == 1 else '/gpfs/gpfsfpo/shared/model_2'

    CreateTestData(args.i, args.m, outfile)

    raw_input = sc.textFile(outfile)
    k = raw_input.map(lambda x: x.split(',')[0])
    p = raw_input.map(lambda x: x.split(',')[1]) \
        .map(lambda x: x.split(' ')) \
        .map(lambda x: [float(y) for y in x]) \
        .map(lambda x: Vectors.dense(x))

    model = LogisticRegressionModel.load(sc, model_path)
    predictions = model.predict(p)
    keyPredictions = k.zip(predictions.map(lambda x: "IT'S A BIRD!" if x == 1 else "IT'S A PARK!"))

    print("************* RESULTS *******************")
    print(keyPredictions.collect())

    sc.stop()
def main(sc):
    data = [
        LabeledPoint(0.0, [0.0, 1.0]),
        LabeledPoint(1.0, [1.0, 0.0])
    ]
    lrm = LogisticRegressionWithSGD.train(sc.parallelize(data), iterations=10)
    print(lrm.predict([1.0, 0.0]))
    print(lrm.predict([0.0, 1.0]))

    # Save and load model
    lrm.save(sc, "lrsgd")
    sameModel = LogisticRegressionModel.load(sc, "lrsgd")
    print(sameModel.predict([1.0, 0.0]))
    print(sameModel.predict([0.0, 1.0]))
def __init__(self, path):
    conf = SparkConf() \
        .setAppName("crankshaw-pyspark") \
        .set("spark.executor.memory", "2g") \
        .set("spark.kryoserializer.buffer.mb", "128") \
        .setMaster("local")
    sc = SparkContext(conf=conf, batchSize=10)
    self.model = LogisticRegressionModel.load(sc, path)
    # self.model = RandomForestModel.load(sc, path)
    self.path = path
    # path = '/Users/crankshaw/model-serving/tugboat/feature_servers/python/spark_model'
    # self.name, self.model = load_pyspark_model(path)
    print("started spark")
def testing_model(model_directory, libsvm, prediction, report, prc_file):
    sc = SparkContext(appName="PythonLinearRegressionWithSGDExample")
    model = LogisticRegressionModel.load(sc, model_directory)
    testing_rdd = MLUtils.loadLibSVMFile(sc, libsvm, numFeatures=model.numFeatures)
    testing_rdd.cache()
    au_prc, precision, recall, thresholds, y_true, y_scores = evaluate_model(testing_rdd, model)
    print 'evaluating_model done!\n'
    write_to_report(au_prc, precision, recall, thresholds, report)
    print 'write_to_report done!\n'
    write_to_prediction(y_true, y_scores, prediction)
    print 'write_to_prediction done!\n'
    draw_prc(precision, recall, prc_file, au_prc)
    print 'draw_prc done!\n'
def main():
    # spark = SparkSession.builder.master("yarn").appName("spark_demo").getOrCreate()
    spark = SparkSession.builder.getOrCreate()
    print "Session created!"
    sc = spark.sparkContext
    print "The url to track the job: http://namenode-01:8088/proxy/" + sc.applicationId

    print sys.argv
    sampleHDFS_1 = sys.argv[1]
    sampleHDFS_2 = sys.argv[2]
    outputHDFS = sys.argv[3]

    sampleRDD = sc.textFile(sampleHDFS_1).map(parse)
    predictRDD = sc.textFile(sampleHDFS_2).map(lambda x: parse(x, True))

    # Train
    model = LogisticRegressionWithLBFGS.train(sampleRDD)
    model.clearThreshold()  # remove the default threshold (otherwise only 0/1 is output)

    # Predict and save the results
    labelsAndPreds = predictRDD.map(lambda p: (p[0], p[1].label, model.predict(p[1].features)))
    labelsAndPreds.map(lambda p: '\t'.join(map(str, p))).saveAsTextFile(outputHDFS + "/target/output")

    # Evaluate precision and recall at different thresholds
    labelsAndPreds_label_1 = labelsAndPreds.filter(lambda lp: int(lp[1]) == 1)
    labelsAndPreds_label_0 = labelsAndPreds.filter(lambda lp: int(lp[1]) == 0)
    t_cnt = labelsAndPreds_label_1.count()
    f_cnt = labelsAndPreds_label_0.count()
    print "thre\ttp\ttn\tfp\tfn\tprecision\trecall"
    for thre in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        tp = labelsAndPreds_label_1.filter(lambda lp: lp[2] > thre).count()
        tn = t_cnt - tp
        fp = labelsAndPreds_label_0.filter(lambda lp: lp[2] > thre).count()
        fn = f_cnt - fp
        print("%.1f\t%d\t%d\t%d\t%d\t%.4f\t%.4f" %
              (thre, tp, tn, fp, fn, float(tp) / (tp + fp), float(tp) / t_cnt))

    # Save the model, then reload it
    model.save(sc, outputHDFS + "/target/tmp/pythonLogisticRegressionWithLBFGSModel")
    sameModel = LogisticRegressionModel.load(
        sc, outputHDFS + "/target/tmp/pythonLogisticRegressionWithLBFGSModel")

    print "output:", outputHDFS
def do_stuff(self, parameters):
    # parameters.values() is a JVM (Scala) list, so walk it with head()/tail()
    val = parameters.values()
    jlist = val.head()
    size = jlist.size()
    pylist = []
    count = 0
    while count < size:
        pylist.append(jlist.head())
        count = count + 1
        jlist = jlist.tail()
    heat = pylist[0]
    km = pylist[1]
    lrm = LogisticRegressionModel.load(self.context, "/tmp/brakeModel")
    worn = lrm.predict([km, heat])
    return ("brake is worn=", worn)
def create_or_load_model(sc: SparkContext, train_dataset_path: str) -> LogisticRegressionModel:
    if not os.path.exists(MODEL_PATH):
        print('training model...')
        dataset_rdd = sc.textFile(train_dataset_path)
        table_rdd = dataset_rdd.map(lambda line: line.split(','))
        labeled_features = rdd_to_feature(table_rdd)
        # labeled_features.foreach(lambda lp: print(lp))
        labeled_features.cache()
        model = LogisticRegressionWithLBFGS.train(labeled_features, numClasses=NUM_CLASSES)
        model.setThreshold(0.5)
        model.save(sc, MODEL_PATH)
        return model
    else:
        model = LogisticRegressionModel.load(sc, MODEL_PATH)
        return model
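# Hypothetical driver code for create_or_load_model above; MODEL_PATH,
# NUM_CLASSES and rdd_to_feature are assumed to be defined elsewhere in the
# original module, and the dataset path and feature vector are placeholders.
from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext(appName="create-or-load-lr")
    model = create_or_load_model(sc, "hdfs:///data/train.csv")  # path is illustrative
    print(model.predict([0.0, 1.0, 0.5]))  # assumes a model trained on 3 features
    sc.stop()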
def main(sc):
    # Load and parse the data
    def parsePoint(line):
        values = [float(x) for x in line.split(' ')]
        return LabeledPoint(values[0], values[1:])

    data = sc.textFile("logistic_regression_data.txt")
    parsedData = data.map(parsePoint)

    # Build the model
    model = LogisticRegressionWithLBFGS.train(parsedData)

    # Evaluate the model on training data
    labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))

    # Save and load model
    model.save(sc, "myModelPath")
    sameModel = LogisticRegressionModel.load(sc, "myModelPath")
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonLogisticRegressionWithLBFGSExample")

    # $example on$
    # Load and parse the data
    def parsePoint(line):
        values = [float(x) for x in line.split(' ')]
        return LabeledPoint(values[0], values[1:])

    data = sc.textFile("data/mllib/sample_svm_data.txt")
    parsedData = data.map(parsePoint)

    # Build the model
    model = LogisticRegressionWithLBFGS.train(parsedData)

    # Evaluating the model on training data
    labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))

    # Save and load model
    model.save(sc, "target/tmp/pythonLogisticRegressionWithLBFGSModel")
    sameModel = LogisticRegressionModel.load(sc,
                                             "target/tmp/pythonLogisticRegressionWithLBFGSModel")
    # $example off$
from pyspark.streaming import StreamingContext

ssc = StreamingContext(sc, 10)

kafka_configuration_params = {
    "topic": ["BigData"],
    "connectionstring": "localhost:9092"
}

from pyspark.streaming.kafka import KafkaUtils
directKafkaStream = KafkaUtils.createDirectStream(
    ssc, kafka_configuration_params["topic"],
    {"metadata.broker.list": kafka_configuration_params["connectionstring"]})

from pyspark.mllib.classification import SVMModel, LogisticRegressionModel, NaiveBayesModel
LR_model = LogisticRegressionModel.load(sc, "../../notebooks/LR_model")
SVM_model = SVMModel.load(sc, "../../notebooks/SVM_model")
NB_model = NaiveBayesModel.load(sc, "../../notebooks/NB_model")

import nltk
import random
from nltk.tokenize import word_tokenize

allowed_word_types = ["JJ"]

rdd_all_words = sc.textFile("../../notebooks/all_words/part-00000")
rdd_broadcast_all_words = sc.broadcast(rdd_all_words.collect())

def convert_tweet_to_instance(tweets):
def train_model(conf):
    sc = SparkUtil.get_spark_context(conf.spark_conf)
    conf.output_dir = conf.output_dir.replace("file:", "")
    conf.output_dir = "file://{0}".format(conf.output_dir)

    labeled = Evaluate.load_all(sc, conf). \
        map(lambda b: LabeledPoint(label=1.0 if b.fact else 0.0,
                                   features=[b.paraDist, b.sentDist, b.docDist]))
    # labeled = sc.parallelize([ round((x/10) * 9) for x in random.sample(range(1, 100000000), 30000) ]). \
    #     map(lambda b: LabeledPoint(1.0 if b % 2 == 0 else 0.0,
    #                                [ b, b * 2, b * 9 ]))
    # print (labeled.collect())

    train, test = labeled.randomSplit(weights=[0.8, 0.2], seed=12345)
    count = train.count()

    start = time.time()
    model = LogisticRegressionWithLBFGS.train(train)
    elapsed = time.time() - start
    print ("Trained model on training set of size {0} in {1} seconds".format(count, elapsed))

    start = time.time()
    model_path = os.path.join(conf.output_dir, "eval", "model")
    file_path = model_path.replace("file://", "")
    if os.path.isdir(file_path):
        print ("Removing existing model {0}".format(file_path))
        shutil.rmtree(file_path)
    model.save(sc, model_path)
    sameModel = LogisticRegressionModel.load(sc, model_path)
    elapsed = time.time() - start
    print ("Saved and restored model to {0} in {1} seconds".format(model_path, elapsed))

    # Metrics (error rate of the trained model on the held-out test split)
    labelsAndPreds = test.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(test.count())
    print ("Training Error => {0}".format(trainErr))

    predictionsAndLabels = labelsAndPreds.map(lambda x: (float(x[1]), float(x[0])))
    metrics = MulticlassMetrics(predictionsAndLabels)
    print (" --------------> {0}".format(predictionsAndLabels.take(1000)))
    # print (labelsAndPreds.collect())

    print ("\nMETRICS:")
    try:
        print ("false positive (0.0): {0}".format(metrics.falsePositiveRate(0.0)))
        print ("false positive (1.0): {0}".format(metrics.falsePositiveRate(1.0)))
    except:
        traceback.print_exc()
    try:
        print ("precision : {0}".format(metrics.precision(1.0)))
    except:
        traceback.print_exc()
    try:
        print ("recall : {0}".format(metrics.recall(1.0)))
    except:
        traceback.print_exc()
    try:
        print ("fMeasure : {0}".format(metrics.fMeasure(0.0, 2.0)))
    except:
        traceback.print_exc()

    print ("confusion matrix : {0}".format(metrics.confusionMatrix().toArray()))
    print ("precision : {0}".format(metrics.precision()))
    print ("recall : {0}".format(metrics.recall()))
    print ("weighted false pos : {0}".format(metrics.weightedFalsePositiveRate))
    print ("weighted precision : {0}".format(metrics.weightedPrecision))
    print ("weighted recall : {0}".format(metrics.weightedRecall))
    print ("weight f measure : {0}".format(metrics.weightedFMeasure()))
    print ("weight f measure 2 : {0}".format(metrics.weightedFMeasure(2.0)))
    print ("")

    # Regression metrics
    predictedAndObserved = test.map(lambda p: (model.predict(p.features) / 1.0, p.label / 1.0))
    regression_metrics = RegressionMetrics(predictedAndObserved)
    print ("explained variance......: {0}".format(regression_metrics.explainedVariance))
    print ("absolute error..........: {0}".format(regression_metrics.meanAbsoluteError))
    print ("mean squared error......: {0}".format(regression_metrics.meanSquaredError))
    print ("root mean squared error.: {0}".format(regression_metrics.rootMeanSquaredError))
    print ("r2......................: {0}".format(regression_metrics.r2))
    print ("")

    labelsAndPreds = test.map(lambda p: (p.label, sameModel.predict(p.features)))
    testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(test.count())
    print ("Testing Error => {0}".format(testErr))
from pyspark.mllib.linalg import Vectors

# create the spark-context
sc = SparkContext('local', 'pyspark')

# load and parse the data
data = sc.textFile("hdfs:///user/events/test_data_spark/")
splits = data.map(lambda line: line.split(',')).filter(lambda x: x[5] != '\\N')

# the user-event label
user_event = splits.map(lambda fields: (fields[1], fields[2]))

# extract the features
features = splits.map(lambda fields: Vectors.dense(fields[3:]))

# load the model
sameModel = LogisticRegressionModel.load(sc, "hdfs:///user/events/model/LR")

# predict the users-interest
predictions = sameModel.predict(features.map(lambda p: (p[0:])))

# re-organize the label-prediction
label_prediction = user_event.zip(predictions).map(
    lambda ((userid, eventid), interested): (int(userid), int(eventid), interested))

# create the header
header = sc.parallelize(["user,event,interested"])

# wrap the label-prediction
lines = label_prediction.map(lambda v: (str(v[0]) + "," + str(v[1]) + "," + str(v[2])))

# save the result into HDFS
(header + lines).repartition(1).saveAsTextFile("hdfs:///user/events/predictions")
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from csv import reader  # reader() below parses each partition's lines as CSV rows

def parsePoint(line):
    values = [float(x) for x in line]
    return LabeledPoint(values[-1] if values[-1] == 1 else 0, values[:-1])

data = sc.textFile('mergedAB_delete_all_empty.csv')
data = data.mapPartitions(lambda x: reader(x))
# header = data.first()
# data = data.filter(lambda x: x != header)
data = data.filter(lambda x: x[-1] in ['1', '-1'])
parsedData = data.map(parsePoint)

# Build the model
model_lr = LogisticRegressionWithLBFGS.train(parsedData)

# Evaluating the model on training data
labelsAndPreds_lr = parsedData.map(lambda p: (p.label, model_lr.predict(p.features)))
trainErr = labelsAndPreds_lr.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))

# Save and load model
model_lr.save(sc, "myModelPath")
sameModel = LogisticRegressionModel.load(sc, "myModelPath")
# get the command-line arguments
args = get_cli_args()

# create a label encoder from a local json file that contains the set
l_encoder = label_encoders_from_json_file(args.labels, args.category)

# --------------- Choose the Operation to perform ---------------- #
if args.operation.lower() == "train":
    # get/create spark context
    sc = get_spark_context("Train/Update LR Model")

    # load initial weights if it's an update operation
    init_model = None
    if args.update:
        print("---> Loading model")
        init_model = LogisticRegressionModel.load(sc, args.model)
        print("---> OK")

    # do the train job
    model = perform_train_job(sc, args.input, l_encoder,
                              initial_model=init_model,
                              evaluate=args.evaluate,
                              category=args.category)

    # save the model weights as a csv file
    try:
        system("hdfs dfs -rm -r " + args.model)
    except:
        print "Failed to delete model: ", args.model

    print("---> Saving LR model")
    model.save(sc, args.model)
    print("---> OK")
getID = UserDefinedFunction(lambda x: parse_tweet(x)[0], StringType())
getTs = UserDefinedFunction(lambda x: parse_tweet(x)[1], StringType())
getTweet = UserDefinedFunction(lambda x: parse_tweet(x)[2], StringType())

# Apply the UDFs using withColumn; tweets is now a pandas df
tweets = (tweets.withColumn('id', getID(col("data")))
                .withColumn('ts', getTs(col("data")))
                .withColumn('Tweet', getTweet(col("data")))).toPandas()

# convert tweets pandas df into input tensor for logistic regression model
# df = pd.read_csv()
input_tensor = create_input_tensor(tweets)

# load MLlib model
sameModel = LogisticRegressionModel.load(sc, "hdfs:///user/project/llib_logistic.model")

# retrieve sentiments from input tensor using model
tweet_f = input_tensor.to_numpy()
pred = sameModel.predict(tweet_f)

# create DF to send to dashboard
dashboard_df = pd.DataFrame()
dashboard_df['tweet'] = tweets['Tweet']
dashboard_df['ts'] = tweets['ts']
dashboard_df['prediction'] = pred
print(pred)

# send created DF to dashboard
# send_df_to_dashboard(df)

# button action = 'exec(python capture.py' + userinput + ')' runs this in EMR instance,
# Initialize features to <number of sensors>-length array, filled with neutral initial sensor value
features = np.zeros(n_sensors)
features.fill(0.5)

# Initialize streaming for specified reporting interval
sc = SparkContext(appName="iotstream_lr_kafka")
interval = sc.accumulator(0)
empty_intervals = sc.accumulator(0)
events = sc.accumulator(0)
ssc = StreamingContext(sc, reporting_interval)
sensor_stream = KafkaUtils.createDirectStream(ssc, [topic], {"bootstrap.servers": kafka_server_list})

# Load pre-computed model
model = LogisticRegressionModel.load(sc, modelname)

# Run model on each batch
# sensor_stream.pprint(10)
sensor_stream.foreachRDD(run_model)

# Start reading streaming data
ssc.start()
start_time = time()
ssc.awaitTermination()
finish_time = time()
# Subtract off time waiting for events and 1.5 sec for termination
elapsed_time = finish_time - start_time - empty_intervals.value * reporting_interval - 1.5
print(
    '\n%s.%03dZ: %d events received in %.1f seconds (%d intervals), or %.0f sensor events/second\n'
    % (strftime("%Y-%m-%dT%H:%M:%S", gmtime()), (time() * 1000) % 1000,
       events.value - 1, elapsed_time, interval.value,
    .map(lambda line: line.split(',')) \
    .filter(lambda fields: fields[0] in testdays and \
            fields[22] != '')

# these are the fields we used in the regression
# format is LabeledPoint(label, [x1, x2, ...])
flights = allfields.map(lambda fields: LabeledPoint(
    float(float(fields[22]) < 15),  # ontime
    [
        float(fields[15]),  # DEP_DELAY
        float(fields[16]),  # TAXI_OUT
        float(fields[26]),  # DISTANCE
    ]))

# read the saved model
lrmodel = LogisticRegressionModel.load(sc, 'gs://cloud-training-demos/flights/sparkoutput/model')
print lrmodel.weights, lrmodel.intercept

# how good is the fit?
lrmodel.setThreshold(0.7)  # cancel if prob-of-ontime < 0.7
labelpred = flights.map(lambda p: (p.label, lrmodel.predict(p.features)))

def eval(labelpred):
    cancel = labelpred.filter(lambda (label, pred): pred == 1)
    nocancel = labelpred.filter(lambda (label, pred): pred == 0)
    corr_cancel = cancel.filter(lambda (label, pred): label == pred).count()
    corr_nocancel = nocancel.filter(lambda (label, pred): label == pred).count()
    return {'total_cancel': cancel.count(),
            'correct_cancel': float(corr_cancel) / cancel.count(),
            'total_noncancel': nocancel.count(),
            'correct_noncancel': float(corr_nocancel) / nocancel.count()
            }
# In[7]:

labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
labelsAndPreds.sample(False, 0.03).collect()

# In[8]:

trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))

# In[10]:

model.save(sc, "model_ex2.mod")
sameModel = LogisticRegressionModel.load(sc, "model_ex2.mod")

# In[11]:

from pyspark.mllib.stat import Statistics
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix

feats = parsedData.map(lambda p: p.features)
Mdata = RowMatrix(parsedData.map(lambda p: p.features))

# In[12]:

feats.take(4)

# In[13]:
# geolocator = Nominatim()
sparkConf = SparkConf().setMaster("local").setAppName("Predict").set("spark.app.id", "Predict")
sc = SparkContext(conf=sparkConf)

inp = sc.textFile("testing.txt").map(lambda row: row.split(" "))
word2vec = Word2Vec()
model = word2vec.fit(inp)

WordVectors = {}
for i in model.getVectors().keys():
    WordVectors[i] = model.findSynonyms(i, 7)

Positive = open(os.getcwd() + "/positive.txt").read().splitlines()
Negative = open(os.getcwd() + "/negative.txt").read().splitlines()

sameModel = LogisticRegressionModel.load(sc, "model")

from pymongo import MongoClient
client = MongoClient('localhost:27017')
db = client.test

tweetList = []
tweets = db.tweetdb.find()
for tweet in tweets:
    l = []
    entity_names = ""
    loc = ""
    if tweet.get('place'):
        loc = str(tweet['place']['full_name']).split(',')[-1]
        print loc
    text = tweet['text']
from pyspark import SparkConf

if __name__ == '__main__':
    print("This is the name of the script: ", sys.argv[0])
    print("Number of arguments: ", len(sys.argv))
    print("The arguments are: ", str(sys.argv))

    queryInputPath = sys.argv[1]
    savedModelPath = sys.argv[2]

    conf = SparkConf()
    conf.setAppName("SpamDetection")
    sc = SparkContext.getOrCreate(conf=conf)

    model = LogisticRegressionModel.load(sc, savedModelPath)
    query = sc.textFile(queryInputPath, use_unicode=False)
    tf = HashingTF(numFeatures=1000)

    def classify(data):
        data2 = data.split()
        datatf = tf.transform(data2)
        classifications = model.predict(datatf)
        return classifications

    classifications = query.map(lambda x: (classify(x), x))
    predictions = classifications.collect()
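    # A possible continuation (not part of the original script): persist the
    # collected (label, query) pairs locally; the output file name is a placeholder.
    with open("spam_predictions.tsv", "w") as out:
        for label, text in predictions:
            if isinstance(text, bytes):  # textFile was read with use_unicode=False
                text = text.decode("utf-8", "replace")
            out.write("%d\t%s\n" % (label, text))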
data = sc.textFile("sample_svm_data.txt") # 注意这里要用LabeledPoint来生成feature列 parsedData = data.map(parsePoint) # 使用LBFGS进行优化 model = LogisticRegressionWithLBFGS.train(parsedData) # Evaluating the model on training data labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features))) trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float( parsedData.count()) print("Training Error = " + str(trainErr)) # Save and load model model.save(sc, "pythonLogisticRegressionWithLBFGSModel") sameModel = LogisticRegressionModel.load( sc, "pythonLogisticRegressionWithLBFGSModel") ################################################################### # 随机森林 from pyspark.mllib.tree import RandomForest, RandomForestModel from pyspark.mllib.util import MLUtils data = MLUtils.loadLibSVMFile(sc, 'sample_libsvm_data.txt') (trainingData, testData) = data.randomSplit([0.7, 0.3]) # Setting featureSubsetStrategy="auto" lets the algorithm choose. # Maximum number of bins used for splitting features. 特征值的最大箱数,分割区间数 model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={}, numTrees=3,
from geopy.geocoders import Nominatim
import json
from Levenstein import Lev

stop_words = nltk.corpus.stopwords.words('english')
stop_words += ['?', '.', '!', ',']

geolocator = Nominatim()
sparkConf = SparkConf().setMaster("local").setAppName("PredictKafkaTweetStreaming").set("spark.app.id", "Predict")
sc = SparkContext(appName="PythonSparkStreamingKafka_RM_01")
sc.setLogLevel("WARN")

with open('WordVectors.json') as data_file:
    WordVectors = json.load(data_file)

Positive = open(os.getcwd() + "/positive.txt").read().splitlines()
Negative = open(os.getcwd() + "/negative.txt").read().splitlines()

sameModel = LogisticRegressionModel.load(sc, "model")

def MakeTuple(l):
    tweet = json.loads(l)
    loc = ""
    coords = ""
    # print tweet
    if 'coordinates' in tweet and not (tweet["coordinates"] is None):
        # locAll = str(tweet["place"]["full_name"])
        # loc = locAll.split(',')[-1]
        # location = geolocator.geocode(str(tweet['place']['full_name']))
        # coords = str(str(location.longitude) + ',' + str(location.latitude))
        coords = tweet["coordinates"]
    try:
        text = tweet["text"]
    except KeyError:
from pyspark.mllib.linalg import Vectors

SparkContext.setSystemProperty('spark.rdd.compress', config.get('spark', 'spark_rdd_compress'))
SparkContext.setSystemProperty('spark.driver.maxResultSize', config.get('spark', 'spark_driver_maxResultSize'))
SparkContext.setSystemProperty('spark.executor.memory', args.exe_memory)
SparkContext.setSystemProperty('spark.cores.max', args.core_max)

sc = SparkContext(args.sp_master, 'single_predict:' + str(args.row_id))

flag_model = ml_opts['learning_algorithm']
save_dir = config.get('app', 'HADOOP_MASTER') + config.get('app', 'HDFS_MODEL_DIR') + '/' + row_id_str

if flag_model == "linear_svm_with_sgd":
    mllib_model = SVMModel.load(sc, save_dir)
    col_num = len(mllib_model.weights)
elif flag_model == "logistic_regression_with_lbfgs" or flag_model == "logistic_regression_with_sgd":
    mllib_model = LogisticRegressionModel.load(sc, save_dir)
    col_num = mllib_model.numFeatures  # len(mllib_model.weights) returns 3x value
elif flag_model == "kmeans":
    mllib_model = KMeansModel.load(sc, save_dir)
    col_num = len(mllib_model.clusterCenters[0])
else:
    print "ERROR: Training model selection error: no valid ML model selected!"
    return

# get the model dimension
# col_num = len(mllib_model.weights)
print "INFO: total feature # in mllib model=", col_num

# calculate hypothesis value ================
model_weight = None
if learning_algorithm not in ("kmeans",):
    model_weight = mllib_model.weights
# Save and load model
# model.save(sc, "myModelPath")
# sameModel = SVMModel.load(sc, "myModelPath")

# In[ ]:

from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint

# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:])

data = sc.textFile("data/mllib/sample_svm_data.txt")
parsedData = data.map(parsePoint)

# Build the model
model = LogisticRegressionWithLBFGS.train(parsedData)

# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))

# Save and load model
model.save(sc, "myModelPath2")
sameModel = LogisticRegressionModel.load(sc, "myModelPath2")
conf = SparkConf().setAppName("TFIDF").set("spark.executor.memory", "2g") sc = SparkContext(conf=conf) place = "/Users/daniellenash/Downloads/goodValidation/" placeAdd = ["goodValidation1.txt","goodValidation2.txt", "goodValidation3.txt","goodValidation4.txt","goodValidation5.txt","goodValidation6.txt", "goodValidation7.txt","goodValidation8.txt","goodValidation9.txt","goodValidation10.txt"] place2 = "/Users/daniellenash/Downloads/badValidation/" placeAdd2 = ["badValidation1.txt","badValidation2.txt", "badValidation3.txt","badValidation4.txt","badValidation5.txt","badValidation6.txt", "badValidation7.txt","badValidation8.txt","badValidation9.txt","badValidation10.txt"] hashingTF = HashingTF(100000) model = LogisticRegressionModel.load(sc, "/Users/daniellenash/Downloads/spark-1.6.1-bin-hadoop2.6/python/LRModel") for c in range(0,10): currentPlace = place +""+ placeAdd[c] documents = sc.textFile(currentPlace).filter(lambda x : len(x) > 15) docTokens = documents.map(lambda x: x.split(" ")) tf = hashingTF.transform(docTokens) idf = IDF(minDocFreq=5).fit(tf) tfidf = idf.transform(tf) val = model.predict(tfidf) mapped = val.map(lambda x: (x,1))
split = Tokenizer(inputCol="text", outputCol="words")
wordsData = split.transform(train_hive_info)

# Add the TF feature column
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=2**10)
TF_data = hashingTF.transform(wordsData)

# Add the IDF feature column
idf = IDF(inputCol="rawFeatures", outputCol="features").fit(TF_data)

if model_name == 'NaiveBayes':
    model = NaiveBayesModel.load(sc, model_path)
elif model_name == 'LogisticRegression':
    model = LogisticRegressionModel.load(sc, model_path)
else:
    model = RandomForestModel.load(sc, model_path)
my_print('Model loaded.......')

# 1. Create the Kafka source as multiple DStreams
numStreams = 3
kafkaStreams = [
    KafkaUtils.createStream(ssc, "localhost:2181", b"streaming_group", {b"streaming": 1})
    for _ in range(numStreams)
]
kafkaStreams = ssc.union(*kafkaStreams)
my_print('Kafka data source connected......')

# 2. Create the Kafka source in Direct mode
""" symbolic_indexes = [5, 7, 12, 18, 21] clean_line_split = [item for i, item in enumerate (line_split) if i in symbolic_indexes] #Cancelled becomes the 5th column now, and total columns in the data = 5 label = clean_line_split[4] nonLabel = clean_line_split[0:4] return LabeledPoint (label, nonLabel) parsedData = raw_data.map (parsePoint) #divide training and test data by 70-30 rule (training, test) = parsedData.randomSplit ([0.7, 0.3], seed=11L) #start timer at this point startTime = datetime.now() #build the model""" model = LogisticRegressionWithLBFGS.train (training, numClasses=3) training.cache () #evaluate the model on training data labelAndPreds = test.map (lambda x: (x.label, model.predict (x.features))) trainErr = labelAndPreds.filter (lambda (w, x): w != x).count () / float (test.count ()) print ('Time consumed = '), (datetime.now() - startTime) print ("Training error = " + str (trainErr)) #save and load model model.save (sc, "LRN-2008") sameModel = LogisticRegressionModel.load (sc, "LRN-2008") sc.stop ()
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys

from pyspark import SparkConf, SparkContext
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint


def parsePoint(line):
    data = line[1:][:-1]
    values = [float(x) for x in data.split(', ')]
    return LabeledPoint(1 if values[34] > 0.5 else 0, values[:-1])


conf = SparkConf() \
    .setAppName(sys.argv[0]) \
    .set("spark.executor.memory", "2g")
sc = SparkContext(conf=conf)

data = sc.textFile(sys.argv[1])
parsedData = data.map(parsePoint)

model = LogisticRegressionModel.load(sc, "Model_logistc")

labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
testErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
print("Test Error = " + str(testErr))
training.cache()

# start timer at this point
startTime = datetime.now()

# build the model
model = LogisticRegressionWithLBFGS.train(training, numClasses=3)

# evaluate the model on training data
labelAndPreds = test.map(lambda x: (x.label, model.predict(x.features)))
# labelAndPreds = testData.map(lambda x: (x.label, model.predict(x.features)))
trainErr = labelAndPreds.filter(lambda (w, x): w != x).count() / float(test.count())

print ('Time consumed = '), (datetime.now() - startTime)
print ("Training error = " + str(trainErr))

# save and load model
model.save(sc, "LRW-95-08")
sameModel = LogisticRegressionModel.load(sc, "LRW-95-08")

sc.stop()

"""metrics = MulticlassMetrics(labelAndPreds)

# Overall statistics
precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)"""
steam_key = "86FE36CEEF0FECD245B5C711C8B82C5A"
CONV64_32 = 76561197960265728
SITE_ROOT = os.path.realpath(os.path.dirname(__file__))
profile_map = {}
famous_map = {}
dota_appid = 570
cur_id32 = -1
model_map = {}
tag_name_map = defaultdict(dict)

sc = SparkContext('local')
for i in range(1, 11):
    model_name = str(i) + "_players_model.model"
    print "load " + model_name
    model = LogisticRegressionModel.load(sc, "models/" + model_name)
    model_map[i] = model

tag_name_map[1][0] = "newbie"
tag_name_map[1][1] = "normal"
tag_name_map[1][2] = "legend"
tag_name_map[1][3] = "divine"
tag_name_map[2][0] = "onlooker"
tag_name_map[2][1] = "effective assistant"
tag_name_map[2][2] = "warrior"
tag_name_map[2][3] = "main force"
tag_name_map[2][4] = "tower killer"
tag_name_map[3][0] = "pioneer"
tag_name_map[3][1] = "enemy controller"
    ]
    # Cancelled becomes the 5th column now, and total columns in the data = 5
    label = clean_line_split[4]
    nonLabel = clean_line_split[0:4]
    return LabeledPoint(label, nonLabel)

parsedData = raw_data.map(parsePoint)

# divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3], seed=11L)

# start timer at this point
startTime = datetime.now()

# build the model
"""
model = LogisticRegressionWithLBFGS.train(training, numClasses=3)
training.cache()

# evaluate the model on training data
labelAndPreds = test.map(lambda x: (x.label, model.predict(x.features)))
trainErr = labelAndPreds.filter(lambda (w, x): w != x).count() / float(test.count())

print('Time consumed = '), (datetime.now() - startTime)
print("Training error = " + str(trainErr))

# save and load model
model.save(sc, "LRN-95-08")
sameModel = LogisticRegressionModel.load(sc, "LRN-95-08")

sc.stop()
                          encoding='utf8',
                          header=True,
                          inferSchema=True)
test_data = test_data.rdd

# %%
# Convert the test-set features into vectors
test = test_data.map(lambda line: (line[0], line[1], line[2], Vectors.dense(line[3:])))

# %% [markdown]
# ## Logistic Regression

# %%
from pyspark.mllib.classification import LogisticRegressionModel

lr_model = LogisticRegressionModel.load(
    sc, "hdfs://node1:9000/user/root/exp4/models/LogisticRegressionModel")

# %%
lr_predictions = test.map(lambda line: (line[0], line[1], float(lr_model.predict(line[3]))))
lr_predictions.coalesce(1).toDF().write.options(header="true").csv(
    "hdfs://node1:9000/user/root/exp4/predictions/lr_predictions.csv")

# %% [markdown]
# Date: 2020-12-20 14:08:52  Rank: none
# score: 0.5015744

# %% [markdown]
# ## SVM

# %%
from pyspark.mllib.classification import SVMModel
def _load_pre_trained_model():
    '''Load the pre-trained LogisticRegressionModel.'''
    trained_model = LogisticRegressionModel.load(sc, "model/SGD")
    # trained_model = LogisticRegressionModel.load(sc, "model/LBFGS")
    return trained_model
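# Illustrative (hypothetical) use of _load_pre_trained_model(); assumes `sc` is
# a live SparkContext, as the function above does, and that the saved SGD model
# was trained on feature vectors of matching length.
model = _load_pre_trained_model()
model.clearThreshold()                   # return raw scores instead of 0/1 labels
score = model.predict([0.1, 0.7, 0.2])   # placeholder feature values
print(score)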
def loadLogisticRegressionSparkModel(sc, modelFileName):
    print_(nowStr() + ':', 'loading', modelFileName + '...')
    modelFileName = joinPath(sparkFolder, modelFileName)
    return LogisticRegressionModel.load(sc, modelFileName)
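# Hypothetical call site for loadLogisticRegressionSparkModel above; the model
# file name is a placeholder, and sc / print_ / nowStr come from the same module.
model = loadLogisticRegressionSparkModel(sc, 'lr.model')
print_(nowStr() + ':', 'loaded model with', model.numFeatures, 'features')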
from pyspark.mllib.regression import LabeledPoint
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="PythonLogisticRegressionWithLBFGSExample")

    # $example on$
    # Load and parse the data
    def parsePoint(line):
        values = [float(x) for x in line.split(' ')]
        return LabeledPoint(values[0], values[1:])

    data = sc.textFile("sample_svm_data.txt")
    parsedData = data.map(parsePoint)

    # Build the model
    model = LogisticRegressionWithLBFGS.train(parsedData)

    # Evaluating the model on training data
    labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))

    # Save and load model
    model.save(sc, "target/tmp/pythonLogisticRegressionWithLBFGSModel")
    sameModel = LogisticRegressionModel.load(sc,
                                             "target/tmp/pythonLogisticRegressionWithLBFGSModel")
    # $example off$

    sc.stop()
def load(self, location):
    try:
        self.model = LogisticRegressionModel.load(self.sc, location)
    except Exception as e:
        raise e
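# Sketch of how the load() method above might be called; `wrapper` stands for an
# instance of the enclosing class, constructed with a SparkContext in self.sc,
# and the HDFS path is a placeholder.
wrapper.load("hdfs:///models/logreg")
print(wrapper.model.predict([1.0, 0.0]))  # feature vector length is illustrative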