def cross_validation_lr(Data_1, Data_2, Data_3, regType, num_iter):
    # Fold 1: train on Data_1 + Data_2, test on Data_3
    model_train_1 = LogisticRegressionWithLBFGS.train(Data_1.union(Data_2), regType=regType,
                                                      iterations=num_iter, numClasses=5)
    # Evaluate model on test instances and compute test error
    predictions_1 = model_train_1.predict(Data_3.map(lambda x: x.features))
    labelsAndPredictions_1 = Data_3.map(lambda lp: lp.label).zip(predictions_1)
    testMSE_1 = labelsAndPredictions_1.map(lambda vp: (vp[0] + 0.5 - vp[1]) ** 2).sum() / \
        float(Data_3.count())

    # Fold 2: train on Data_2 + Data_3, test on Data_1
    model_train_2 = LogisticRegressionWithLBFGS.train(Data_2.union(Data_3), regType=regType,
                                                      iterations=num_iter, numClasses=5)
    predictions_2 = model_train_2.predict(Data_1.map(lambda x: x.features))
    labelsAndPredictions_2 = Data_1.map(lambda lp: lp.label).zip(predictions_2)
    testMSE_2 = labelsAndPredictions_2.map(lambda vp: (vp[0] + 0.5 - vp[1]) ** 2).sum() / \
        float(Data_1.count())

    # Fold 3: train on Data_3 + Data_1, test on Data_2
    model_train_3 = LogisticRegressionWithLBFGS.train(Data_3.union(Data_1), regType=regType,
                                                      iterations=num_iter, numClasses=5)
    predictions_3 = model_train_3.predict(Data_2.map(lambda x: x.features))
    labelsAndPredictions_3 = Data_2.map(lambda lp: lp.label).zip(predictions_3)
    testMSE_3 = labelsAndPredictions_3.map(lambda vp: (vp[0] + 0.5 - vp[1]) ** 2).sum() / \
        float(Data_2.count())

    return (testMSE_1 + testMSE_2 + testMSE_3) / 3
def training(model_directory, libsvm, scaler):
    sc = SparkContext(appName="PythonLinearRegressionWithSGDExample")
    training_rdd = MLUtils.loadLibSVMFile(sc, libsvm)
    training_rdd.cache()
    if scaler == '1':
        label = training_rdd.map(lambda x: x.label)
        features = training_rdd.map(lambda x: x.features)
        scaler1 = StandardScaler().fit(features)
        data1 = label.zip(scaler1.transform(features))
        # convert into labeled points
        data2 = data1.map(lambda x: LabeledPoint(x[0], x[1]))
        model_logistic = LogisticRegressionWithLBFGS.train(data2)
    else:
        model_logistic = LogisticRegressionWithLBFGS.train(training_rdd)
    model_logistic.save(sc, model_directory)
def train(self, df, target, regularization=None, num_of_iterations=100):
    try:
        LOGGER.info("Generating logistic regression")
        spark_df = self.sql_context.createDataFrame(df)
        feature_columns = spark_df.columns
        feature_columns.remove(target)
        X_train = spark_df.select(*feature_columns).map(lambda x: list(x))
        y_train = spark_df.select(target).map(lambda x: x[0])
        zipped = y_train.zip(X_train)
        train_data = zipped.map(lambda x: LabeledPoint(x[0], x[1]))
        numOfClasses = len(df[target].unique())
        logistic_model = LogisticRegressionWithLBFGS.train(
            train_data, numClasses=numOfClasses, regParam=0,
            regType=regularization, intercept=True,
            iterations=num_of_iterations, validateData=False)
        self.model = logistic_model
    except Exception as e:
        raise e
def lrTest(sqlContext, dataset_rdd, positive_negative_rate):
    dataset_positive = dataset_rdd.filter(lambda e: e[1] > 0.5)
    dataset_negative = dataset_rdd.filter(lambda e: e[1] < 0.5)
    train_positive = dataset_positive.sample(False, 0.8)
    test_positive = dataset_positive.subtract(train_positive)
    train_negative = dataset_negative.sample(False, 0.8)
    test_negative = dataset_negative.subtract(train_negative)
    trainset_rdd = train_positive.union(train_negative)
    testset_rdd = test_positive.union(test_negative)
    trainset = trainset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
    trainset_nums = trainset.count()
    testset = testset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
    testset_nums = testset.count()
    trainset_positive = train_positive.count()
    testset_positive = test_positive.count()
    model = LogisticRegressionWithLBFGS.train(trainset, iterations=100)
    predict = testset.map(lambda p: (p.label, model.predict(p.features)))
    hitAll = predict.filter(lambda e: e[0] == e[1]).count()
    hitPositive = predict.filter(lambda e: e[0] == e[1] and (e[0] > 0.5)).count()
    positive = predict.filter(lambda e: e[1] > 0.5).count()
    recallPositive = hitPositive / float(testset_positive)
    precision = hitPositive / float(positive)
    accuracy = hitAll / float(testset.count())
    F_Value = 2 / (1 / precision + 1 / recallPositive)
    return (trainset_nums, testset_nums, trainset_positive, testset_positive,
            positive, hitPositive, precision, recallPositive, accuracy, F_Value, model)
def seg_model_lr(train_data, test_data, regType, num_iter):
    removelist_train = set(['stars', 'business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_train = [v for v in train_data.columns if v not in removelist_train]

    # Putting data in vector assembler form
    assembler_train = VectorAssembler(inputCols=newlist_train, outputCol="features")
    transformed_train = assembler_train.transform(train_data.fillna(0))

    # Creating input dataset in the form of labeled points for training the model
    data_train = (transformed_train.select("features", "stars")
                  .map(lambda row: LabeledPoint(row.stars, row.features)))

    # Training the model using the logistic regression classifier
    # (collect + parallelize forces the data into 5 partitions)
    model_train = LogisticRegressionWithLBFGS.train(
        sc.parallelize(data_train.collect(), 5),
        regType=regType, iterations=num_iter, numClasses=5)

    # Creating a list of features to be used for predictions
    removelist_final = set(['business_id', 'bus_id', 'b_id', 'review_id', 'user_id'])
    newlist_final = [v for v in test_data.columns if v not in removelist_final]

    # Putting data in vector assembler form
    assembler_final = VectorAssembler(inputCols=newlist_final, outputCol="features")
    transformed_final = assembler_final.transform(test_data.fillna(0))

    # Creating input dataset to be used for predictions
    data_final = transformed_final.select("features", "review_id")

    # Predicting ratings using the trained model
    predictions = model_train.predict(data_final.map(lambda x: x.features))
    labelsAndPredictions = data_final.map(lambda row: row.review_id).zip(predictions)
    return labelsAndPredictions
def main():
    training_rdd = sc.textFile(train_inputs).map(to_LP_training).filter(lambda lp: lp is not None)
    testing_rdd = sc.textFile(test_inputs).map(to_LP_testing).filter(lambda lp: lp is not None)

    # # Logistic Regression with SGD
    # lg_model = LogisticRegressionWithSGD.train(training_rdd, step=0.1, regType='l1')
    # lg_prediction = testing_rdd.map(lambda (qt, sv): (qt, lg_model.predict(sv)))

    # Logistic Regression with LBFGS
    lg_model2 = LogisticRegressionWithLBFGS.train(training_rdd)
    lg_prediction2 = testing_rdd.map(lambda (qt, sv): (qt, lg_model2.predict(sv)))

    # # SVM with SGD
    # svm_model = SVMWithSGD.train(training_rdd, step=0.01)
    # svm_prediction = testing_rdd.map(lambda (qt, sv): (qt, svm_model.predict(sv)))

    # print 'Logistic Regression with SGD results: ', len(lg_prediction.filter(lambda (idx, p): p != 0).collect())
    result = lg_prediction2.collect()
    # print 'SVM with SGD', len(svm_prediction.filter(lambda (idx, p): p != 0).collect())

    with open('[your result.csv path]', 'w') as csvfile:
        fieldnames = ['QuoteNumber', 'QuoteConversion_Flag']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for l in result:
            writer.writerow({'QuoteNumber': l[0], 'QuoteConversion_Flag': l[1]})
def test_train(self, df, target, train_split, test_split, regularization=None, num_of_iterations=100):
    try:
        LOGGER.info("Generating logistic regression")
        spark_df = self.sql_context.createDataFrame(df)
        feature_columns = spark_df.columns
        feature_columns.remove(target)
        train, test = spark_df.randomSplit([train_split, test_split], seed=1000000)
        X_train = train.select(*feature_columns).map(lambda x: list(x))
        y_train = train.select(target).map(lambda x: x[0])
        zipped = y_train.zip(X_train)
        train_data = zipped.map(lambda x: LabeledPoint(x[0], x[1]))
        numOfClasses = len(df[target].unique())
        logistic_model = LogisticRegressionWithLBFGS.train(
            train_data, numClasses=numOfClasses, regParam=0,
            regType=regularization, intercept=True,
            iterations=num_of_iterations, validateData=False)
        X_test = test.select(*feature_columns).map(lambda x: list(x))
        y_test = test.select(target).map(lambda x: x[0])
        prediction = X_test.map(lambda lp: float(logistic_model.predict(lp)))
        prediction_and_label = prediction.zip(y_test)
        LOGGER.info(prediction_and_label.map(
            lambda labelAndPred: labelAndPred[0] == labelAndPred[1]).mean())
    except Exception as e:
        raise e
def logistic_regression(sc, in1, **params):
    # Build LabeledPoints: the column at params['label'] is the label,
    # all remaining columns are features.
    label_idx = int(params['label'])
    temp = in1.map(lambda x: LabeledPoint(x[label_idx], x[:label_idx] + x[label_idx + 1:]))
    temp = LogisticRegressionWithLBFGS.train(
        temp, iterations=int(params['iterations']),
        numClasses=int(params['numClasses']))
    return True, temp
def train(self, input_data, parameters):
    iterations = parameters.get('iterations', None)
    weights = parameters.get('weights', None)          # accepted but currently unused
    intercept = parameters.get('intercept', None)      # accepted but currently unused
    numFeatures = parameters.get('numFeatures', None)  # accepted but currently unused
    numClasses = parameters.get('numClasses', None)
    data = self._sc.parallelize(self._parser.parse(input_data))
    self._model = LogisticRegressionWithLBFGS.train(data,
                                                    iterations=iterations,
                                                    numClasses=numClasses)
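# A minimal usage sketch for the train() method above. The instance name and
# the raw input are assumptions, not part of the original class; note that
# only 'iterations' and 'numClasses' are actually consumed by train().
parameters = {'iterations': 100, 'numClasses': 2}
classifier.train(raw_records, parameters)  # classifier: hypothetical instance of the class above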
def RunLogit(tf):
    rdd = tf.map(parseAsLabeledPoints)
    train, test = rdd.randomSplit([.8, .2])
    numCat = len(genCats)
    model = LogisticRegressionWithLBFGS.train(train, numClasses=numCat, iterations=100)
    predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
    accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
    print 'Accuracy of Logit = ', accuracy * 100
    print "Test Error = ", (1.0 - accuracy) * 100
def get_error(training, test):
    model = LogisticRegressionWithLBFGS.train(training, numClasses=18)
    # Evaluating the model on the test data
    labelsAndPreds = test.map(lambda p: (p.label, model.predict(p.features)))
    ERR = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(test.count())
    print("Test Error = " + str(ERR))
    return ERR
def main():
    conf = SparkConf().setMaster("local").setAppName("Assignment 1")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    sc.setLogLevel("ERROR")

    # part 1
    data = sc.textFile('/home/disha/Downloads/MSD.txt', 2)
    dc = data.count()
    # print data.count()
    # print data.take(40)
    sdata = data.take(40)

    # part 2
    lp = [parse_line(p) for p in sdata]

    # part 3
    x1 = list(lp[i].features[3] for i in range(40))
    x2 = list(lp[i].features[4] for i in range(40))
    dataFrame = sqlContext.createDataFrame([(Vectors.dense(x1),), (Vectors.dense(x2),)], ["features"])
    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
    scalerModel = scaler.fit(dataFrame)
    scaledData = scalerModel.transform(dataFrame)
    x = scaledData.select("scaledFeatures").map(list).collect()
    xdf = pd.DataFrame({'1': x[0][0], '2': x[1][0]})
    '''
    fig, ax = plt.subplots()
    heatmap = ax.pcolor(xdf, cmap=plt.cm.Greys, alpha=0.8)
    fig = plt.gcf()
    fig.set_size_inches(8, 11)
    ax.set_frame_on(False)
    ax.invert_yaxis()
    ax.xaxis.tick_top()
    '''
    # plt.show()

    # part 4
    onlyLabels = data.map(parse_line).map(lambda point: int(point.label)).collect()
    minYear = min(onlyLabels)
    maxYear = max(onlyLabels)
    print maxYear, minYear
    lp_rdd = data.map(parse_line).map(lambda l: LabeledPoint(int(l.label) - minYear, l.features))
    # print lp_rdd.take(10)

    # part 5
    train, test = lp_rdd.randomSplit([0.8, 0.2])
    model = LogisticRegressionWithLBFGS.train(train, iterations=10,
                                              numClasses=maxYear - minYear + 1)
    vp = test.map(lambda p: (model.predict(p.features), p.label))
    rmse = getrmse(vp)
    print rmse
    a1 = test.map(lambda p: model.predict(p.features)).collect()
    a2 = test.map(lambda p: int(p.label)).collect()
    plt.scatter(a1, a2)
    plt.show()
def train_model(training_rdd, **kwargs):
    """
    Train a classifier model using an rdd training dataset

    :param training_rdd: the rdd of the training dataset
    :param kwargs: additional key-value params for the training (if any)
    :return: the trained LogisticRegressionModel
    """
    return LogisticRegressionWithLBFGS.train(training_rdd,
                                             regType=_REGULARIZATION,
                                             intercept=_INTERCEPT,
                                             **kwargs)
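# A self-contained usage sketch for train_model above. The toy data and the
# values of the module constants _REGULARIZATION/_INTERCEPT are assumptions:
from pyspark.mllib.regression import LabeledPoint

_REGULARIZATION = 'l2'  # assumed value of the module constant
_INTERCEPT = True       # assumed value of the module constant

toy_rdd = sc.parallelize([LabeledPoint(0.0, [0.0, 1.0]),
                          LabeledPoint(1.0, [1.0, 0.0])])
model = train_model(toy_rdd, iterations=50)  # extra kwargs are forwarded to train()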
def regression(reg_data):
    (trainingData, testData) = reg_data.randomSplit([0.7, 0.3])
    lrmodel = LogisticRegressionWithLBFGS.train(trainingData)
    labelsAndPreds = testData.map(lambda p: (p.label, lrmodel.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testData.count())
    falsePos = labelsAndPreds.filter(lambda (v, p): v != p and v == 0.0).count() / \
        float(testData.filter(lambda lp: lp.label == 0.0).count())
    falseNeg = labelsAndPreds.filter(lambda (v, p): v != p and v == 1.0).count() / \
        float(testData.filter(lambda lp: lp.label == 1.0).count())
    print "*** Error Rate: %f ***" % trainErr
    print "*** False Positive Rate: %f ***" % falsePos
    print "*** False Negative Rate: %f ***" % falseNeg
def __init__(self, sc):
    """Init the engine and train the model"""
    logger.info("Starting up the GeneLearn Engine: ")
    self.sc = sc
    logger.info("Loading training data...")
    dataset_path = "/Users/qingpeng/Dropbox/Development/Bitbucket/jgi-genelearn/scripts/Flask"
    training_file_path = os.path.join(dataset_path, 'training.svmlib')
    training = MLUtils.loadLibSVMFile(sc, training_file_path)
    self.model = LogisticRegressionWithLBFGS.train(training)
def TrainLRCModel(trainingData, testData):
    model = LogisticRegressionWithLBFGS.train(trainingData, numClasses=5)
    predictions = testData.map(lambda p: (p.label, model.predict(p.features)))
    correct = predictions.filter(lambda xp: xp[0] == xp[1])
    # Calculate the accuracy of the model as a percentage
    accuracy = round((correct.count() / float(testData.count())) * 100, 3)
    # return the final accuracy
    return accuracy
def sim_function(isim, patsim, dataset, ss_ori):
    # select patients in each simulation from patsim
    valsimid = patsim.filter(patsim.simid == isim)
    sssim = ss_ori \
        .join(valsimid, valsimid.matched_positive_id == ss_ori.matched_positive_id, 'inner') \
        .select(ss_ori.matched_positive_id, ss_ori.label, ss_ori.patid, ss_ori.features)

    # select the corresponding training and test sets
    valsim = dataset \
        .join(valsimid, valsimid.matched_positive_id == dataset.matched_positive_id, 'inner') \
        .select(dataset.matched_positive_id, dataset.label, dataset.patid, dataset.features)
    trsim = dataset.subtract(valsim)

    # get LabeledPoint RDDs
    trsimrdd = trsim.map(parsePoint)
    valsimrdd = valsim.map(parsePoint)
    sssimrdd = sssim.map(parsePoint)

    # Build the model
    sim_model = LogisticRegressionWithLBFGS.train(trsimrdd, intercept=True, regType=None)
    # clear the threshold so predict() returns raw probabilities
    sim_model.clearThreshold()

    # save the model
    sim_model.save(sc, resultDir_s3 + "model_sim" + str(isim))
    # load model
    # model = LogisticRegressionModel.load(sc, resultDir_s3 + "model_sim" + str(isim))

    # predict on the test data
    scoreAndLabels_val = valsimrdd.map(lambda p: (float(sim_model.predict(p.features)), p.label))
    scoreAndLabels_ss = sssimrdd.map(lambda p: (float(sim_model.predict(p.features)), p.label))

    # probability of response
    pred_score_val = scoreAndLabels_val.toDF() \
        .withColumnRenamed('_1', 'prob_1') \
        .withColumnRenamed('_2', 'label')
    pred_score_ss = scoreAndLabels_ss.toDF() \
        .withColumnRenamed('_1', 'prob_1') \
        .withColumnRenamed('_2', 'label')
    return [pred_score_val, pred_score_ss]
def validation_lr(trainingData, testData, regType, num_iter):
    # Training the model using the logistic regression classifier
    model_train = LogisticRegressionWithLBFGS.train(trainingData, regType=regType,
                                                    iterations=num_iter, numClasses=5)
    # Evaluate the model on test instances and compute the test error
    predictions = model_train.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / \
        float(testData.count())
    return testMSE
def main(input_file_path):
    print('=====>>>>>')
    data = sc.textFile(input_file_path)
    training_data_RDD = data.filter(lambda line: line.split(',')[4] != '' and line.split(',')[0] != 'INDEX')
    unseen_data_RDD = data.filter(lambda line: line.split(',')[4] == '')
    training_data_pddf = create_pddf(training_data_RDD)
    training_data_df = sqlContext.createDataFrame(training_data_pddf)
    print(training_data_df.head())

    parsed_data = rdd_to_labeled_point(training_data_df.rdd)
    parsed_data.persist()
    # Correct print: [LabeledPoint(1.0, [1.0,8.6662186586,6.98047693487])]
    logisticRegressionWithLBFGS = LogisticRegressionWithLBFGS.train(
        parsed_data, iterations=500, numClasses=100)

    labels_and_preds = parsed_data.map(
        lambda lp: [lp.label, logisticRegressionWithLBFGS.predict(lp.features)])
    # fraction of training points predicted correctly
    mismatches = labels_and_preds.filter(lambda ele: int(ele[0]) != int(ele[1])).count()
    accuracy = 1.0 - float(mismatches) / float(parsed_data.count())
    print("Training Accuracy on training data = " + str(accuracy))

    unseen_data_pddf = create_pddf(unseen_data_RDD)
    unseen_data_df = sqlContext.createDataFrame(unseen_data_pddf)
    unseen_parsed_data = rdd_to_index_featurs(unseen_data_df.rdd)
    unseen_parsed_data.persist()

    with open('/Users/1002720/Documents/workspace/SNU-project/data/BDA2Project/1-GenderPrediction/result2.csv',
              'w', encoding='utf-8') as file:
        file.write('INDEX,AGE\n')
        for row in unseen_parsed_data.collect():
            file.write(str(row[0]) + ',' +
                       str(logisticRegressionWithLBFGS.predict(row[1])) + '\n')

    parsed_data.unpersist()
    unseen_parsed_data.unpersist()
    print('=====>>>>>')
def train_evaluate_model(train_data, valid_data, iterations, regParam):
    start_time = time()
    # train
    model = LogisticRegressionWithLBFGS.train(train_data, numClasses=2,
                                              iterations=iterations, regParam=regParam)
    # evaluate: AUC over (y_pred, y_true) pairs (evaluate_model is sketched below)
    AUC = evaluate_model(model, valid_data)
    duration = time() - start_time
    print(f"Train/evaluate: parameters iterations={iterations}, regParam={regParam} "
          f"==> time={duration}, AUC={AUC}")
    return AUC, duration, iterations, regParam, model
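# evaluate_model is not defined in this snippet. A minimal sketch of what it
# might look like (an assumption, not the original helper), using MLlib's
# BinaryClassificationMetrics; the threshold is cleared so predict() returns
# raw probabilities rather than 0/1 labels, which a meaningful AUC requires.
from pyspark.mllib.evaluation import BinaryClassificationMetrics

def evaluate_model(model, valid_data):
    model.clearThreshold()
    score_and_labels = valid_data.map(
        lambda p: (float(model.predict(p.features)), p.label))
    return BinaryClassificationMetrics(score_and_labels).areaUnderROC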
def regression(reg_data):
    train_data, test_data = reg_data.randomSplit([0.7, 0.3])
    model = LogisticRegressionWithLBFGS.train(train_data)
    labels_predictions = test_data.map(lambda p: (p.label, model.predict(p.features)))
    train_error = labels_predictions.filter(lambda (v, p): v != p).count() / float(test_data.count())
    false_pos = labels_predictions.filter(lambda (v, p): v != p and v == 0.0).count() / float(
        test_data.filter(lambda lp: lp.label == 0.0).count())
    false_neg = labels_predictions.filter(lambda (v, p): v != p and v == 1.0).count() / float(
        test_data.filter(lambda lp: lp.label == 1.0).count())
    print "*** Error Rate: %f ***" % train_error
    print "*** False Positive Rate: %f ***" % false_pos
    print "*** False Negative Rate: %f ***" % false_neg
def main():
    # parameters
    num_features = 400  # vocabulary size

    # load data
    print "loading 20 newsgroups dataset..."
    categories = ['rec.autos', 'rec.sport.hockey', 'comp.graphics', 'sci.space']
    tic = time()
    dataset = fetch_20newsgroups(shuffle=True, random_state=0, categories=categories,
                                 remove=('headers', 'footers', 'quotes'))
    train_corpus = dataset.data  # a list of documents, one entry per posting
    train_labels = dataset.target
    toc = time()
    print "elapsed time: %.4f sec" % (toc - tic)

    # tf-idf vectorizer
    tfidf = TfidfVectorizer(max_df=0.5, max_features=num_features,
                            min_df=2, stop_words='english', use_idf=True)
    X_tfidf = tfidf.fit_transform(train_corpus).toarray()

    # append document labels
    train_labels = train_labels.reshape(-1, 1)
    X_all = np.hstack([train_labels, X_tfidf])

    # distribute the data
    sc = SparkContext('local', 'log_reg')
    rdd = sc.parallelize(X_all)
    labeled_corpus = rdd.map(parse_doc)
    train_RDD, test_RDD = labeled_corpus.randomSplit([8, 2], seed=0)

    # distributed logistic regression
    print "training logistic regression..."
    model = LogisticRegressionWithLBFGS.train(train_RDD, regParam=1, regType='l1',
                                              numClasses=len(categories))

    # evaluate the model on test data
    labels_and_preds = test_RDD.map(lambda p: (p.label, model.predict(p.features)))
    test_err = labels_and_preds.filter(lambda (v, p): v != p).count() / float(test_RDD.count())
    print "log-reg test error: ", test_err
def training(path):
    # import dataset into RDD
    raw_data = sc.textFile(path)
    # parse raw data into (label, bag-of-words) pairs
    parsed_data = raw_data.map(lambda line: parse_line(line))
    # separate into training set and test set
    training_set, test_set = parsed_data.randomSplit([0.6, 0.4], 17)
    # get features for model training
    features = feature_extraction(training_set)
    labeled_points_training = training_set.map(lambda line: construct_labeled_point(line, features))
    labeled_points_test = test_set.map(lambda line: construct_labeled_point(line, features))
    # train logistic regression model
    lrModel = LogisticRegressionWithLBFGS.train(labeled_points_training)
    # train naive bayes model
    nbModel = NaiveBayes.train(labeled_points_training)
    return lrModel, nbModel, labeled_points_test
def regression(reg_data):
    (trainingData, testData) = reg_data.randomSplit([0.7, 0.3])
    lrmodel = LogisticRegressionWithLBFGS.train(trainingData)
    labelsAndPreds = testData.map(lambda p: (p.label, lrmodel.predict(p.features)))
    # each element is a single (label, prediction) pair, so index into the tuple
    trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(
        testData.count())
    falsePos = labelsAndPreds.filter(lambda vp: vp[0] != vp[1] and vp[0] == 0.0).count() / float(
        testData.filter(lambda lp: lp.label == 0.0).count())
    falseNeg = labelsAndPreds.filter(lambda vp: vp[0] != vp[1] and vp[0] == 1.0).count() / float(
        testData.filter(lambda lp: lp.label == 1.0).count())
    print("*** Error Rate: %f ***" % trainErr)
    print("*** False Positive Rate: %f ***" % falsePos)
    print("*** False Negative Rate: %f ***" % falseNeg)
def trainModel(lpRDD):
    """
    Train 3 classifier models on the given RDD with LabeledPoint objects.
    A list of trained models is returned.
    """
    # Not strictly needed, as the Spark implementations ensure caching themselves;
    # other implementations might not, however.
    lpRDD.persist(StorageLevel.MEMORY_ONLY)

    # Train the classifier models.
    print('Starting to train the model')  # give some immediate feedback
    model1 = LogisticRegressionWithLBFGS.train(lpRDD)  # this is the best model
    print('Trained LR (model1)')
    model2 = NaiveBayes.train(lpRDD)  # doesn't work well
    print('Trained NB (model2)')
    model3 = SVMWithSGD.train(lpRDD)  # or this ...
    print('Trained SVM (model3)')
    return [model1, model2, model3]
def main():
    # spark = SparkSession.builder.master("yarn").appName("spark_demo").getOrCreate()
    spark = SparkSession.builder.getOrCreate()
    print "Session created!"
    sc = spark.sparkContext
    print "The url to track the job: http://namenode-01:8088/proxy/" + sc.applicationId
    print sys.argv

    sampleHDFS_1 = sys.argv[1]
    sampleHDFS_2 = sys.argv[2]
    outputHDFS = sys.argv[3]

    sampleRDD = sc.textFile(sampleHDFS_1).map(parse)
    predictRDD = sc.textFile(sampleHDFS_2).map(lambda x: parse(x, True))

    # train
    model = LogisticRegressionWithLBFGS.train(sampleRDD)
    model.clearThreshold()  # drop the default threshold (otherwise predict() outputs 0/1 directly)

    # predict and save the results
    labelsAndPreds = predictRDD.map(lambda p: (p[0], p[1].label, model.predict(p[1].features)))
    labelsAndPreds.map(lambda p: '\t'.join(map(str, p))).saveAsTextFile(
        outputHDFS + "/target/output")

    # evaluate precision and recall at different thresholds
    labelsAndPreds_label_1 = labelsAndPreds.filter(lambda lp: int(lp[1]) == 1)
    labelsAndPreds_label_0 = labelsAndPreds.filter(lambda lp: int(lp[1]) == 0)
    t_cnt = labelsAndPreds_label_1.count()
    f_cnt = labelsAndPreds_label_0.count()
    print "thre\ttp\ttn\tfp\tfn\taccuracy\trecall"
    for thre in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
        tp = labelsAndPreds_label_1.filter(lambda lp: lp[2] > thre).count()
        tn = t_cnt - tp
        fp = labelsAndPreds_label_0.filter(lambda lp: lp[2] > thre).count()
        fn = f_cnt - fp
        print("%.1f\t%d\t%d\t%d\t%d\t%.4f\t%.4f" % (thre, tp, tn, fp, fn,
                                                    float(tp) / (tp + fp), float(tp) / t_cnt))

    # save the model, then reload it
    model.save(sc, outputHDFS + "/target/tmp/pythonLogisticRegressionWithLBFGSModel")
    sameModel = LogisticRegressionModel.load(
        sc, outputHDFS + "/target/tmp/pythonLogisticRegressionWithLBFGSModel")
    print "output:", outputHDFS
def create_or_load_model(sc: SparkContext, train_dataset_path: str) -> LogisticRegressionModel:
    if not os.path.exists(MODEL_PATH):
        print('training model...')
        dataset_rdd = sc.textFile(train_dataset_path)
        table_rdd = dataset_rdd.map(lambda line: line.split(','))
        labeled_features = rdd_to_feature(table_rdd)
        # labeled_features.foreach(lambda lp: print(lp))
        labeled_features.cache()
        model = LogisticRegressionWithLBFGS.train(labeled_features, numClasses=NUM_CLASSES)
        model.setThreshold(0.5)
        model.save(sc, MODEL_PATH)
        return model
    else:
        model = LogisticRegressionModel.load(sc, MODEL_PATH)
        return model
def logistic_model(sc):
    # global conf
    # conf.setAppName("data analyse")
    # sc = SparkContext(conf=conf)
    # print("Successfully started SparkContext")
    data = sc.textFile("file://" + ROOTDIR + "/sample_svm_data.txt")
    parsedData = data.map(parsePoint)

    # Build the model
    model = LogisticRegressionWithLBFGS.train(parsedData)

    # Evaluate the model on the training data
    labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(
        parsedData.count())
    print("Training Error = " + str(trainErr))
def Train_Model(trainingRDD, method, parameter_Iterations, parameter_stepSize, parameter_reqParam):
    # model selection
    if method == 'Logistic':
        Logistic_Model = LogisticRegressionWithLBFGS.train(trainingRDD,
                                                           iterations=parameter_Iterations,
                                                           regParam=parameter_reqParam)
        return Logistic_Model
    elif method == 'SVM':
        SVM_Model = SVMWithSGD.train(trainingRDD, iterations=parameter_Iterations,
                                     step=parameter_stepSize, regParam=parameter_reqParam)
        return SVM_Model
    else:
        return "Unknown method."
def predictions(train_data_labeled, test_data_labeled):
    time_start = time.time()
    model_lrm = LogisticRegressionWithLBFGS.train(train_data_labeled, iterations=100,
                                                  initialWeights=None, regParam=0.01,
                                                  regType='l2', intercept=False,
                                                  corrections=10, tolerance=0.0001,
                                                  validateData=True, numClasses=10)
    predictions = model_lrm.predict(test_data_labeled.map(lambda x: x.features))
    # save the test labels to HDFS as a side effect
    test_data_labeled.map(lambda x: x.label).repartition(1).saveAsTextFile(
        "hdfs://soit-hdp-pro-1.ucc.usyd.edu.au/user/czho9311/stage3")
    labels_and_predictions = test_data_labeled.map(lambda x: x.label).zip(predictions)
    lrAccuracy = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(
        test_data_labeled.count())
    time_end = time.time()
    time_lrm = time_end - time_start
    print("=========================================================================================================")
    print("run time: {}, LogisticRegression accuracy: {}".format(time_lrm, lrAccuracy))
def fit_and_predict(rdd):
    '''
    Fits a logistic regression model.

    Parameters
    ----------
    rdd: A pyspark.rdd.RDD instance.

    Returns
    -------
    An RDD of (label, prediction) pairs.
    '''
    # Fit a logistic regression model with 10 iterations on the full data
    model = LogisticRegressionWithLBFGS.train(rdd, iterations=10)
    # Map each point to its (label, prediction) pair
    rdd = rdd.map(lambda x: (x.label, float(model.predict(x.features))))
    return rdd
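# A small usage sketch for fit_and_predict above, assuming an active
# SparkContext sc (the toy data is an assumption):
from pyspark.mllib.regression import LabeledPoint

toy = sc.parallelize([LabeledPoint(0.0, [0.0, 1.0]),
                      LabeledPoint(1.0, [1.0, 0.0])])
print(fit_and_predict(toy).collect())  # e.g. [(0.0, 0.0), (1.0, 1.0)]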
def logisticRegression(features, sc, output_n):
    features_and_label = features.collect()
    training_features_labels = features_and_label[0:70]
    testing_features_labels = features_and_label[70:]

    labeled_training = []
    for x in training_features_labels:
        labeled_training.append(LabeledPoint(x[0], x[1]))

    labeled_testing = []
    for y in testing_features_labels:
        labeled_testing.append(LabeledPoint(y[0], y[1]))

    # train() expects an RDD, not a Python list, so parallelize both sets
    train = sc.parallelize(labeled_training)
    test = sc.parallelize(labeled_testing)

    logregression_model = LogisticRegressionWithLBFGS.train(train)
    predictions = test.map(lambda line: (line.label,
                                         float(logregression_model.predict(line.features))))
    return predictions
def TrainLRCModel(trainingData, testData):
    # Map the training dataset into LabeledPoints (one per row, not wrapped in a list)
    trainingData = trainingData.map(lambda row: LabeledPoint(row.label, row.features))
    model = LogisticRegressionWithLBFGS.train(trainingData, numClasses=5)
    predictions = testData.map(lambda p: (p.label, model.predict(p.features)))
    correct = predictions.filter(lambda xp: xp[0] == xp[1])
    # Calculate the accuracy of the model as a percentage
    accuracy = round((correct.count() / float(testData.count())) * 100, 3)
    # return the final accuracy
    return accuracy
def create_model(name, training):
    if name == 'logistic':
        print_box()
        print "Logistic Regression Model"
        print_box()
        model = LogisticRegressionWithLBFGS.train(training)
    elif name == 'tree':
        print_box()
        print "Decision Tree Model"
        print_box()
        model = DecisionTree.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                             impurity='gini', maxDepth=5, maxBins=32)
    elif name == 'rf':
        print_box()
        print "Random Forest Model"
        print_box()
        model = RandomForest.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                             numTrees=15, featureSubsetStrategy="auto",
                                             impurity='gini', maxDepth=5, maxBins=50)
    return model
def train(self, feat='tfidf'):
    """
    Trains a classifier (multinomial NaiveBayes or logistic regression)
    on TFIDF features.

    Parameters
    ----------
    Spark DataFrame with columns:
        key: (label, filepath) tuple
        tf: Term-frequency Sparse Vector.
        IDF: TFIDF Sparse Vector.

    Returns
    -------
    self, with self.model set to the trained MLlib model.
    """
    if not self.lp_path:
        self.labeled_points = self.make_labeled_points(self.extract_features())
    self.make_train_test(self.test_size)

    # join yields (key, (labeled_point, label)) pairs; keep the labeled points
    train_rdd = self.labeled_points.join(self.y_train) \
        .map(lambda kv: kv[1][0]) \
        .repartition(self.n_part).cache()

    if self.model_type == 'naive_bayes':
        nb = NaiveBayes()
        self.model = nb.train(train_rdd)
    elif self.model_type == 'log_reg':
        n_classes = len(self.unique_ratings())
        features = train_rdd.map(lambda lp: LabeledPoint(lp.label, lp.features.toArray()))
        logreg = LogisticRegressionWithLBFGS.train(features, numClasses=n_classes)
        self.model = logreg

    return self
def processData(sc):
    # load and parse the data
    raw_data = sc.textFile(DATA_FILE)
    raw_data.persist()
    print "Train data size {}".format(raw_data.count())

    # map data to the format needed for logistic regression
    parsedData = raw_data.map(mapper)
    print "Sample of input to algorithm ", parsedData.take(10)

    # Train the model
    t0 = time()
    model = LogisticRegressionWithLBFGS.train(parsedData)
    t1 = time() - t0
    print "Classifier trained in {} seconds".format(round(t1, 3))

    # Evaluate the model on the training data
    labelsAndPreds = parsedData.map(lambda point: (point.label, model.predict(point.features)))
    trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))
""" # Imports # The L-BFGS method approximates the objective function locally # as a quadratic without evaluating the second partial derivatives of the objective function to construct the Hessian matrix. # LogBFGS over mini-batch gradient descent for faster convergence. from pyspark.mllib.classification import LogisticRegressionWithLBFGS from pyspark.mllib.regression import LabeledPoint from pyspark import SparkContext from numpy import array sc = SparkContext("local", "SVM") # Loading and parsing data def parsePoint(line): vals = [float(i) for i in line.split(' ')] return LabeledPoint(vals[0], vals[1:]) # Sample data provided by Spark 1.3.1 folder data = sc.textFile("jingrong/sample_svm_data.txt") parsedData = data.map(parsePoint) # Building the model model = LogisticRegressionWithLBFGS.train(parsedData) # Evaluate the model based on training data labelAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features))) trainingError = labelAndPreds.filter(lambda (v,p): v!=p).count() / float(parsedData.count()) print "Training Error: ", str(trainingError)
def logsreg(loadTrainingFilePath, sc):
    # Load training data in LIBSVM format (hardcoded path overrides the parameter)
    loadTrainingFilePath = '/Users/Jacob/repository/SparkService/data/sample_libsvm_data.txt'
    data = MLUtils.loadLibSVMFile(sc, loadTrainingFilePath)

    # Split data into training (60%) and test (40%)
    traindata, testdata = data.randomSplit([0.6, 0.4], seed=11L)
    traindata.cache()

    # Load testing data in LIBSVM format
    # testdata = MLUtils.loadLibSVMFile(sc, loadTestingFilePath)

    # Run the training algorithm to build the model
    model = LogisticRegressionWithLBFGS.train(traindata, numClasses=3)

    # Compute raw scores on the test set
    predictionAndLabels = testdata.map(lambda lp: (float(model.predict(lp.features)), lp.label))

    Json.generateJson("LogisticRegression", "12345678", traindata, predictionAndLabels)
    print 'Completed.'

    # Instantiate metrics object
    # metrics = MulticlassMetrics(predictionAndLabels)
    #
    # # Overall statistics
    # precision = metrics.precision()
    # recall = metrics.recall()
    # f1Score = metrics.fMeasure()
    # # confusion_matrix = metrics.confusionMatrix().toArray()
    # print("Summary Stats")
    # print("Precision = %s" % precision)
    # print("Recall = %s" % recall)
    # print("F1 Score = %s" % f1Score)
    #
    # # Statistics by class
    # labels = traindata.map(lambda lp: lp.label).distinct().collect()
    # for label in sorted(labels):
    #     print("Class %s precision = %s" % (label, metrics.precision(label)))
    #     print("Class %s recall = %s" % (label, metrics.recall(label)))
    #     print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
    #
    # # Weighted stats
    # print("Weighted recall = %s" % metrics.weightedRecall)
    # print("Weighted precision = %s" % metrics.weightedPrecision)
    # print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    # print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    # print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
    #
    # # return model parameters
    # res = [('1', 'Yes', 'TP Rate', metrics.truePositiveRate(0.0)),
    #        ('2', 'Yes', 'FP Rate', metrics.falsePositiveRate(0.0)),
    #        ('3', 'Yes', 'Precision', metrics.precision(0.0)),
    #        ('4', 'Yes', 'Recall', metrics.recall(0.0)),
    #        ('5', 'Yes', 'F-Measure', metrics.fMeasure(0.0, beta=1.0)),
    #        ('1', 'Yes', 'TP Rate', metrics.truePositiveRate(1.0)),
    #        ('2', 'Yes', 'FP Rate', metrics.falsePositiveRate(1.0)),
    #        ('3', 'Yes', 'Precision', metrics.precision(1.0)),
    #        ('4', 'Yes', 'Recall', metrics.recall(1.0)),
    #        ('5', 'Yes', 'F-Measure', metrics.fMeasure(1.0, beta=1.0)),
    #        ('1', 'Yes', 'TP Rate', metrics.truePositiveRate(2.0)),
    #        ('2', 'Yes', 'FP Rate', metrics.falsePositiveRate(2.0)),
    #        ('3', 'Yes', 'Precision', metrics.precision(2.0)),
    #        ('4', 'Yes', 'Recall', metrics.recall(2.0)),
    #        ('5', 'Yes', 'F-Measure', metrics.fMeasure(2.0, beta=1.0))]
    #
    # # save output file path as JSON and dump into dumpFilePath
    # rdd = sc.parallelize(res)
    # SQLContext.createDataFrame(rdd).collect()
    # df = SQLContext.createDataFrame(rdd, ['Order', 'CLass', 'Name', 'Value'])

    # tempDumpFilePath = dumpFilePath + "/part-00000"
    # if os.path.exists(tempDumpFilePath):
    #     os.remove(tempDumpFilePath)
    # df.toJSON().saveAsTextFile(hdfsFilePath)
    # tmpHdfsFilePath = hdfsFilePath + "/part-00000"
    # subprocess.call(["hadoop", "fs", "-copyToLocal", tmpHdfsFilePath, dumpFilePath])

    # Save and load model
    # clusters.save(sc, "myModel")
    # sameModel = KMeansModel.load(sc, "myModel")
def train(self, num_iterations=10):
    model = LogisticRegressionWithLBFGS.train(self._labeled_feature_vector_rdd(),
                                              iterations=num_iterations)
    return LogisticRegressionModel(model, self.feature_cols)
regParams = [1e-3]
corrections = [30]
tolerances = [1e-4]

bestReg = 0
bestCor = 0
bestTol = 0
bestModel = None
bestLogLoss = float('inf')  # must be initialized before the comparison below

from pyspark.mllib.classification import LogisticRegressionWithLBFGS

for reg in regParams:
    for cor in corrections:
        for tol in tolerances:
            model = LogisticRegressionWithLBFGS.train(hashedTrainData, iterations=100,
                                                      initialWeights=None, regParam=reg,
                                                      regType='l2', intercept=False,
                                                      corrections=cor, tolerance=tol,
                                                      validateData=True, numClasses=2)
            # getCTRProb and computeLogLoss are sketched after this block
            logLossVa = (hashedValidationData
                         .map(lambda p: (p.label, getCTRProb(p.features, model.weights, model.intercept)))
                         .map(lambda p: computeLogLoss(p[1], p[0]))
                         .reduce(lambda a, b: a + b)) / hashedValidationData.count()
            # logLossVa = evaluateModel(model, hashedValidationData)
            print logLossVa, reg, cor, tol
            if logLossVa < bestLogLoss:
                bestModel = model
                bestLogLoss = logLossVa
                bestReg = reg
                bestCor = cor
                bestTol = tol

print bestLogLoss, bestReg, bestCor, bestTol
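# getCTRProb and computeLogLoss are not defined in this snippet. Minimal
# sketches of what they plausibly look like (assumptions, not the original
# helpers): a sigmoid over the linear score, and the clipped log loss.
from math import exp, log

def getCTRProb(features, weights, intercept):
    # P(y = 1 | x) under the logistic model; features.dot(weights) works for
    # both MLlib SparseVector and DenseVector features
    raw_score = features.dot(weights) + intercept
    return 1.0 / (1.0 + exp(-raw_score))

def computeLogLoss(p, y):
    # clip p away from 0 and 1 so log() stays finite
    epsilon = 1e-11
    p = min(max(p, epsilon), 1.0 - epsilon)
    return -log(p) if y == 1 else -log(1.0 - p)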
traindays = set(traindays.collect())  # for fast searching

# read the data, filtering it to keep only traindays and non-cancels;
# the header is organically removed because FL_DATE is not a trainday
# allfields = sc.textFile('gs://cloud-training-demos/flights/201501.csv') \
allfields = sc.textFile('gs://cloud-training-demos/flights/2015*.csv') \
    .map(lambda line: line.split(',')) \
    .filter(lambda fields: fields[0] in traindays and fields[22] != '')

# these are the fields we'll use in the regression
# format is LabeledPoint(label, [x1, x2, ...])
flights = allfields.map(lambda fields: LabeledPoint(
    float(float(fields[22]) < 15),  # ontime
    [
        float(fields[15]),  # DEP_DELAY
        float(fields[16]),  # TAXI_OUT
        float(fields[26]),  # DISTANCE
    ]))
# flights.saveAsTextFile('gs://cloud-training-demos/flights/sparkoutput/train')

lrmodel = LogisticRegressionWithLBFGS.train(flights, intercept=True)
print lrmodel.weights, lrmodel.intercept

lrmodel.setThreshold(0.7)  # cancel if prob-of-ontime < 0.7
# print lrmodel.predict([36.0,12.0,594.0])
lrmodel.save(sc, 'gs://cloud-training-demos/flights/sparkoutput/model')
Err = 0.0
results = []
# ss yields (train_index, test_index) splits; see the sketch after this block
for train_index, test_index in ss:
    X_training, Y_training, X_test, Y_test = [], [], [], []
    for i in train_index:
        X_training.append(X[i])
        Y_training.append(Y[i])
    for i in test_index:
        X_test.append(X[i])
        Y_test.append(Y[i])

    parsedData = []
    for i in range(0, len(X_training)):
        parsedData.append(LabeledPoint(Y_training[i], X_training[i]))

    model = LogisticRegressionWithLBFGS.train(sc.parallelize(parsedData))

    testErr = 0
    for i in range(0, len(X_test)):
        a = Y_test[i]
        b = model.predict(X_test[i])
        if a != b:
            testErr += 1
    Err += float(testErr) / float(len(X_test))

print("AVG test error: %.6f" % (Err / iter_number))
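# ss and iter_number are defined outside this fragment. One plausible setup,
# assuming scikit-learn's ShuffleSplit (an assumption, not the original code):
from sklearn.model_selection import ShuffleSplit

iter_number = 10
ss = ShuffleSplit(n_splits=iter_number, test_size=0.2).split(X)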
def main():
    appName = "BadOrGood;zl"
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores", "3")
            .set("spark.executor.instances", "3"))
    sc = SparkContext(conf=conf)
    hc = HiveContext(sc)

    # fetch data
    # filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd'
    # fetchDataToFile(hc, filepath)

    # load data
    # AllDataRawrdd = sc.pickleFile(filepath) \
    #     .map(lambda _: {'label': int(_.status), 'feature': extractFeature(_)}) \
    #     .repartition(10)
    AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)

    # standardizer for train and test data
    model = StandardScaler(True, True).fit(
        AllDataRawrdd.map(lambda _: Vectors.dense(_['feature'])))
    labels = AllDataRawrdd.map(lambda _: _['label'])
    featureTransformed = model.transform(AllDataRawrdd.map(lambda _: _['feature']))
    AllDataRawrdd = labels.zip(featureTransformed) \
        .map(lambda _: {'label': _[0], 'feature': _[1]})

    # sampling
    trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(weights=[0.7, 0.3], seed=100)
    trainDatardd = trainDataRawrdd.map(lambda _: LabeledPoint(_['label'], _['feature'])).persist()
    testDatardd = testDataRawrdd.map(lambda _: {'label': _['label'],
                                                'feature': list(_['feature'])}).persist()

    # prediction & test
    lrmLBFGS = LogisticRegressionWithLBFGS.train(trainDatardd, iterations=3000,
                                                 regParam=0.01, regType="l1")
    resultrdd = test(lrmLBFGS, testDatardd)
    lrmLBFGSFone = fone(resultrdd)
    lrmLBFGSac = accuracy(resultrdd)

    lrmSGD = LogisticRegressionWithSGD.train(trainDatardd, iterations=3000, step=0.1,
                                             regParam=0.01, regType="l1")
    resultrdd = test(lrmSGD, testDatardd)
    lrmSGDFone = fone(resultrdd)
    lrmSGDac = accuracy(resultrdd)

    dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
    resultrdd = test(dt, testDatardd)
    dtFone = fone(resultrdd)
    dtac = accuracy(resultrdd)

    rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
    resultrdd = test(rf, testDatardd)
    rfFone = fone(resultrdd)
    rfac = accuracy(resultrdd)

    print "LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac)
    print "LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac)
    print "Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac)
    print "Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac)
    print lrmLBFGS.weights
    print lrmSGD.weights

    sc.stop()
# creating an RDD of reviews
review = parts.map(lambda p: Row(id=p[0], label=float(p[1]),
                                 sentence=paragraph_to_wordlist(p[2])))
# creating the dataframe
reviewDF = sqlContext.createDataFrame(review)
# transforming the words to vectors using the trained model
transformDF = wvModel.transform(reviewDF)
# segregating the labels and features
selectData = transformDF.select("label", "features", "id")
# creating an RDD of LabeledPoints
lpSelectData = selectData.map(lambda x: (x.id, LabeledPoint(x.label, x.features)))
# splitting the data for training and test
(trainingData, testData) = lpSelectData.randomSplit([0.9, 0.1])

# training the logistic regression with LBFGS model
lrm = LogisticRegressionWithLBFGS.train(trainingData.map(lambda x: x[1]), iterations=10)

# fetching the labels and predictions for the test data
labelsAndPreds = testData.map(lambda p: (p[0], p[1].label, lrm.predict(p[1].features)))
# calculating the accuracy and printing it
accuracy = labelsAndPreds.filter(lambda ivp: ivp[1] == ivp[2]).count() / float(testData.count())
print("Accuracy = " + str(accuracy))

# initializing a StreamingContext with a window of 10 secs
ssc = StreamingContext(sc, 10)
# fetching the input statements from S3
lines = ssc.textFileStream("s3://spark-sentimentanalysis/")
# calculating a word count
counts = lines.flatMap(lambda line: line.split(" ")) \
    .map(lambda x: (x, 1)) \
    .reduceByKey(lambda a, b: a + b)
# Build the candidate models (the plain `model` evaluated below is assumed
# to be trained earlier, outside this fragment)
model2 = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                      numTrees=3, featureSubsetStrategy="auto",
                                      impurity='gini', maxDepth=4, maxBins=32)
model3 = LogisticRegressionWithLBFGS.train(trainingData)
model4 = GradientBoostedTrees.trainClassifier(trainingData, categoricalFeaturesInfo={},
                                              numIterations=3)

# model.setThreshold(0.07)
model.clearThreshold()

# Evaluate the model on test instances and compute the test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(testData.count())
# tail of a parsePoint-style parser: the 4th field is hashed into a numeric
# label and columns 1-2 are the features (the function header is reconstructed
# here, assuming a comma-separated input line)
def parsePoint(line):
    datapoints = line.split(',')
    return LabeledPoint(float(int(hashlib.md5(datapoints[3]).hexdigest(), 16) / pow(10, 38)),
                        datapoints[1:3])

working_directory = os.getcwd() + "/"

configuartion = py.SparkConf()                  # setting the Spark Configuration
sContext = py.SparkContext(conf=configuartion)  # setting the Spark context
sContext.defaultParallelism

data = sContext.textFile(working_directory + "Test-TrainingData_SVM.csv")
testdata = sContext.textFile("/media/vyassu/OS/Users/vyas/Documents/Assigments/BigData/AudioData/KL/")
print testdata.take(1)

parsedData = data.map(parsePoint)
print parsedData.take(10)

# Build the LogisticRegressionWithLBFGS model
model = LogisticRegressionWithLBFGS.train(parsedData, iterations=10, numClasses=7)

# Evaluating the model on training data
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))

# # Save and load model
# model.save(sc, "myModelPath")
# sameModel = SVMModel.load(sc, "myModelPath")
# tail of a parsePoint-style parser; Cancelled becomes the 8th column now,
# and total columns in the data = 8 (the function header is reconstructed here,
# assuming a comma-separated input line)
def parsePoint(line):
    clean_line_split = line.split(',')
    label = clean_line_split[7]
    nonLable = clean_line_split[0:7]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)

# divide training and test data by the 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3], seed=11L)
training.cache()

# start the timer at this point
startTime = datetime.now()

# build the model
model = LogisticRegressionWithLBFGS.train(training, numClasses=3)

# evaluate the model on the test data
labelAndPreds = test.map(lambda x: (x.label, model.predict(x.features)))
testErr = labelAndPreds.filter(lambda (w, x): w != x).count() / float(test.count())

print ('Time consumed = '), (datetime.now() - startTime)
print ("Test error = " + str(testErr))

# save and load model
model.save(sc, "LRW-95-08")
sameModel = LogisticRegressionModel.load(sc, "LRW-95-08")

sc.stop()
# NB_scoredLabel = numpy.array(test_set.map(lambda lp: (NBmodel.predict(lp.features), lp.label))
#                              .sortByKey(ascending=False).map(lambda kv: kv[1]).collect())
# findCoveragePercent(NB_scoredLabel, 0.4)

SVMSGDmodel = SVMWithSGD.train(train_set)
SVMSGDmodel.clearThreshold()
SVM_scoredLabel = numpy.array(test_set.map(lambda lp: (SVMSGDmodel.predict(lp.features), lp.label))
                              .sortByKey(ascending=False).map(lambda kv: kv[1]).collect())
SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 0.4))
SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 0.8))
SVM_percent.append(findCoveragePercent(SVM_scoredLabel, 1.0))

LRSGDmodel = LogisticRegressionWithSGD.train(train_set)
LRSGDmodel.clearThreshold()
LRSGD_scoredLabel = numpy.array(test_set.map(lambda lp: (LRSGDmodel.predict(lp.features), lp.label))
                                .sortByKey(ascending=False).map(lambda kv: kv[1]).collect())
LRSGD_percent.append(findCoveragePercent(LRSGD_scoredLabel, 0.4))
LRSGD_percent.append(findCoveragePercent(LRSGD_scoredLabel, 0.8))
LRSGD_percent.append(findCoveragePercent(LRSGD_scoredLabel, 1.0))

LRLBFGSmodel = LogisticRegressionWithLBFGS.train(train_set)
LRLBFGSmodel.clearThreshold()
LRLBFGS_scoredLabel = numpy.array(test_set.map(lambda lp: (LRLBFGSmodel.predict(lp.features), lp.label))
                                  .sortByKey(ascending=False).map(lambda kv: kv[1]).collect())
LRLBFGS_percent.append(findCoveragePercent(LRLBFGS_scoredLabel, 0.4))
LRLBFGS_percent.append(findCoveragePercent(LRLBFGS_scoredLabel, 0.8))
LRLBFGS_percent.append(findCoveragePercent(LRLBFGS_scoredLabel, 1.0))

def getAccumulatedPercentage(scoredLabel):
    result = []
    total = scoredLabel.sum()
    accum = 0
    for i in range(scoredLabel.size):
        accum += scoredLabel[i]
        result.append(accum / total)
    return result

SVM_accum = getAccumulatedPercentage(SVM_scoredLabel)
# $example off$
if __name__ == "__main__":
    sc = SparkContext(appName="BinaryClassificationMetricsExample")
    sqlContext = SQLContext(sc)
    # $example on$
    # Several of the methods available in Scala are currently missing from PySpark
    # Load training data in LIBSVM format
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_binary_classification_data.txt")

    # Split data into training (60%) and test (40%)
    training, test = data.randomSplit([0.6, 0.4], seed=11L)
    training.cache()

    # Run training algorithm to build the model
    model = LogisticRegressionWithLBFGS.train(training)

    # Compute raw scores on the test set
    predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

    # Instantiate metrics object
    metrics = BinaryClassificationMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = %s" % metrics.areaUnderPR)

    # Area under ROC curve
    print("Area under ROC = %s" % metrics.areaUnderROC)
    # $example off$
def train(self, data, **kwargs):
    model = LogisticRegressionWithLBFGS.train(data=data, **kwargs)
    model.clearThreshold()  # return raw scores instead of 0/1 labels
    self.model = model
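# A minimal usage sketch for the wrapper above; the class name and the RDDs
# are assumptions, not part of the original snippet:
clf = LBFGSClassifier()  # hypothetical wrapper class exposing train() above
clf.train(training_rdd, iterations=100, numClasses=2)
scores = test_rdd.map(lambda p: clf.model.predict(p.features))  # raw scores, threshold cleared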
print irisTrainRDD.take(2)
print irisTestRDD.take(2)

# COMMAND ----------

# MAGIC %md
# MAGIC Now, we can use MLlib's logistic regression on our `RDD` of `LabeledPoints`. Note that we'll use `LogisticRegressionWithLBFGS` as it tends to converge faster than `LogisticRegressionWithSGD`.

# COMMAND ----------

from pyspark.mllib.classification import LogisticRegressionWithLBFGS
help(LogisticRegressionWithLBFGS)

# COMMAND ----------

mllibModel = LogisticRegressionWithLBFGS.train(irisTrainRDD, iterations=1000, regParam=0.0)

# COMMAND ----------

# MAGIC %md
# MAGIC Let's calculate our accuracy using `RDDs`.

# COMMAND ----------

rddPredictions = mllibModel.predict(irisTestRDD.values())
predictAndLabels = rddPredictions.zip(irisTestRDD.keys())

mllibAccuracy = predictAndLabels.map(lambda (p, l): p == l).mean()
print 'MLlib model accuracy: {0:.3f}'.format(mllibAccuracy)