def do_all(f_path, out_name):
    """Fit a linear regression on ``f_path`` and save per-row predictions.

    Each input line is parsed with ``parseKeepD`` into a ``(point, extra)``
    pair; pairs whose point is ``None`` are dropped. The column means and
    variances of the raw features are published through the module globals
    ``means`` and ``varis`` before ``conv_label_pt`` is applied to every point
    (presumably ``conv_label_pt`` reads those globals to scale the features --
    confirm against its definition).

    Args:
        f_path: Path handed to ``SparkContext.textFile``.
        out_name: Suffix appended to the module-level ``out_loc`` to form the
            directory written by ``saveAsTextFile``.

    Returns:
        ``(intercept, weights, se, disparity, ssr, N)`` where ``disparity`` is
        the RDD of ``(label, prediction, extra)`` triples, ``ssr`` is the sum
        of squared residuals, ``N`` is the row count and ``se`` is whatever
        ``std_errors`` returns.
        NOTE(review): ``disparity`` is returned after its SparkContext has
        been stopped, so callers cannot run further actions on it.
    """
    global means
    global varis

    sc = SparkContext()
    try:
        data = sc.textFile(f_path)
        data = data.map(parseKeepD).filter(lambda p: p[0] is not None)

        # Scale features: expose the column statistics, then convert the points.
        features = data.map(lambda x: x[0].features)
        summary = Statistics.colStats(features)
        means = summary.mean()
        varis = summary.variance()
        data = data.map(lambda y: (conv_label_pt(y[0]), y[1]))

        # Train on the points only; the extra field is carried along for output.
        model = LinearRegressionWithSGD().train(
            data.map(lambda x: x[0]), intercept=True, regType='none')

        # (label, prediction, extra) for every row.
        disparity = data.map(
            lambda p: (p[0].label, model.predict(p[0].features), p[1]))
        ssr = disparity.map(lambda x: (x[0] - x[1]) ** 2).sum()
        N = disparity.count()
        MSE = ssr / float(N)
        se = std_errors(data, MSE, N)
        disparity.saveAsTextFile(out_loc + out_name)
    finally:
        # Always release the context, even when parsing or training fails.
        sc.stop()
    return model.intercept, model.weights, se, disparity, ssr, N
def test_regression(self):
    """Check prediction signs of every MLlib regressor on four toy points.

    The label equals the sign of the second feature, so each trained model
    must predict ``<= 0`` for rows 0 and 2 and ``> 0`` for rows 1 and 3.
    Finally verifies that passing ``initialWeights`` does not raise
    ``ValueError``.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    points = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(points)
    inputs = [pt.features.tolist() for pt in points]

    def check_signs(fitted):
        # Same four assertions, in the same order, for every model.
        self.assertTrue(fitted.predict(inputs[0]) <= 0)
        self.assertTrue(fitted.predict(inputs[1]) > 0)
        self.assertTrue(fitted.predict(inputs[2]) <= 0)
        self.assertTrue(fitted.predict(inputs[3]) > 0)

    check_signs(LinearRegressionWithSGD.train(rdd, iterations=10))
    check_signs(LassoWithSGD.train(rdd, iterations=10))
    check_signs(RidgeRegressionWithSGD.train(rdd, iterations=10))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    check_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4))
    check_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
        maxBins=4, seed=1))
    check_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4))

    # Supplying explicit initial weights must be accepted by all SGD trainers.
    try:
        LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
    except ValueError:
        self.fail()
def iterateLRwSGDBatch(iterNums, stepSizes, fractions, train, valid):
    """Print train/validation RMSE for every (iterations, step, fraction) combo.

    For each combination a ``LinearRegressionWithSGD`` model with intercept is
    fitted on ``train``; one line is printed with the combination followed by
    the root-mean-squared error on ``train`` and on ``valid``. Returns nothing.
    """
    def root_mean_sq(rdd, fitted):
        # sqrt(mean((prediction - label)^2)) over the given RDD.
        pairs = rdd.map(lambda x: (fitted.predict(x.features), x.label))
        return math.sqrt(pairs.map(lambda p: pow(p[0]-p[1],2)).mean())

    grid = ((n, s, f) for n in iterNums for s in stepSizes for f in fractions)
    for numIter, step, miniBFraction in grid:
        trainer = LinearRegressionWithSGD()
        fitted = trainer.train(train, intercept=True, iterations=numIter,
                               step=step, miniBatchFraction=miniBFraction)
        train_err = root_mean_sq(train, fitted)
        valid_err = root_mean_sq(valid, fitted)
        print("%d, %5.3f %5.3f -> %.4f, %.4f" % (numIter, step, miniBFraction, train_err, valid_err))
def iterateLRwSGD(iterNums, stepSizes, train, valid):
    """Print train/validation RMSE for every (iterations, step) combination.

    A ``LinearRegressionWithSGD`` model with intercept is fitted on ``train``
    for each combination; predictions are cast to ``float`` before the
    root-mean-squared error on ``train`` and ``valid`` is printed.
    Returns nothing.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD
    import math

    def root_mean_sq(rdd, fitted):
        # sqrt(mean((prediction - label)^2)) over the given RDD.
        pairs = rdd.map(lambda x: (float(fitted.predict(x.features)), x.label))
        return math.sqrt(pairs.map(lambda p: pow(p[0]-p[1],2)).mean())

    for numIter, step in ((n, s) for n in iterNums for s in stepSizes):
        trainer = LinearRegressionWithSGD()
        fitted = trainer.train(train, iterations=numIter, step=step, intercept=True)
        train_err = root_mean_sq(train, fitted)
        valid_err = root_mean_sq(valid, fitted)
        print("%d, %5.3f -> %.4f, %.4f" % (numIter, step, train_err, valid_err))
def test_regression(self):
    """Check SGD regressors on sparse (scipy) feature vectors.

    Four points are built with ``self.scipy_matrix``; the label equals the
    sign of feature 1, so every model must predict ``<= 0`` for rows 0 and 2
    and ``> 0`` for rows 1 and 3.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD

    points = [
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(points)
    inputs = [pt.features for pt in points]

    def check_signs(fitted):
        # Rows alternate negative / positive label.
        self.assertTrue(fitted.predict(inputs[0]) <= 0)
        self.assertTrue(fitted.predict(inputs[1]) > 0)
        self.assertTrue(fitted.predict(inputs[2]) <= 0)
        self.assertTrue(fitted.predict(inputs[3]) > 0)

    check_signs(LinearRegressionWithSGD.train(rdd))
    check_signs(LassoWithSGD.train(rdd))
    check_signs(RidgeRegressionWithSGD.train(rdd))
def test_regression(self):
    """Check SGD regressors on four dense toy points.

    The label equals the sign of the second feature, so every model must
    predict ``<= 0`` for rows 0 and 2 and ``> 0`` for rows 1 and 3.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD

    points = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(points)
    inputs = [pt.features.tolist() for pt in points]

    def check_signs(fitted):
        # Rows alternate negative / positive label.
        self.assertTrue(fitted.predict(inputs[0]) <= 0)
        self.assertTrue(fitted.predict(inputs[1]) > 0)
        self.assertTrue(fitted.predict(inputs[2]) <= 0)
        self.assertTrue(fitted.predict(inputs[3]) > 0)

    check_signs(LinearRegressionWithSGD.train(rdd))
    check_signs(LassoWithSGD.train(rdd))
    check_signs(RidgeRegressionWithSGD.train(rdd))
def regression():
    """Train and save an L2-regularised precipitation regression model.

    Reads ``output + "/joinedResults"`` as parquet through the module-level
    ``sqlContext``, converts every row to
    ``(avg_prcp, yy, latitude, longitude)``, derives per-column bounds, turns
    the rows into training points with ``parsePointPrediction``, trains on an
    80/20 split (seed 0), prints the test mean squared error and saves the
    model under ``output + "/modelpath"``. Returns ``None``.
    """
    # Read the joined results written earlier as a parquet file.
    datadf = sqlContext.read.parquet(output + "/joinedResults")
    datadf.show()
    data = datadf.rdd.map(lambda w: (float(w.avg_prcp), int(w.yy),
                                     float(w.latitude), float(w.longitude)))

    # Tuples compare lexicographically, so element 0 of the max/min tuple is
    # the largest/smallest avg_prcp.
    max_prcp = data.max()
    min_prcp = data.min()
    lat = data.map(lambda x: (x[2])).cache()
    min_lat = lat.min()
    max_lat = lat.max()
    longt = data.map(lambda x: (x[3])).cache()
    min_long = longt.min()
    max_long = longt.max()
    # The year bounds are fixed constants rather than derived from the data.
    max_ = [max_prcp[0], float(2050), max_lat, max_long]
    min_ = [min_prcp[0], float(1990), min_lat, min_long]

    # Change the format to fit the LinearRegression library.
    parsedData = data.map(lambda x: parsePointPrediction(x, max_, min_)).cache()

    # Split data approximately into training (80%) and test (20%).
    trainData, testData = parsedData.randomSplit([0.8, 0.2], seed=0)
    trainData.cache()
    testData.cache()

    # Hyper-parameters were found by trial and error. ``intercept`` is passed
    # as a real boolean instead of the truthy string "true".
    model = LinearRegressionWithSGD.train(trainData, iterations=500,
                                          regType="l2", regParam=10,
                                          intercept=True)

    # Evaluate the model on test data. Indexing replaces the tuple-parameter
    # lambda, which is a syntax error on Python 3.
    valuesAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2) \
                        .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
    model.save(sc, output + "/modelpath")
    return
def evaluate(train, test, iterations, step, regParam, regType, intercept):
    """Train an SGD linear regression on ``train`` and score it on ``test``.

    Args:
        train: RDD of labeled points used for fitting.
        test: RDD of labeled points used for scoring.
        iterations, step, regParam, regType, intercept: Forwarded to
            ``LinearRegressionWithSGD.train``.

    Returns:
        ``[rmse, mae, rmsle]`` computed on ``test`` with the module-level
        helpers ``squarred_error``, ``abs_error`` and ``squared_log_error``.
    """
    model = LinearRegressionWithSGD.train(
        train, iterations, step, regParam=regParam, regType=regType,
        intercept=intercept)
    tp = test.map(lambda p: (p.label, model.predict(p.features)))

    # Indexing replaces tuple-parameter lambdas (a syntax error on Python 3).
    # NOTE(review): ``squarred_error`` is spelled as in the original call;
    # confirm a helper with this exact name exists.
    rmse = np.sqrt(tp.map(lambda pair: squarred_error(pair[0], pair[1])).mean())
    # Mean absolute error is the plain mean; it must not be square-rooted.
    mae = tp.map(lambda pair: abs_error(pair[0], pair[1])).mean()
    # Score this model's own predictions, not an unrelated global RDD.
    rmsle = np.sqrt(tp.map(lambda pair: squared_log_error(pair[0], pair[1])).mean())
    return [rmse, mae, rmsle]
def get_best_result(best_step_size, training_lp, testing_lp, iterations):
    """Train with the chosen step size and report the test RMSE as text.

    Args:
        best_step_size: SGD step size to train with.
        training_lp: RDD of labeled points used for fitting.
        testing_lp: RDD of labeled points used for scoring.
        iterations: Number of SGD iterations.

    Returns:
        A string naming the step size and the root-mean-squared error on
        ``testing_lp``.
    """
    model = LinearRegressionWithSGD.train(training_lp, iterations=iterations,
                                          step=best_step_size, regType='l2')
    values_and_preds = testing_lp.map(lambda p: (p.label, model.predict(p.features)))
    # Average the squared errors: the previous code took the square root of
    # their *sum*, which grows with the test-set size and is not an RMSE.
    MSE = values_and_preds.map(lambda vp: (vp[0] - vp[1]) ** 2).mean()
    RMSE = math.sqrt(MSE)
    return ('best step size got by cross validation cv: ' + str(best_step_size)
            + ', lowest RMSE: ' + str(RMSE))
def getRMSE(step_array):
    """Return the validation RMSE for each SGD step size.

    Trains on the module-level ``train_featureScoreTimeRDD`` (5000 iterations)
    and scores on the module-level ``val_featureScoreTimeRDD``.

    Args:
        step_array: Iterable of step sizes to try.

    Returns:
        List of ``(step, rmse)`` tuples in the order the steps were given.
    """
    # The validation-set size does not depend on the step; count it once.
    val_count = val_featureScoreTimeRDD.count()
    valRMSE_list = []
    for step in step_array:
        model = LinearRegressionWithSGD.train(train_featureScoreTimeRDD,
                                              iterations=5000, step=step)
        labelsAndPreds = val_featureScoreTimeRDD.map(
            lambda p: (p.label, model.predict(p.features)))
        # Indexing replaces the tuple-parameter lambda (invalid on Python 3).
        valMSE = labelsAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2) \
                               .reduce(lambda x, y: x + y) / val_count
        valRMSE_list.append((step, valMSE ** 0.5))
    return valRMSE_list
def linearRegression(features, sc, output_n):
    """Train on the first 70 collected rows and predict rows 70..115.

    The original body sliced the collected rows but then trained and predicted
    on undefined names (``training_data`` / ``testing_data``); the slices are
    now what is actually used.

    Args:
        features: RDD whose rows are assumed to be ``(label, feature_vector)``
            pairs -- TODO confirm against the caller.
        sc: SparkContext used to parallelise the two slices.
        output_n: Unused; kept for interface compatibility.

    Returns:
        ``(model, prediction)`` where ``prediction`` is an RDD of
        ``(label, predicted_value)`` pairs for the test slice.
    """
    features_and_label = features.collect()
    training_features_labels = features_and_label[0:70]
    testing_features_labels = features_and_label[70:116]

    training_data = sc.parallelize(
        [LabeledPoint(row[0], row[1]) for row in training_features_labels])
    testing_data = sc.parallelize(
        [LabeledPoint(row[0], row[1]) for row in testing_features_labels])

    # NOTE(review): iterations=0 leaves the model at its initial weights;
    # kept as written, confirm this is intended.
    linearregression_model = LinearRegressionWithSGD.train(
        training_data, iterations=0, regParam=200)
    prediction = testing_data.map(
        lambda line: (line.label, linearregression_model.predict(line.features)))
    return linearregression_model, prediction
def LinearRegressionModel(dataPath, label, normalize, character, master, ispca):
    """Fit a linear regression on a delimited text file and summarise it.

    Args:
        dataPath: Path handed to ``SparkContext.textFile``.
        label: ``0`` reverses every parsed row before use; any other value
            leaves rows as parsed.
        normalize: ``1`` passes the collected rows through the module-level
            ``norm`` helper, whose result is indexed as ``(label, features)``;
            otherwise the last column is the label and the rest are features.
        character: Field separator.
        master: Spark master URL.
        ispca: ``1`` reduces the features to two components with ``PCA``
            before training.

    Returns:
        A multi-line string with the model weights and the mean absolute
        error over the training rows.
    """
    pca_n = 2
    sc = SparkContext(master)
    try:
        data = sc.textFile(dataPath)
        # Build real lists: on Python 3 ``map`` returns an iterator, which
        # cannot be reversed or sliced below.
        ndata = data.map(lambda line: line.split(character)) \
                    .map(lambda part: [float(x) for x in part])
        if label == 0:
            ndata = ndata.map(lambda row: row[::-1])

        if normalize == 1:
            test_data = norm(ndata.collect())
            train_data = sc.parallelize(test_data).map(
                lambda part: lbp(part[0], part[1]))
        else:
            test_data = ndata.map(lambda part: (part[-1], part[:-1])).collect()
            train_data = ndata.map(lambda part: lbp(part[-1], part[:-1]))

        if ispca == 1:
            pca = PCA(n_components=pca_n)
            pca_train = [row[1] for row in test_data]
            pca_data = pca.fit(pca_train).transform(pca_train)
            test = [[row[0], reduced] for row, reduced in zip(test_data, pca_data)]
            train_data = sc.parallelize(test).map(lambda part: lbp(part[0], part[1]))
            test_data = test

        model_lr = lr.train(train_data)

        # ``train_data`` is built row-for-row from ``test_data``, so the local
        # list gives the row count without another Spark job.
        size = len(test_data)
        err_lr = 0.0
        for row in test_data:
            err_lr = err_lr + abs(model_lr.predict(row[1]) - row[0])
        print("result: %s" % (err_lr / size))

        return "\n".join([
            "Linear Regression Result:",
            str(model_lr.weights),
            "Error: " + str(err_lr / size),
        ])
    finally:
        # Release the context even when parsing or training fails.
        sc.stop()
def test_regression(self):
    """Check prediction signs of SGD and tree regressors with default settings.

    The label equals the sign of the second feature, so each trained model
    must predict ``<= 0`` for rows 0 and 2 and ``> 0`` for rows 1 and 3.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    points = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(points)
    inputs = [pt.features.tolist() for pt in points]

    def check_signs(fitted):
        # Rows alternate negative / positive label.
        self.assertTrue(fitted.predict(inputs[0]) <= 0)
        self.assertTrue(fitted.predict(inputs[1]) > 0)
        self.assertTrue(fitted.predict(inputs[2]) <= 0)
        self.assertTrue(fitted.predict(inputs[3]) > 0)

    check_signs(LinearRegressionWithSGD.train(rdd))
    check_signs(LassoWithSGD.train(rdd))
    check_signs(RidgeRegressionWithSGD.train(rdd))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    check_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
    check_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100))
    check_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
def get_best_stepsize(step_sizes, training_lp, testing_lp, iterations):
    """Pick the SGD step size with the lowest RMSE on ``testing_lp``.

    Args:
        step_sizes: Iterable of candidate step sizes.
        training_lp: RDD of labeled points used for fitting.
        testing_lp: RDD of labeled points used for scoring.
        iterations: Number of SGD iterations per candidate.

    Returns:
        A string naming the best step size and its RMSE. With no candidates
        the string reports step size ``0`` and RMSE ``inf``.
    """
    best_stepsize = 0
    lowest_RMSE = float("inf")
    for step_size in step_sizes:
        model = LinearRegressionWithSGD.train(training_lp, iterations=iterations,
                                              step=step_size)
        values_and_preds = testing_lp.map(
            lambda p: (p.label, model.predict(p.features)))
        # Average the squared errors; the square root of their *sum* (the
        # previous computation) is not an RMSE. The ranking of the candidates
        # is unchanged because every candidate is scored on the same test set.
        MSE = values_and_preds.map(lambda vp: (vp[0] - vp[1]) ** 2).mean()
        RMSE = math.sqrt(MSE)
        if RMSE < lowest_RMSE:
            lowest_RMSE = RMSE
            best_stepsize = step_size
    return ('best step size: ' + str(best_stepsize)
            + ', lowest RMSE: ' + str(lowest_RMSE))
def LinearRegression(filename, sc):
    """Train an SGD linear regression on ``filename`` and print the training MSE.

    Args:
        filename: Text file parsed line by line with the module-level
            ``parsePoint``. The argument used to be overwritten
            unconditionally; the bundled sample path is now only a fallback
            for an empty or ``None`` value.
        sc: Active SparkContext.
    """
    if not filename:
        filename = "/Users/Jacob/repository/SparkService/data/lpsa.data"
    data = sc.textFile(filename)
    parsedData = data.map(parsePoint)

    # Train the model.
    model = LinearRegressionWithSGD.train(parsedData)

    # Evaluate the model on the training data. Indexing replaces the
    # tuple-parameter lambda, which is a syntax error on Python 3.
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2) \
                        .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("\n\n\n\n\n\nMean Squared Error = " + str(MSE) + "\n\n\n\n\n")

    # Save and load model
    #model.save(sc, "myModelPath")
    #sameModel = LinearRegressionModel.load(sc, "myModelPath")
def algo(a):
    """Fit the requested model on the module-level data and print a prediction.

    Reads the module globals ``data``, ``week``, ``target``, ``sc`` and
    ``convert``. Only ``a == 'sgd'`` is handled: a linear regression is
    trained for 10 iterations on the single point ``(target, data)``, then the
    absolute prediction for ``week`` and the elapsed wall-clock time in
    seconds are printed.

    Args:
        a: Algorithm selector; any value other than ``'sgd'`` does nothing
            beyond the preparation steps.
    """
    test = week
    # NOTE(review): the result is unused; the call is kept in case ``convert``
    # has side effects.
    week_target = week.map(convert)
    data_final = LabeledPoint(target, data)

    if a == 'sgd':
        # The timer start had been commented out, leaving ``time_0`` undefined
        # at the final print.
        time_0 = time.time()
        # ``parallelize`` needs a collection, so wrap the single point in a list.
        lrm = LinearRegressionWithSGD.train(sc.parallelize([data_final]),
                                            iterations=10,
                                            initialWeights=np.array([1.0]))
        print(abs(lrm.predict(test)))
        print(time.time() - time_0)
def test_spark():
    """Smoke-test MLlib linear regression on the bundled ``lpsa.data`` sample.

    Parses every line into a labeled point, prints the parsed data, trains a
    default ``LinearRegressionWithSGD`` model through the module-level ``sc``
    and prints the training mean squared error and the model.
    """
    def parsePoint(line):
        # "label,f1 f2 ..." -> LabeledPoint(label, [f1, f2, ...])
        values = [float(x) for x in line.replace(',', ' ').split(' ')]
        return LabeledPoint(values[0], values[1:])

    data = sc.textFile(r"/usr/local/Cellar/apache-spark/1.6.1/libexec/data/mllib/ridge-data/lpsa.data")
    parsedData = data.map(parsePoint)
    print(parsedData.collect())

    # Build the model.
    model = LinearRegressionWithSGD.train(parsedData)

    # Evaluate the model on the training data. Indexing replaces the
    # tuple-parameter lambda, which is a syntax error on Python 3.
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2) \
                        .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
    print("Model coefficients: " + str(model))
def linearRegression(features, sc, output_n):
    """Train on the first 70 collected rows and predict the remainder.

    Args:
        features: RDD whose rows are indexed as ``(label, feature_vector)``.
        sc: SparkContext used to parallelise the two slices.
        output_n: Unused; kept for interface compatibility.

    Returns:
        RDD of ``(label, predicted_value)`` pairs for rows 70 onwards, with
        the prediction cast to ``float``.
    """
    features_and_label = features.collect()
    labeled_training = [LabeledPoint(x[0], x[1]) for x in features_and_label[0:70]]
    labeled_testing = [LabeledPoint(y[0], y[1]) for y in features_and_label[70:]]

    test = sc.parallelize(labeled_testing)
    # ``train`` expects an RDD of LabeledPoint; a plain list was passed before.
    # NOTE(review): iterations=0 leaves the model at its initial weights;
    # kept as written, confirm this is intended.
    linearregression_model = LinearRegressionWithSGD.train(
        sc.parallelize(labeled_training), iterations=0, regParam=200)
    predictions = test.map(
        lambda line: (line.label, float(linearregression_model.predict(line.features))))
    return predictions
def linearRegression_f(mode):
    """Train the regression variant named by ``mode`` and return its training MSE.

    Uses the module-level ``parsedData`` RDD for both fitting and scoring.

    Args:
        mode: ``"no_reg"`` (plain SGD), ``"L1_reg"`` (lasso) or ``"L2_reg"``
            (ridge).

    Returns:
        Mean squared error of the trained model on ``parsedData``.

    Raises:
        ValueError: If ``mode`` is not one of the three supported names. The
            previous code only printed a message and then failed on the
            unbound ``model`` variable.
    """
    if mode not in ("no_reg", "L1_reg", "L2_reg"):
        raise ValueError("ERROR Mode: %r" % (mode,))

    if mode == "no_reg":
        model = LinearRegressionWithSGD.train(parsedData)
    elif mode == "L1_reg":
        model = LassoWithSGD.train(parsedData)
    else:
        model = RidgeRegressionWithSGD.train(parsedData)

    # Pair every label with its prediction, then average the squared errors.
    # Indexing replaces the tuple-parameter lambda (invalid on Python 3).
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2) \
                        .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    return MSE
def LinearRegression(trainFile, testFile, taskid, sc):
    """Train on a LibSVM file and print the mean squared error on another.

    Args:
        trainFile: LibSVM-format file used for fitting.
        testFile: LibSVM-format file used for scoring.
        taskid: Unused in this function; kept for interface compatibility.
        sc: Active SparkContext.
    """
    trainData = MLUtils.loadLibSVMFile(sc, trainFile)
    testData = MLUtils.loadLibSVMFile(sc, testFile)

    # Train the model.
    model = LinearRegressionWithSGD.train(trainData)

    # Evaluate on the held-out file; each pair is (label, prediction).
    # Indexing replaces the tuple-parameter lambda (invalid on Python 3).
    labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
    MSE = labelsAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2) \
                        .reduce(lambda x, y: x + y) / labelsAndPreds.count()
    print("\n\n\n\n\n\nMean Squared Error = " + str(MSE) + "\n\n\n\n\n")

    # Save and load model
    #model.save(sc, "myModelPath")
    #sameModel = LinearRegressionModel.load(sc, "myModelPath")
def get_best_stepsize(step_sizes, training_lp, iterations, cv_trails):
    """Choose the SGD step size with the lowest 4-fold cross-validated RMSE.

    Args:
        step_sizes: Iterable of candidate step sizes.
        training_lp: RDD of labeled points, split into four random folds.
        iterations: Number of SGD iterations per fit.
        cv_trails: Kept for interface compatibility. It was previously used
            as the divisor of the fold average; the average now divides by
            the number of folds actually evaluated.

    Returns:
        The step size with the lowest average RMSE, or ``0`` when
        ``step_sizes`` is empty.
    """
    best_stepsize = 0
    lowest_RMSE = float("inf")
    num_folds = 4
    cv_data = training_lp.randomSplit([1] * num_folds)

    for step_size in step_sizes:
        total_RMSE = 0.0
        for cv_testing in cv_data:
            cv_training = training_lp.subtract(cv_testing)
            model = LinearRegressionWithSGD.train(cv_training,
                                                  iterations=iterations,
                                                  step=step_size)
            values_and_preds = cv_testing.map(
                lambda p: (p.label, model.predict(p.features)))
            # Mean (not sum) of squared errors, so folds of different sizes
            # contribute comparable RMSE values.
            MSE = values_and_preds.map(lambda vp: (vp[0] - vp[1]) ** 2).mean()
            total_RMSE += math.sqrt(MSE)
        avg_RMSE = total_RMSE / num_folds
        if avg_RMSE < lowest_RMSE:
            lowest_RMSE = avg_RMSE
            best_stepsize = step_size
    return best_stepsize
def train_amount_model(self, model, data, i):
    """Train (or continue training) the amount model with the configured method.

    Args:
        model: Previously trained model or ``None``. It is forwarded to the
            neural-network trainer, and its ``weights`` seed the linear
            regression when it is not ``None``.
        data: Local collection of training points; parallelised through
            ``self.sc``.
        i: Unused here; kept for interface compatibility.

    Returns:
        The newly trained model.

    Raises:
        ValueError: If ``self.amount_prediction_method`` is not one of the
            supported method constants.
    """
    rdd_data = self.sc.parallelize(data)
    self.logger.info('Start to train the amount model')
    method = self.amount_prediction_method

    if method == self.ARTIFICIAL_NEURAL_NETWORK:
        input_num = self.feature_num
        # Floor division keeps the layer sizes integral on Python 3 as well
        # (``/`` yields floats there).
        layers = [input_num, input_num // 3 * 2, input_num // 3, 1]
        neural_network = NeuralNetworkSpark(layers=layers, bias=0)
        model = neural_network.train(rdd_data, method=neural_network.BP,
                                     seed=1234, learn_rate=0.0001,
                                     iteration=15, model=model)
    elif method == self.RANDOM_FOREST:
        model = RandomForest.trainRegressor(rdd_data, categoricalFeaturesInfo={},
                                            numTrees=40,
                                            featureSubsetStrategy="auto",
                                            impurity='variance', maxDepth=20,
                                            maxBins=32)
    elif method == self.LINEAR_REGRESSION:
        # Warm-start from the previous model's weights when one is supplied.
        model = LinearRegressionWithSGD.train(
            rdd_data, iterations=10000, step=0.001,
            initialWeights=model.weights if model is not None else None)
    else:
        message = "Unknown training method {}".format(method)
        self.logger.error(message)
        raise ValueError(message)
    return model
def test_regression(self):
    """Check SGD regressors and a decision tree on sparse (scipy) features.

    Four points are built with ``self.scipy_matrix``; the label equals the
    sign of feature 1, so every model must predict ``<= 0`` for rows 0 and 2
    and ``> 0`` for rows 1 and 3.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree

    points = [
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(points)
    inputs = [pt.features for pt in points]

    def check_signs(fitted):
        # Rows alternate negative / positive label.
        self.assertTrue(fitted.predict(inputs[0]) <= 0)
        self.assertTrue(fitted.predict(inputs[1]) > 0)
        self.assertTrue(fitted.predict(inputs[2]) <= 0)
        self.assertTrue(fitted.predict(inputs[3]) > 0)

    check_signs(LinearRegressionWithSGD.train(rdd))
    check_signs(LassoWithSGD.train(rdd))
    check_signs(RidgeRegressionWithSGD.train(rdd))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    check_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
def learn_model(sc, file_path, normalize):
    """Retrain an L1 linear regression with more iterations until the MSE repeats.

    The file is tab-separated: column 1 is the label and columns 2 onwards are
    the features (column 0 is ignored). Rows are split 70/30 (seed 11).
    Training starts at 100 iterations and adds 100 per round; the loop stops
    when two consecutive rounds give exactly the same test MSE.

    Args:
        sc: Active SparkContext.
        file_path: Path of the tab-separated feature file.
        normalize: When truthy, features go through ``Normalizer`` first.

    Returns:
        The model from the final round.
    """
    feature_file = sc.textFile(file_path).map(lambda l: l.split("\t"))
    points = feature_file.map(lambda f: LabeledPoint(f[1], f[2:]))

    if normalize:
        nor = Normalizer()
        labels = points.map(lambda x: x.label)
        features = points.map(lambda x: x.features)
        points = labels.zip(nor.transform(features))
        points = points.map(lambda i: LabeledPoint(i[0], i[1]))

    training, testing = points.randomSplit([0.7, 0.3], 11)
    iterations = 100
    p_mse = -1
    converge = False
    result = {}
    # NOTE(review): convergence relies on exact float equality of two
    # consecutive MSE values; the loop does not end if they never repeat.
    while not converge:
        # Wall-clock timing; ``time.clock`` was removed in Python 3.8.
        started = time.time()
        model = LinearRegressionWithSGD.train(training, iterations=iterations,
                                              step=0.00001, intercept=True,
                                              regType="l1")
        print("========== time = " + str(time.time() - started))
        preds = testing.map(lambda p: (p.label, model.predict(p.features)))
        MSE = preds.map(lambda r: (r[1] - r[0]) ** 2) \
                   .reduce(lambda x, y: x + y) / preds.count()
        print("========== MSE = " + str(MSE))
        if p_mse == MSE:
            converge = True
        # Record under the iteration count that produced this MSE; it used to
        # be stored after the increment, i.e. under a key 100 too large.
        result[iterations] = MSE
        iterations = iterations + 100
        p_mse = MSE
    print(result)
    return model
regType=reg_type, intercept=intercept) # use test data -> rdd: [(actual_value, prdict_value), (...), (...), ......] tlabel_tprediction = train_set.map( lambda point: (point.label, model.predict(point.features))) # calculate Root Mean Squared Log Error rmsle = np.sqrt( tlabel_tprediction.map( lambda tp: squared_log_error(tp[0], tp[1])).mean()) return rmsle if __name__ == '__main__': # create linear model and test linear_model = LinearRegressionWithSGD.train(data, iterations=200, step=0.05, intercept=False) linear_model.save( sc, 'PricePrediction/model/LR.model') # save the trained model to local true_vs_predicted = data.map( lambda point: (point.label, linear_model.predict(point.features))) print('线性回归模型对前5个样本的预测值: ' + str(true_vs_predicted.take(5))) # test ''' same_md = LinearRegressionModel.load(sc, 'PricePrediction/model/LR.model') true_vs_predicted = data.map(lambda point: (point.label, same_md.predict(point.features))) print(str(true_vs_predicted.take(2))) ''' # error analysis m_s_e = true_vs_predicted.map(
# Hold out a ~10% sample (without replacement, seed 100) as the test set and
# train on everything else. Records are keyed by their index so the sample
# can be subtracted from the full set.
# Indexing replaces tuple-parameter lambdas, which are a syntax error on
# Python 3; the print calls work on both Python 2 and 3.
data_with_idx = data.zipWithIndex().map(lambda kv: (kv[1], kv[0]))
test = data_with_idx.sample(False, 0.1, 100)
train = data_with_idx.subtractByKey(test)
train_data = train.map(lambda idx_p: idx_p[1])
test_data = test.map(lambda idx_p: idx_p[1])
train_size = train_data.count()
test_size = test_data.count()
print("Training data size: %d" % train_size)
print("Test data size: %d" % test_size)
print("Total data size: %d " % num_data)
print("Train + Test size : %d" % (train_size + test_size))

# Make the linear regression horsepower model.
linear_model_hp = LinearRegressionWithSGD.train(train_data, iterations=100,
                                                step=0.0000001, intercept=False)
linear_model_hp

# Make predictions and measure error: (label, prediction) per test record.
true_vs_predicted = test_data.map(
    lambda p: (p.label, linear_model_hp.predict(p.features)))


def squared_error(actual, pred):
    """Return the squared difference between ``pred`` and ``actual``."""
    return (pred - actual)**2


def squared_log_error(pred, actual):
    """Return the squared difference of ``log(x + 1)`` for the two values.

    The expression is symmetric in its arguments, so the parameter order
    (which differs from ``squared_error``) does not affect the result.
    """
    return (np.log(pred + 1) - np.log(actual + 1))**2
# In[77]:
from pyspark.mllib.regression import LinearRegressionWithSGD
# Values to use when training the linear regression model
numIters = 500  # iterations
alpha = 1.0  # step
miniBatchFrac = 1.0  # miniBatchFraction
reg = 1e-1  # regParam
regType = 'l2'  # regType
useIntercept = True  # intercept

# In[79]:
# Keyword arguments make the mapping to ``train``'s parameters explicit
# instead of relying on positional order.
firstModel = LinearRegressionWithSGD.train(parsedTrainData,
                                           iterations=numIters,
                                           step=alpha,
                                           miniBatchFraction=miniBatchFrac,
                                           initialWeights=None,
                                           regParam=reg,
                                           regType=regType,
                                           intercept=useIntercept)

# weightsLR1 stores the model weights; interceptLR1 stores the model intercept
weightsLR1 = firstModel.weights
interceptLR1 = firstModel.intercept
# Works on Python 2 and 3 and prints the same "weights intercept" line.
print("%s %s" % (weightsLR1, interceptLR1))

# In[80]:
# TEST LinearRegressionWithSGD (4a)
expectedIntercept = 13.3335907631
expectedWeights = [16.682292427, 14.7439059559, -0.0935105608897, 6.22080088829,
                   4.01454261926, -3.30214858535, 11.0403027232, 2.67190962854,
                   7.18925791279, 4.46093254586, 8.14950409475, 2.75135810882]
Test.assertTrue(np.allclose(interceptLR1, expectedIntercept), 'incorrect value for interceptLR1')
Test.assertTrue(np.allclose(weightsLR1, expectedWeights), 'incorrect value for weightsLR1')
STEP_SIZE = 0.00000001


def parse_data(line):
    """Parse one comma/space separated line into a ``LabeledPoint``.

    The first value is the label; the feature vector holds the first and the
    last value of the line.
    NOTE(review): the label (``data[0]``) is also used as a feature -- confirm
    this is intended.
    """
    data = [float(n) for n in line.replace(',', ' ').split(' ')]
    return LabeledPoint(data[0], Vectors.dense(data[0], data[len(data) - 1]))


# ---------------------------------------------------------------------------
sc = SparkContext("local", "linear_regression_app")

file_content = sc.textFile(FILE_PATH).cache()
print(f'file_content.count = { file_content.count() }')

data = file_content.map(parse_data).cache()
print(f'data.count = { data.count() }')

model = LinearRegressionWithSGD.train(data, NUM_ITERATIONS, STEP_SIZE)

# Each element is (actual label, predicted value).
predictions = data.map(lambda point: (point.label, model.predict(point.features)))

# point[0] is the label and point[1] the prediction; the two captions were
# swapped in the printed output before.
predictions.foreach(
    lambda point: print(f"Predicted: { point[1] }\t| Actual: { point[0] }"))

mse = predictions.map(lambda point: pow((point[0] - point[1]), 2)).mean()
print(f'Training Mean Squared Error = { mse }')
CVTrainData = ZippedData.filter(lambda tup: tup[1]<int(TSize*i) or tup[1]>int(TSize*(i+1))).map(lambda x:x[0]) CVTestData = ZippedData.filter(lambda tup: tup[1]>int(TSize*i) and tup[1]<int(TSize*(i+1))).map(lambda x:x[0]) model = LinearRegressionWithSGD.train(CVTrainData, iterations=10000, step=0.01, regType='l1', regParam=0.1) values_and_preds = CVTestData.map(lambda p: (p.label, model.predict(p.features))) RMSE = sqrt(values_and_preds.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y)/values_and_preds.count()) total_rmse += RMSE MAE = values_and_preds.map(lambda vp: abs(vp[0] - vp[1])).reduce(lambda x, y: x + y)/values_and_preds.count() total_mae += MAE print(RMSE) print(MAE) print("Avg Root Mean Squared Error on CV = " + str(total_rmse/folds)) print("Avg Mean Absolute Error on CV = " + str(total_mae/folds)) """ test_model = LinearRegressionWithSGD.train(parsed_train_data, iterations=10000, step=0.01, regType='l1', regParam=0.1) values_and_preds = parsed_test_data.map( lambda p: (p.label, test_model.predict(p.features))) TestRMSE = sqrt( values_and_preds.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / values_and_preds.count()) print("Root Mean Squared Error on Test Data = " + str(TestRMSE)) TestMAE = values_and_preds.map(lambda vp: abs(vp[0] - vp[1])).reduce( lambda x, y: x + y) / values_and_preds.count() print("TMean Absolute Error on Test Data = " + str(TestMAE))
categoricalFeaturesInfo={}, numTrees=5, impurity='variance', maxDepth=4, maxBins=32) predictionsRF = modelRF.predict(testData.map(lambda x: x.features)) #Gradient Boosted Model modelGB = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo={}, numIterations=3) predictionsGB = modelGB.predict(testData.map(lambda x: x.features)) #Linear Regression Model modelLin = LinearRegressionWithSGD.train(trainingData, iterations=100, step=0.00000001) predictionsLin = modelLin.predict(testData.map(lambda x: x.features)) resultsRF = predictionsRF.collect() resultsGB = predictionsGB.collect() resultsLin = predictionsLin.collect() testDataList = testData.collect() #iterator for the test data array count = 0 print("Random Forest") for item in resultsRF: #Retrieve the actual salary from the labeledpoint salaryMatch = testDataList[count].label #Find the player who has this same salaryc
if __name__ == "__main__": sc = SparkContext(appName="Regression Metrics Example") # $example on$ # Load and parse the data def parsePoint(line): values = line.split() return LabeledPoint(float(values[0]), DenseVector([float(x.split(':')[1]) for x in values[1:]])) data = sc.textFile("data/mllib/sample_linear_regression_data.txt") parsedData = data.map(parsePoint) # Build the model model = LinearRegressionWithSGD.train(parsedData) # Get predictions valuesAndPreds = parsedData.map(lambda p: (float(model.predict(p.features)), p.label)) # Instantiate metrics object metrics = RegressionMetrics(valuesAndPreds) # Squared Error print("MSE = %s" % metrics.meanSquaredError) print("RMSE = %s" % metrics.rootMeanSquaredError) # R-squared print("R-squared = %s" % metrics.r2) # Mean absolute error
# Scaled copies of the three splits; all six RDDs are cached because they are
# reused below.
train_scaled = getScaledData(train)
train_val_scaled = getScaledData(train_val)
test_scaled = getScaledData(test)
train.cache()
train_scaled.cache()
train_val.cache()
train_val_scaled.cache()
test.cache()
test_scaled.cache()

iter = 10**4
step = 10**(-5)
# NOTE(review): the model is fitted on the *unscaled* ``train`` split but is
# evaluated on the scaled validation/test splits below -- confirm which of
# the two was intended.
model = LinearRegressionWithSGD.train(train, iter, step)  # iter, step size

# Predict, then pair every prediction with its label as (prediction, label).
# Indexing replaces tuple-parameter lambdas (a syntax error on Python 3).
predictions_val = model.predict(train_val_scaled.map(lambda x: x.features))
labelsAndPreds_val = train_val_scaled.map(lambda lp: lp.label).zip(
    predictions_val).map(lambda ab: (ab[1], ab[0]))
predictions = model.predict(test_scaled.map(lambda x: x.features))
labelsAndPreds = test_scaled.map(lambda lp: lp.label).zip(predictions).map(
    lambda ab: (ab[1], ab[0]))

# Append the validation metrics to the report file.
result = open('hw4.txt', 'a')
result.write('---------------\n')
result.write('Validation\n')
result.write('MAE: %.5f\n' % getMAE(labelsAndPreds_val))
result.write('RMSE: %.5f\n\n' % getRMSE(labelsAndPreds_val))
label = clean_line_split[10] nonlabel = clean_line_split[0:10] + clean_line_split[11:] return LabeledPoint(label, nonlabel) data_file = sc.textFile("s3://aws-logs-012060642840-us-west-2/elasticmapreduce/cloud_proj/00-08.csv").cache () header = data_file.first () raw_data = data_file.filter (lambda x:x != header) parsedData = raw_data.map(parsePoint) (trainingData, testData) = parsedData.randomSplit([0.7, 0.3]) startTime = datetime.now() # Build the model trainingData.cache () model = LinearRegressionWithSGD.train(trainingData, iterations=1) print ('Training Time consumed = '), (datetime.now() - startTime) startTestTime = datetime.now() # Evaluating the model on training data valuesAndPreds = testData.map(lambda p: (p.label, model.predict(p.features))) MSE = valuesAndPreds \ .map(lambda (v, p): (v - p)**2) \ .reduce(lambda x, y: x + y) / valuesAndPreds.count() print ('Testing Time consumed = '), (datetime.now() - startTestTime) print ('Total Time: '), (datetime.now() - startTime) print("Mean Squared Error = " + str(MSE)) # Save and load model model.save(sc, "LinearRegressionNarrow00-08_cache_both_train_and_test") sameModel = LinearRegressionModel.load(sc, "LinearRegressionNarrow00-08_cache_both_train_and_test")
# The print calls below work on both Python 2 and 3.
print("Training data size: %d" % train_size)
print("Test data size: %d" % test_size)
print("Total data size: %d" % num_data)
print("Train + Test size: %d" % (test_size + train_size))

# Keep only displacement and horsepower, dropping non-positive values.
df = records.map(lambda line: Row(Displacement=line[2], Horsepower=line[6])).toDF()
df.show(10)
df = df.select('Horsepower', 'Displacement')
df = df[df.Displacement > 0]
df = df[df.Horsepower > 0]
df.describe(['Horsepower', 'Displacement']).show()

# Horsepower is the label, displacement the only feature. Going through
# ``df.rdd`` also works on Spark 2+, where ``DataFrame.map`` no longer exists.
# NOTE(review): ``[line[1:]]`` wraps the row slice in another list, giving a
# nested feature container -- confirm ``LabeledPoint`` accepts it as intended.
temp = df.rdd.map(lambda line: LabeledPoint(line[0], [line[1:]]))
temp.take(5)

linearModel = LinearRegressionWithSGD.train(temp, 10000, 0.0001, intercept=False)
linearModel.weights
test_data.take(10)

true_vs_predicted = temp.map(lambda p: (p.label, linearModel.predict(p.features)))
print("Linear Model predictions: " + str(true_vs_predicted.take(100)))


def squared_error(actual, pred):
    """Return the squared difference between ``pred`` and ``actual``."""
    return (pred - actual) ** 2


def abs_error(actual, pred):
    """Return the absolute difference between ``pred`` and ``actual``."""
    return np.abs(pred - actual)
# Fetch the records for one postal code, convert them to normalised labeled
# points and distribute them.
dataset = db_client.iestimate.predictions81k_ecp_copy.find(
    {"postalcode": "01772"})
dataset = dsto_norm_labeled_points(dataset, features_regression_model)
dataset = sc.parallelize(dataset)

# Load and parse the data
# def parsePoint(line):
#     values = [float(x) for x in line.replace(',', ' ').split(' ')]
#     return LabeledPoint(values[0], values[1:])
#
# data = sc.textFile("data/mllib/ridge-data/lpsa.data")
# parsedData = data.map(parsePoint)
processed_data = dataset

# Build the model
model = LinearRegressionWithSGD.train(processed_data, iterations=300, step=0.01)

# Evaluate the model on training data. Indexing replaces the tuple-parameter
# lambda, which is a syntax error on Python 3.
valuesAndPreds = processed_data.map(lambda p: (p.label, model.predict(p.features)))
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).reduce(
    lambda x, y: x + y) / valuesAndPreds.count()
print("Mean Squared Error = " + str(MSE))

# Save and load model
model.save(sc, "myModelPath")
sameModel = LinearRegressionModel.load(sc, "myModelPath")
from pyspark.mllib.regression import LinearRegressionWithSGD, LabeledPoint


def textParser(type):
    """Read 'cards2.txt' and return a list of LabeledPoint objects.

    Each line is tab-separated.  The label is column ``type`` and the
    features are columns 2 up to (not including) the last one.

    type : 0 the lowest prices and 1 is the average price
    """
    datas = []
    # `with` closes the file even if a line fails to parse.
    with open('cards2.txt') as lines:
        for line in lines:
            features = line.strip().split('\t')
            datas.append(LabeledPoint(float(features[type]), features[2:-1]))
    return datas


if __name__ == '__main__':
    sc = SparkContext()
    datas = sc.parallelize(textParser(1))
    model = LinearRegressionWithSGD.train(datas, step=0.00000000174434, iterations=2000, regType='l2')
    # model = LinearRegressionWithSGD.train(datas, step=0.00000000175234766555555566666, iterations=5000, regType='l2')
    print('**' * 50)
    print(model.weights)
    print(model.intercept)
    print(model.predict(array([9409, 187533, 84500, 84572])))
    print('**' * 50)
    valuesAndPreds = datas.map(lambda p: (p.label, model.predict(p.features)))
    # One formatted string prints the same text on Python 2 and Python 3.
    print("%s %s" % (valuesAndPreds.collect(), valuesAndPreds.count()))
    # Index into the pair instead of using a Python-2-only tuple parameter.
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
    # 2016.1 9409 82200 82352 187533
    sc.stop()
from pyspark.mllib.regression import LinearRegressionWithSGD # Values to use when training the linear regression model numIters = 500 # iterations alpha = 1.0 # step miniBatchFrac = 1.0 # miniBatchFraction reg = 1e-1 # regParam regType = 'l2' # regType useIntercept = True # intercept # In[145]: # TODO: Replace <FILL IN> with appropriate code firstModel = LinearRegressionWithSGD.train(data=parsedTrainData, iterations=numIters, step=alpha, miniBatchFraction=miniBatchFrac, initialWeights=None, regParam=reg, regType=regType, intercept=useIntercept) # weightsLR1 stores the model weights; interceptLR1 stores the model intercept weightsLR1 = firstModel.weights interceptLR1 = firstModel.intercept print weightsLR1, interceptLR1 # In[146]: # TEST LinearRegressionWithSGD (4a) expectedIntercept = 13.3335907631 expectedWeights = [ 16.682292427, 14.7439059559, -0.0935105608897, 6.22080088829,
def test_regression(self):
    """Train each MLlib regressor on a tiny sign-separable set and check predictions.

    Points with a negative second feature carry label -1.0 and must predict
    <= 0; points with a positive second feature carry label 1.0 and must
    predict > 0.
    """
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD,
        LassoWithSGD,
        RidgeRegressionWithSGD,
    )
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    points = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2]),
    ]
    rdd = self.sc.parallelize(points)
    features = [point.features.tolist() for point in points]

    def check_signs(model):
        # Even positions hold the negative examples, odd positions the positive ones.
        for position, vector in enumerate(features):
            prediction = model.predict(vector)
            if position % 2 == 0:
                self.assertTrue(prediction <= 0)
            else:
                self.assertTrue(prediction > 0)

    check_signs(LinearRegressionWithSGD.train(rdd, iterations=10))
    check_signs(LassoWithSGD.train(rdd, iterations=10))
    check_signs(RidgeRegressionWithSGD.train(rdd, iterations=10))

    cat_info = {0: 2}  # feature 0 has 2 categories
    check_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=cat_info, maxBins=4))
    check_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=cat_info, numTrees=10, maxBins=4, seed=1))
    check_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=cat_info, numIterations=4))

    # Explicit initial weights must be accepted by every SGD trainer.
    try:
        for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
            trainer.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
    except ValueError:
        self.fail()

    # Verify that maxBins is being passed through
    GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=cat_info, numIterations=4, maxBins=32)
    with self.assertRaises(Exception):
        GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=cat_info, numIterations=4, maxBins=1)
total_cnt = type_cnt + number_cnt
#print type_cnt


def extract_features(fields):
    """Build the feature vector for one record.

    The vector has ``total_cnt`` slots.  Each column in ``type_columns``
    contributes a one-hot segment whose width is the size of that column's
    map in ``type_maps``.  The columns in ``number_columns`` are then
    appended as floats, one slot each.
    """
    step = 0
    features = np.zeros(total_cnt)
    for t_idx in type_columns:
        # `step` is the offset of this column's one-hot segment.
        features[step + int(type_maps[t_idx][fields[t_idx]])] = 1.0
        step = step + len(type_maps[t_idx])
    for n_idx in number_columns:
        features[step] = float(fields[n_idx])
        step = step + 1
    return features


data = raw_data.map(lambda fields: LabeledPoint(
    float(fields[saleprice_column]), extract_features(fields)))
#first_point= data.first()
#print "label of first point: %f" % first_point.label
#print "features of first point: %s" % str(first_point.features)
#print "feature vector length: %d" % len(first_point.features)

lrModel = LinearRegressionWithSGD.train(data, iterations=10, step=0.1, intercept=False)
actual_vs_pred = data.map(lambda p: (p.label, lrModel.predict(p.features)))
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(actual_vs_pred.take(10))
sc = SparkContext()
selcol = [1, 3, 4, 6, 18, 23, 25]
train = prep_Data("HW4/200[3-7].csv", selcol)
test = prep_Data("HW4/2008.csv", selcol)

# transform data into the format that can be fed into the model
trainLabeled = train.map(
    lambda line: LabeledPoint(extract_label(line), extract_features(line)))
testLabeled = test.map(
    lambda line: LabeledPoint(extract_label(line), extract_features(line)))

# preserve some part of the data as validation data
train_dataset, val_dataset = trainLabeled.randomSplit([0.7, 0.3])

# train: one model on the 70% split (scored on validation) and one on the
# full training set (scored on the 2008 test file)
linear_model_val = LinearRegressionWithSGD.train(train_dataset, 100000, 0.00000000001)
linear_model = LinearRegressionWithSGD.train(trainLabeled, 100000, 0.00000000001)
#evaluateModel(linear_model_val, val_dataset)
#evaluateModel(linear_model, testLabeled)

# evaluate data
mae_val, rmse_val = evaluateModel(linear_model_val, val_dataset)
mae, rmse = evaluateModel(linear_model, testLabeled)
# Parenthesised single-argument print works on both Python 2 and Python 3.
print("Validation: \n" + "MAE: " + str(mae_val) + "\nRMSE: " + str(rmse_val))
print("\nTest: \n" + "MAE: " + str(mae) + "\nRMSE: " + str(rmse))
# In[7]:

from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.tree import DecisionTree
help(LinearRegressionWithSGD.train)


# In[8]:

help(DecisionTree.trainRegressor)


# ## Train a Regression Model on the Bike Sharing Dataset

# In[9]:

linear_model = LinearRegressionWithSGD.train(data, iterations=10, step=0.1, intercept=False)
true_vs_predicted = data.map(lambda p: (p.label, linear_model.predict(p.features)))
# Parenthesised single-argument print works on both Python 2 and Python 3.
print("Linear Model predictions: " + str(true_vs_predicted.take(5)))


# In[10]:

# we pass in an empty mapping for categorical feature size {}
dt_model = DecisionTree.trainRegressor(data_dt, {})
# The tree predicts on the whole feature RDD; zip pairs each label with its
# prediction by position.
preds = dt_model.predict(data_dt.map(lambda p: p.features))
actual = data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print("Decision Tree predictions: " + str(true_vs_predicted_dt.take(5)))
print("Decision Tree depth: " + str(dt_model.depth()))
print("Decision Tree number of nodes: " + str(dt_model.numNodes()))
# Scatter plot of the raw (x, y) pairs.
pydf = DataFrame({'x':x,'y':y})
p = ggplot(pydf, aes('x','y')) + \
    geom_point(color='blue')
display(p)

# COMMAND ----------

# MAGIC %md ## Linear Regression with SGD
# MAGIC * Load and parse the data where y = Median Housing Price (values[1]) and x = Population (values[0])
# MAGIC * Building two example models
# MAGIC * Reference pyspark MLLib regression
# MAGIC * * http://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#module-pyspark.mllib.regression

# COMMAND ----------

# Two models that differ only in iteration count and step size.
modelA = LinearRegressionWithSGD.train(parseddata, iterations=100, step=0.01, intercept=True)
modelB = LinearRegressionWithSGD.train(parseddata, iterations=1500, step=0.1, intercept=True)

# COMMAND ----------

# Parenthesised single-argument print works on both Python 2 and Python 3.
print(">>>> ModelA intercept: %r, weights: %r" % (modelA.intercept, modelA.weights))

# COMMAND ----------

print(">>>> ModelB intercept: %r, weights: %r" % (modelB.intercept, modelB.weights))

# COMMAND ----------

# MAGIC %md ## Evaluate the Model
# MAGIC #### Predicted vs. Actual
# get the data from each stock csv file stocks = sc.textFile("hdfs:///shared/financial_data/stocks/permno_csv/" + selected_file) stocks = stocks.mapPartitions(lambda x: csv.reader(x)) # map and filter the data to (stock, time) labeled_data = stocks.map(map_to_point) labeled_data = labeled_data.filter(lambda x: x) labeled_data = labeled_data.map(lambda x: LabeledPoint(x[0], x[1])).cache() training, test = labeled_data.randomSplit([0.7, 0.3]) # verify that the data exists if training.isEmpty(): metrics.append([]) continue # train the model model = LinearRegressionWithSGD.train(training, iterations=1000, step=0.00000001, intercept=True) test_features = test.map(lambda x: x.features) predictions = model.predict(test_features) test_preds = test.map(lambda x: x.label).zip(predictions) # grab percent error total_percent = test_preds.map(map_percent_error) total_percent = total_percent.filter(lambda x: x) # check to make sure not empty rdd if total_percent.isEmpty(): metrics.append([]) continue average_percent = total_percent.reduce(lambda x, y: (x[0] + y[0], x[1] + y[1])) average_percent = average_percent[0] / average_percent[1]
from pyspark.mllib.regression import LinearRegressionWithSGD as lrSGD

spark = (
    SparkSession.builder
    .appName("Python Spark regression example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Read the CSV with an inferred schema and discard the '_c0' column.
regressionDataFrame = spark.read.csv(
    'Advertising.csv', header=True, inferSchema=True).drop('_c0')
regressionDataFrame.show(10)


def _row_to_labeled_point(data):
    # Column 3 is the label; columns 0-2 are the features.
    return LabeledPoint(data[3], data[0:3])


regressionDataRDD = regressionDataFrame.rdd.map(list)
regressionDataLabelPoint = regressionDataRDD.map(_row_to_labeled_point)

# 70% of the points for training, 30% for testing.
regressionLabelPointSplit = regressionDataLabelPoint.randomSplit([0.7, 0.3])
regressionLabelPointTrainData = regressionLabelPointSplit[0]
regressionLabelPointTestData = regressionLabelPointSplit[1]

ourModelWithLinearRegression = lrSGD.train(
    data=regressionLabelPointTrainData,
    iterations=200,
    step=0.02,
    intercept=True,
)
print(parsedData.take(3))


# In[58]:

# Divide rawData into Training, Validation and Test
weights = [.8, .1, .1]
seed = 50
parsedTrainData, parsedValData, parsedTestData = parsedData.randomSplit(weights, seed)


# In[64]:

# Fit the model with default values
fitModel = LinearRegressionWithSGD.train(parsedTrainData)
print(fitModel)


# In[65]:

# Prediction
testPoint = parsedTrainData.take(1)[0]
print(testPoint.label)
testPrediction = fitModel.predict(testPoint.features)
# Print the prediction computed above; the original printed
# `samplePrediction`, which this cell never assigns.
print(testPrediction)
def run_prdct(pr_values, sc):
    """Train an SGD linear regression on the housing CSV and score one record.

    Columns 0, 1, 2 and 8 are one-hot encoded as categories, columns 5-7 are
    min-max normalised numbers, and the last column is the label.

    Args:
        pr_values: iterable holding one raw record in the training column layout.
        sc: SparkContext used to read the training file.

    Returns:
        ``[prediction, rmsle, mae]``, each rounded to two decimals.  Both error
        figures are measured on the training data itself.
    """
    values = list(pr_values)

    # Configure
    train_path = '/Users/xiaoru_zhu/PycharmProjects/HousingPriceDA/Dataset/train.csv'

    # Initialize RDD: drop the header line, parse CSV rows, and persist them.
    rdd_lines = sc.textFile(train_path)
    head = rdd_lines.first()
    rdd_lines = rdd_lines.filter(lambda ln: ln != head) \
        .mapPartitions(lambda x: csv.reader(x)) \
        .persist(StorageLevel(True, True, False, False, 1))  # MEMORY_AND_DISK
    # data_num = rdd_lines.count()

    # Prepare for normalization: per-column minimum and (max - min) for columns 5..7
    sub = []
    minimum = []
    for index in range(5, 8):
        column = rdd_lines.map(lambda attr, col=index: attr[col])
        high = float(column.max(key=float))
        low = float(column.min(key=float))
        minimum.append(low)
        sub.append(high - low)

    # Normalization: (val - min)/(max - min), so numeric features fall in [0, 1]
    def normalization(line):
        for offset, col in enumerate((5, 6, 7)):
            line[col] = (float(line[col]) - minimum[offset]) / sub[offset]
        return line

    rdd_lines = rdd_lines.map(normalization)
    values = normalization(values)
    # print(rdd_lines.first())  # test after normalization

    # extract the distinct values of one category column and map each to an index
    def be_mapped(rdd_arg, column):
        distinct_values = rdd_arg.map(lambda attr: attr[column]).distinct()
        # result : {'BATH BEACH': 0, 'BAY RIDGE': 1, 'BEDFORD STUYVESANT': 2, ...}
        return distinct_values.zipWithIndex().collectAsMap()

    mappings = [be_mapped(rdd_lines, i) for i in [0, 1, 2, 8]]  # one dict per category column
    print('category feature mapping dict:', mappings)
    cat_len = sum(len(mapping) for mapping in mappings)  # number of category features
    num_len = len(rdd_lines.first()[5:8])  # number of numeric features, index = 5,6,7
    total_len = num_len + cat_len  # total number of features
    '''
    >>> TEST
    print('category feature number: %d' % cat_len)
    print('number feature number: %d' % num_len)
    print('total feature number::%d' % total_len)
    '''

    # Create feature vectors for linear regression
    def extract_features(line):
        cat_vec = np.zeros(cat_len)  # one-hot slots for all category columns, initially 0
        step = 0  # offset of the current column's segment inside cat_vec
        for dict_cate, raw_feature in zip(mappings, (line[0], line[1], line[2], line[8])):
            cat_vec[dict_cate[raw_feature] + step] = 1
            step = step + len(dict_cate)  # jump to the next column's segment
        num_vec = np.array([float(raw_feature) for raw_feature in line[5:8]])
        return np.concatenate((cat_vec, num_vec))  # category slots followed by numeric values

    def extract_label(line):
        return float(line[-1])

    # Error analysis
    def squared_error(actual, prdct):  # term of the Mean Squared Error
        return (prdct - actual) ** 2

    def abs_error(actual, prdct):  # term of the Mean Absolute Error
        return np.abs(prdct - actual)

    def squared_log_error(prdct, actual):  # term of the Root Mean Squared Log Error
        return (np.log(prdct + 1) - np.log(actual + 1)) ** 2

    # Adjust argument
    # there is no TEST dataset, using train data as test data!
    def evaluate(train_set, iterations, step, reg_param, reg_type, intercept):
        # create linear model using stochastic gradient descent
        model = LinearRegressionWithSGD.train(train_set, iterations, step, regParam=reg_param,
                                              regType=reg_type, intercept=intercept)
        # rdd of (actual_value, predicted_value) pairs
        tlabel_tprediction = train_set.map(lambda point: (point.label, model.predict(point.features)))
        # calculate Root Mean Squared Log Error
        rmsle = np.sqrt(tlabel_tprediction.map(lambda tp: squared_log_error(tp[0], tp[1])).mean())
        return rmsle

    # Generate the final labelled feature vectors
    data = rdd_lines.map(lambda line: LabeledPoint(extract_label(line), extract_features(line)))
    #first_point = data.first()
    values_vec = extract_features(values)

    # create linear model and test
    linear_model = LinearRegressionWithSGD.train(data, iterations=200, step=0.05, intercept=False)
    true_vs_predicted = data.map(lambda point: (point.label, linear_model.predict(point.features)))
    print('The first five prediction values: ' + str(true_vs_predicted.take(5)))

    # test
    rst = linear_model.predict(values_vec)

    # error analysis
    m_s_e = true_vs_predicted.map(lambda tp: squared_error(tp[0], tp[1])).mean()
    m_a_e = true_vs_predicted.map(lambda tp: abs_error(tp[0], tp[1])).mean()
    r_m_s_l_e = np.sqrt(true_vs_predicted.map(lambda tp: squared_log_error(tp[0], tp[1])).mean())
    # print('Linear Model - Mean Squared Error: %2.4f' % m_s_e)
    print('Linear Model - Mean Absolute Error: %2.4f' % m_a_e)
    print('Linear Model - Root Mean Squared Log Error: %2.4f' % r_m_s_l_e)
    '''
    # adjust 'iterations' argument
    args_it = [1, 5, 10, 20, 50, 100, 200]
    error_it = [evaluate(data, arg, 0.01, 0.0, 'l2', False) for arg in args_it]
    for i in range(len(args_it)):
        print('the r_m_s_l_e:%f when iteration = %f' % (error_it[i], args_it[i]))
    # adjust 'step' argument
    args_stp = [0.01, 0.025, 0.05, 0.1, 0.3, 0.5, 1.0]
    error_stp = [evaluate(data, 10, arg, 0.0, 'l2', False) for arg in args_stp]
    for i in range(len(args_stp)):
        print('the r_m_s_l_e:%f when step = %f' % (error_stp[i], args_stp[i]))
    '''
    rst_lst = [round(rst, 2), round(r_m_s_l_e, 2), round(m_a_e, 2)]
    print(rst_lst)
    return rst_lst
from pyspark.mllib.evaluation import RegressionMetrics


# Load and parse the data
def parsePoint(line):
    """Parse one comma/space separated line into a LabeledPoint.

    The first value is the label; the remaining values are the features.
    """
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(values[0], values[1:])


data = sc.textFile("/home/master/ejemplos-python/lpsa.data")
parsedData = data.map(parsePoint)

# Split the data into two sets, training and test.
# The seed is fixed here so that the result can be reproduced.
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3], seed=100)

# Build the model
model = LinearRegressionWithSGD.train(trainingData)

# Evaluate the model on the test set
# --- Point 1 ---
Preds = testData.map(lambda p: (float(model.predict(p.features)), p.label))
# Index into the (prediction, label) pair: a tuple parameter
# (`lambda (v, p): ...`) is a SyntaxError on Python 3.
MSE = Preds.map(lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y) / Preds.count()
print("Mean Squared Error = " + str(MSE))
print("\n")

# --- Point 2 ---
# More about the model: evaluate the regression analysis
# Instantiate the metrics object
metrics = RegressionMetrics(Preds)

# Squared Error
print("MSE = %s" % metrics.meanSquaredError)
print("RMSE = %s" % metrics.rootMeanSquaredError)

# R-squared
print("R-squared = %s" % metrics.r2)
def textParser():
    """Read 'abalone.txt' and return a list of LabeledPoint objects.

    Each line is tab-separated.  The last column is the label and columns 1
    up to (not including) the last are the features; column 0 is not used.
    """
    datas = []
    # `with` closes the file even if a line fails to parse.
    with open('abalone.txt') as handle:
        for line in handle:
            tmp = line.strip().split('\t')
            datas.append(LabeledPoint(tmp[-1], tmp[1:-1]))
    return datas


if __name__ == '__main__':
    sc = SparkContext()
    datas = sc.parallelize(textParser())
    # Fetch the first point once instead of collecting the whole RDD three times.
    first_point = datas.first()
    print(first_point)
    model = LinearRegressionWithSGD.train(datas, step=2, iterations=100, intercept=True, regType='l2')
    print('**' * 50)
    print(model.weights)
    print(model.intercept)
    print('**' * 50)
    # Mean squared difference between the model's predictions and the training labels
    prevals = datas.map(lambda p: (p.label, model.predict(p.features)))
    # Index into the pair instead of using a Python-2-only tuple parameter.
    MSE = prevals.map(lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y) / prevals.count()
    # Single concatenated strings print the same text on Python 2 and 3.
    print(u'方差: ' + str(MSE))
    print(u'测试数据值为: ' + str(first_point))
    print(u'模型预期数据: ' + str(model.predict(array(first_point.features))))
    sc.stop()
#Section 7.4.6
from pyspark.mllib.feature import StandardScaler

trainLabel = housingTrain.map(lambda point: point.label)
trainFeatures = housingTrain.map(lambda point: point.features)
validLabel = housingValid.map(lambda point: point.label)
validFeatures = housingValid.map(lambda point: point.features)

# The scaler (withMean=True, withStd=True) is fitted on the training features only.
scaler = StandardScaler(True, True).fit(housingTrain.map(lambda point: point.features))


def _scale_points(labels, features):
    # Pair every label with its scaled feature vector and rebuild LabeledPoints.
    scaled = scaler.transform(features)
    return labels.zip(scaled).map(lambda pair: LabeledPoint(pair[0], pair[1]))


trainScaled = _scale_points(trainLabel, trainFeatures)
validScaled = _scale_points(validLabel, validFeatures)

#Section 7.5
from pyspark.mllib.regression import LinearRegressionWithSGD

alg = LinearRegressionWithSGD()
trainScaled.cache()
validScaled.cache()
model = alg.train(trainScaled, iterations=200, intercept=True)

#Section 7.5.1
# (prediction, label) pairs on the validation set
validPredicts = validScaled.map(
    lambda point: (float(model.predict(point.features)), point.label))
validPredicts.collect()

import math

squaredErrors = validPredicts.map(lambda p: pow(p[0] - p[1], 2))
RMSE = math.sqrt(squaredErrors.mean())

#Section 7.5.2
from pyspark.mllib.evaluation import RegressionMetrics

validMetrics = RegressionMetrics(validPredicts)
validMetrics.rootMeanSquaredError
Statistics.corr(rdd1,rdd2,method)计算两个RDD的相关矩阵,method同上 Statistics.chiSqTest(rdd)计算由LabeledPoint对象组成的RDD中每个特征与标签的皮尔森独立性测试, 返回一个ChiSqTestResult对象,其中有p值,测试统计,每个特征的自由度.特征和标签必须是分类的,即离散值 """ # 11.5.3分类与回归 """ 分类和回归都会使用MLlib中的LabeledPoint类(在mllin.regression包中) 一个 LabeledPoint 其实就是由一个 label( label 总是一个 Double 值, 不过可以为分类算法设为离散整数)和一个 features 向量组成 """ # 线性回归 from pyspark.mllib.regression import LabeledPoint from pyspark.mllib.regression import LinearRegressionWithSGD points = sc.parallelize(LabeledPoint([1, 2, 3], 1)) # 创建LabeledPoint组成的RDD model = LinearRegressionWithSGD.train(points, iterations=200, intercept=True) print model.weights, model.intercept # 逻辑回归 # 支持向量机 # 朴素贝叶斯 # 决策树与随机森林 # 11.5.4聚类 # KMeans # 11.5.5协同过滤与推荐 # 11.5.6降维 # 1.主成分分析 # 2.奇异值分解 # 11.5.7模型评估 # 11.6 from pyspark.mllib.clustering import KMeans
def taxi_regression(sc, filename):
    '''
    Fit an SGD linear regression of fare_amount on bucketed trip coordinates
    and print RMSE / explained variance.

    Args:
        sc: The Spark Context
        filename: CSV file (with a header row) of taxi trips; the code reads
            its pickup_/dropoff_ latitude/longitude and fare_amount columns
    '''
    sqlContext = SQLContext(sc)
    # Work on a 0.1% sample (without replacement) of the file.
    df = sqlContext.read.load(filename,
                              format='com.databricks.spark.csv',
                              header='true',
                              inferSchema='true').sample(False, 0.001)
    # Keep only trips whose pickup and dropoff fall inside the bounding box.
    df = df.filter((df.pickup_longitude < -73.75) & (df.pickup_longitude > -74.05) &
                   (df.dropoff_longitude < -73.75) & (df.dropoff_longitude > -74.05))
    df = df.filter((df.pickup_latitude < 40.9) & (df.pickup_latitude > 40.6) &
                   (df.dropoff_latitude < 40.9) & (df.dropoff_latitude > 40.6))

    # Discretize each coordinate into 100 quantile buckets, one column at a
    # time, each fit running on the output of the previous one.
    coordinate_cols = ["pickup_latitude", "pickup_longitude",
                       "dropoff_latitude", "dropoff_longitude"]
    bucket_cols = [name + "_bucket" for name in coordinate_cols]
    result = df
    for name, bucket in zip(coordinate_cols, bucket_cols):
        discretizer = QuantileDiscretizer(numBuckets=100, inputCol=name, outputCol=bucket)
        result = discretizer.fit(result).transform(result)

    vecAssembler3 = VectorAssembler(inputCols=bucket_cols, outputCol="features")
    transformed = vecAssembler3.transform(result)
    # cluster_df = transformed.select("pickup_latitude","pickup_longitude","predction_pickup")
    # cluster_df.write.format("com.databricks.spark.csv").option("header", "true").save("file.csv")
    transformed = transformed.select("features", "fare_amount")
    labeled_rdd = transformed.rdd.map(lambda x: get_labeled_point(x))
    # Prints every labelled row on the driver.
    for row in labeled_rdd.collect():
        print(row)

    training_data, test_data = labeled_rdd.randomSplit([0.8, 0.2])
    model = LinearRegressionWithSGD.train(training_data, iterations=100, step=0.2)

    # (prediction, label) pairs for the training and the test split.
    valuesAndPredsTraining = training_data.map(
        lambda p: (float(model.predict(p.features)), p.label))
    valuesAndPreds = test_data.map(lambda p: (float(model.predict(p.features)), p.label))
    trainingMetrics = RegressionMetrics(valuesAndPredsTraining)
    metrics = RegressionMetrics(valuesAndPreds)
    print("RMSE = ", metrics.rootMeanSquaredError, " Explained Variance = ",
          metrics.explainedVariance, " RMSE Training = ",
          trainingMetrics.rootMeanSquaredError)
try:
    from StringIO import StringIO  # Python 2
except ImportError:
    from io import StringIO  # Python 3

from pyspark import SparkConf, SparkContext
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD


# Load and parse the data
def parsePoint(line):
    """Parse one ';'-separated line into a LabeledPoint.

    Every field is cast to float.  The last field is the label (quality) and
    the preceding fields are the features.
    """
    # next(...) works on both Python 2 and 3; the iterator's .next() method
    # exists only on Python 2.
    values = next(csv.reader(StringIO(line), delimiter=";"))  # CSV parsing of line
    values = [float(x) for x in values]  # Cast to all floats
    return LabeledPoint(values[-1], values[:-1])  # y = quality, X = row[:-1]


if __name__ == '__main__':
    conf = SparkConf().setMaster("local[*]").setAppName("Wine Regression")
    sc = SparkContext(conf=conf)

    wines = sc.textFile("winequality-red.csv")
    parsedData = wines.map(parsePoint)

    # Build the model
    model = LinearRegressionWithSGD.train(parsedData)

    # Evaluate the model on training data
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    # reduce() already returns the summed squared error as a number; the
    # original called .count() on that number, which raises AttributeError.
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).reduce(
        lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
housingTrain, housingValid = sets[0], sets[1]

#Section 7.4.6
from pyspark.mllib.feature import StandardScaler

trainLabel = housingTrain.map(lambda point: point.label)
trainFeatures = housingTrain.map(lambda point: point.features)
validLabel = housingValid.map(lambda point: point.label)
validFeatures = housingValid.map(lambda point: point.features)

# The scaler (withMean=True, withStd=True) is fitted on the training features only.
scaler = StandardScaler(True, True).fit(housingTrain.map(lambda point: point.features))


def _scale_points(labels, features):
    # Pair every label with its scaled feature vector and rebuild LabeledPoints.
    scaled = scaler.transform(features)
    return labels.zip(scaled).map(lambda pair: LabeledPoint(pair[0], pair[1]))


trainScaled = _scale_points(trainLabel, trainFeatures)
validScaled = _scale_points(validLabel, validFeatures)

#Section 7.5
from pyspark.mllib.regression import LinearRegressionWithSGD

alg = LinearRegressionWithSGD()
trainScaled.cache()
validScaled.cache()
model = alg.train(trainScaled, iterations=200, intercept=True)

#Section 7.5.1
# (prediction, label) pairs on the validation set
validPredicts = validScaled.map(
    lambda point: (float(model.predict(point.features)), point.label))
validPredicts.collect()

import math

squaredErrors = validPredicts.map(lambda p: pow(p[0]-p[1],2))
RMSE = math.sqrt(squaredErrors.mean())

#Section 7.5.2
from pyspark.mllib.evaluation import RegressionMetrics

validMetrics = RegressionMetrics(validPredicts)
validMetrics.rootMeanSquaredError
validMetrics.meanSquaredError
""" import sys from pyspark.mllib.regression import LinearRegressionWithSGD from spark_application import create_spark_application from data_loader import DataLoader from reader import read_districts_file # Get file paths from arguments if len(sys.argv) != 4: print "Usage: linear_regression.py FEATURES_FILE MODEL_FOLDER DISTRICTS_FILE" sys.exit() features_file, model_folder, districts_file = sys.argv[1:] spark_context, sql_context = create_spark_application( "train_linear_regression") data_loader = DataLoader(spark_context, sql_context, features_file) data_loader.initialize() # train and store a model for each district in the districts file for lat, lon in read_districts_file(districts_file): print("Training District: %f, %f" % (lat, lon)) model = LinearRegressionWithSGD.train(data_loader.get_train_data( (lat, lon)), iterations=1000, step=1e-1) # save the model in the specified model_folder model.save(spark_context, '%s/model_%s_%s' % (model_folder, str(lat), str(lon)))
def printMetrics(model):
    """Score `model` on the `test` RDD and write five regression metrics to `f`."""
    scored = test.map(lambda lr: (float(model.predict(lr.features)), lr.label))
    metrics = RegressionMetrics(scored)
    # (output template, RegressionMetrics attribute), written in this order.
    report = (
        ('Explained Variance:{0}\n', 'explainedVariance'),
        ('Mean Absolute Error:{0}\n', 'meanAbsoluteError'),
        ('Mean Squared Error:{0}\n', 'meanSquaredError'),
        ('Root Mean Squared Error:{0}\n', 'rootMeanSquaredError'),
        ('R^2 :{0}\n', 'r2'),
    )
    for template, attribute in report:
        f.write(template.format(getattr(metrics, attribute)))


# Train and report one model per hyperparameter set, timing each run.
for j in range(numModels):
    settings = paramGrid[j]
    regp = settings['regParam']
    iters = settings['iterations']
    regt = settings['regType']
    timestart = datetime.datetime.now()
    f.write('Model{0}: regParam = {1}, iterations = {2}, regType = {3}\n'.format(str(j), regp, iters, regt))

    # Train linear regression model with hyperparameter set
    model = LinearRegressionWithSGD.train(
        training,
        iterations=iters,
        step=1.0,
        miniBatchFraction=1.0,
        initialWeights=None,
        regParam=regp,
        regType=regt,
        intercept=False,
        validateData=True,
    )
    printMetrics(model)

    timeend = datetime.datetime.now()
    timedelta = round((timeend - timestart).total_seconds(), 2)
    f.write("Time taken to execute this model is: " + str(timedelta) + " seconds.\n")

f.close()
sc.stop()
# In[77]: from pyspark.mllib.regression import LinearRegressionWithSGD # Values to use when training the linear regression model numIters = 500 # iterations alpha = 1.0 # step miniBatchFrac = 1.0 # miniBatchFraction reg = 1e-1 # regParam regType = 'l2' # regType useIntercept = True # intercept # In[79]: # TODO: Replace <FILL IN> with appropriate code firstModel = LinearRegressionWithSGD.train(parsedTrainData, numIters, alpha, miniBatchFrac, None, reg, regType, useIntercept) # weightsLR1 stores the model weights; interceptLR1 stores the model intercept weightsLR1 = firstModel.weights interceptLR1 = firstModel.intercept print weightsLR1, interceptLR1 # In[80]: # TEST LinearRegressionWithSGD (4a) expectedIntercept = 13.3335907631 expectedWeights = [ 16.682292427, 14.7439059559, -0.0935105608897, 6.22080088829, 4.01454261926, -3.30214858535, 11.0403027232, 2.67190962854, 7.18925791279, 4.46093254586, 8.14950409475, 2.75135810882
# Values to use when training the linear regression model
numIters = 500  # iterations
alpha = 1.0  # step
miniBatchFrac = 1.0  # miniBatchFraction
reg = 1e-1  # regParam
regType = 'l2'  # regType
useIntercept = True  # intercept


# In[62]:

# TODO: Replace <FILL IN> with appropriate code
firstModel = LinearRegressionWithSGD.train(parsedTrainData,
                                           iterations=numIters,
                                           step=alpha,
                                           miniBatchFraction=miniBatchFrac,
                                           initialWeights=None,
                                           regParam=reg,
                                           regType=regType,
                                           intercept=useIntercept)

# weightsLR1 stores the model weights; interceptLR1 stores the model intercept
weightsLR1 = firstModel.weights
interceptLR1 = firstModel.intercept
# One formatted string prints the same text on Python 2 and Python 3.
print("%s %s" % (weightsLR1, interceptLR1))


# In[63]:

# TEST LinearRegressionWithSGD (4a)
expectedIntercept = 13.3335907631
def test_regression(self):
    """Smoke-test the MLlib regressors on a tiny, sign-separable dataset.

    Every trained model must predict a non-positive value for the points
    labelled -1.0 and a strictly positive value for the points labelled 1.0.
    The three SGD-based trainers must additionally accept an explicit
    ``initialWeights`` array without raising ``ValueError``.
    """
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD)
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def check_signs(model):
        # Points are visited in dataset order; the expected sign of each
        # prediction follows the sign of that point's label.
        for point, vector in zip(data, features):
            prediction = model.predict(vector)
            if point.label > 0:
                self.assertTrue(prediction > 0)
            else:
                self.assertTrue(prediction <= 0)

    # SGD-based linear models, trained with default settings.
    sgd_trainers = (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD)
    for trainer in sgd_trainers:
        check_signs(trainer.train(rdd))

    # Tree-based models.
    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    check_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
    check_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100, seed=1))
    check_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))

    # Supplying initial weights must not be rejected with a ValueError.
    try:
        for trainer in sgd_trainers:
            trainer.train(rdd, initialWeights=array([1.0, 1.0]))
    except ValueError:
        self.fail()
return LabeledPoint(values[7], values[0:11]) #data_file = sc.textFile("/home/faiz89/Desktop/Eastman/2008.csv") data_file = sc.textFile("../2008_small.csv") header = data_file.first () raw_data = data_file.filter (lambda x:x != header) #examples = MLUtils.loadLibSVMFile(sc, "2008.csv").collect() parsedData = raw_data.map(parsePoint) (trainingData, testData) = parsedData.randomSplit([0.7, 0.3]) startTime = datetime.now() # Build the model trainingData.cache () model = LinearRegressionWithSGD.train(trainingData, iterations=1) print ('Training Time consumed = '), (datetime.now() - startTime) startTestTime = datetime.now() testData.cache() # Evaluating the model on training data valuesAndPreds = testData.map(lambda p: (p.label, model.predict(p.features))) MSE = valuesAndPreds \ .map(lambda (v, p): (v - p)**2) \ .reduce(lambda x, y: x + y) / valuesAndPreds.count() print ('Testing Time consumed = '), (datetime.now() - startTestTime) print ('Total Time: '), (datetime.now() - startTime) print("Mean Squared Error = " + str(MSE)) # Save and load model model.save(sc, "LinearRegressionNarrow2008_cache_both_train_and_test") sameModel = LinearRegressionModel.load(sc, "LinearRegressionNarrow2008_cache_both_train_and_test")
# $example on$
# Load and parse the data
def parsePoint(line):
    """Parse one ``label idx:value idx:value ...`` line into a LabeledPoint.

    The first whitespace-separated token is the label; each remaining token
    is split on ':' and its second field is taken as the feature value, in
    the order the tokens appear (the index field itself is not used).

    Values are parsed with ``float`` rather than ``int`` so real-valued
    labels and features (e.g. ``3:0.5``) are accepted; integer-valued input
    should give the same result, assuming the point and vector types store
    floats - confirm.
    """
    values = line.split()
    label = float(values[0])
    feature_values = [float(token.split(':')[1]) for token in values[1:]]
    return LabeledPoint(label, DenseVector(feature_values))


data = sc.textFile(
    "/Users/hugomathien/Documents/workspace/footballdata/learning_vector/learningVector8.txt"
)
parsedData = data.map(parsePoint)

# Build the model
model = LinearRegressionWithSGD.train(parsedData, iterations=1000000, step=0.0000000000001)

# Get predictions as (prediction, label) pairs
valuesAndPreds = parsedData.map(
    lambda p: (float(model.predict(p.features)), p.label))

# Instantiate metrics object
metrics = RegressionMetrics(valuesAndPreds)

# Squared Error
print("MSE = %s" % metrics.meanSquaredError)
print("RMSE = %s" % metrics.rootMeanSquaredError)

# R-squared
print("R-squared = %s" % metrics.r2)