def test_regression(self):
    """Every MLlib regressor must predict on the same side of zero as the label."""
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD,
        LassoWithSGD,
        RidgeRegressionWithSGD,
    )
    from pyspark.mllib.tree import (
        DecisionTree,
        RandomForest,
        GradientBoostedTrees,
    )
    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2]),
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def _check_sign_agreement(model):
        # Labels alternate negative/positive; each prediction must fall
        # on the same side of zero as its label.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    _check_sign_agreement(LinearRegressionWithSGD.train(rdd, iterations=10))
    _check_sign_agreement(LassoWithSGD.train(rdd, iterations=10))
    _check_sign_agreement(RidgeRegressionWithSGD.train(rdd, iterations=10))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    _check_sign_agreement(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4))
    _check_sign_agreement(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo,
        numTrees=10, maxBins=4, seed=1))
    _check_sign_agreement(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo,
        numIterations=4))

    # Explicit initial weights must be accepted without raising.
    try:
        LinearRegressionWithSGD.train(
            rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        LassoWithSGD.train(
            rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        RidgeRegressionWithSGD.train(
            rdd, initialWeights=array([1.0, 1.0]), iterations=10)
    except ValueError:
        self.fail()
def iterateRidge(iterNums, stepSizes, regParam, train, valid):
    """Grid-search ridge regression over iteration counts and step sizes,
    printing train and validation RMSE for every combination."""
    from pyspark.mllib.regression import RidgeRegressionWithSGD
    import math

    def _rmse(model, points):
        # Root mean squared error of ``model`` over an RDD of LabeledPoints.
        pairs = points.map(lambda x: (model.predict(x.features), x.label))
        return math.sqrt(pairs.map(lambda p: pow(p[0] - p[1], 2)).mean())

    for numIter in iterNums:
        for step in stepSizes:
            alg = RidgeRegressionWithSGD()
            model = alg.train(train, intercept=True, regParam=regParam,
                              iterations=numIter, step=step)
            print("%d, %5.3f -> %.4f, %.4f"
                  % (numIter, step, _rmse(model, train), _rmse(model, valid)))
def test_regression(self):
    """Linear, lasso and ridge SGD models must predict the label's sign."""
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD,
        LassoWithSGD,
        RidgeRegressionWithSGD,
    )
    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2]),
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def _assert_signs(model):
        # Prediction must land on the same side of zero as the label.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    _assert_signs(LinearRegressionWithSGD.train(rdd))
    _assert_signs(LassoWithSGD.train(rdd))
    _assert_signs(RidgeRegressionWithSGD.train(rdd))
def test_regression(self):
    """SGD regressors fed scipy sparse features must predict the label's sign."""
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD,
        LassoWithSGD,
        RidgeRegressionWithSGD,
    )
    data = [
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0})),
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    def _assert_signs(model):
        # Prediction must land on the same side of zero as the label.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    _assert_signs(LinearRegressionWithSGD.train(rdd))
    _assert_signs(LassoWithSGD.train(rdd))
    _assert_signs(RidgeRegressionWithSGD.train(rdd))
def iterateRidge(iterNums, stepSizes, regParam, train, valid):
    """Sweep ridge-regression hyper-parameters and print, for each
    (iterations, step) pair, the training and validation RMSE."""
    from pyspark.mllib.regression import RidgeRegressionWithSGD
    import math
    for n_iter in iterNums:
        for step_size in stepSizes:
            estimator = RidgeRegressionWithSGD()
            fitted = estimator.train(train, intercept=True, regParam=regParam,
                                     iterations=n_iter, step=step_size)
            # (prediction, label) pairs for both splits.
            train_pairs = train.map(
                lambda x: (fitted.predict(x.features), x.label))
            valid_pairs = valid.map(
                lambda x: (fitted.predict(x.features), x.label))
            train_rmse = math.sqrt(
                train_pairs.map(lambda p: (p[0] - p[1]) ** 2).mean())
            valid_rmse = math.sqrt(
                valid_pairs.map(lambda p: (p[0] - p[1]) ** 2).mean())
            print("%d, %5.3f -> %.4f, %.4f"
                  % (n_iter, step_size, train_rmse, valid_rmse))
def RidgeRegressionModel(dataPath, label, normalize, character, master, ispca):
    """Train a ridge-regression model on a delimited text file and return a
    human-readable summary string (weights plus mean absolute error).

    dataPath:  path to the input text file, one sample per line.
    label:     0 if the label is the LAST column (the row is reversed so the
               label comes first), otherwise the label is the first column.
    normalize: 1 to normalize the collected data via ``norm`` first.
    character: field delimiter.
    master:    Spark master URL.
    ispca:     1 to project features to 2 components with PCA first.

    NOTE(review): ``norm``, ``lbp``, ``PCA`` and ``rr`` are defined elsewhere
    in this project — presumably LabeledPoint/RidgeRegressionWithSGD helpers.
    """
    pca_n = 2
    sc = SparkContext(master)
    data = sc.textFile(dataPath)  # not RDD data
    # BUG FIX: in Python 3 ``map`` returns a lazy iterator, but downstream
    # code indexes/slices the row; build a real list of floats instead.
    # (Also converts the Python-2-only ``print`` statements below.)
    ndata = data.map(lambda line: line.split(character)) \
                .map(lambda part: [float(x) for x in part])
    if label == 0:
        # Label is in the last column; reverse so it comes first.
        ndata = ndata.map(lambda line: line[::-1])
    if normalize == 1:
        test_data = norm(ndata.collect())
        norm_data = sc.parallelize(test_data)
        train_data = norm_data.map(lambda part: lbp(part[0], part[1]))
    else:
        test_data = ndata.map(
            lambda part: (part[0], part[1:len(part) - 1])).collect()
        train_data = ndata.map(
            lambda part: lbp(part[0], part[1:len(part) - 1]))
    if ispca == 1:
        # Project features onto the top pca_n principal components.
        pca = PCA(n_components=pca_n)
        pca_train = [test_data[i][1] for i in range(len(test_data))]
        pca_data = pca.fit(pca_train).transform(pca_train)
        test = []
        for i in range(len(pca_data)):
            test.append([test_data[i][0], pca_data[i]])
        train_data = sc.parallelize(test).map(
            lambda part: lbp(part[0], part[1]))
        test_data = test
    model_rr = rr.train(train_data)
    # Mean absolute error over the (training) evaluation set.
    err_rr = 0.0
    size = len(train_data.collect())
    for i in range(size):
        err_rr = err_rr + abs(model_rr.predict(test_data[i][1]) - test_data[i][0])
    print("result:", err_rr / size)
    summary = "Ridge Regression Result:\n"
    summary = summary + str(model_rr.weights) + '\n'
    summary = summary + "Error: " + str(err_rr / size)
    sc.stop()
    return summary
def test_regression(self):
    """SGD and tree regressors must predict on the label's side of zero."""
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD,
        LassoWithSGD,
        RidgeRegressionWithSGD,
    )
    from pyspark.mllib.tree import (
        DecisionTree,
        RandomForest,
        GradientBoostedTrees,
    )
    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2]),
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def _assert_signs(model):
        # Predictions must match the sign pattern of the labels above.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    _assert_signs(LinearRegressionWithSGD.train(rdd))
    _assert_signs(LassoWithSGD.train(rdd))
    _assert_signs(RidgeRegressionWithSGD.train(rdd))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    _assert_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
    _assert_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100))
    _assert_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
def linearRegression_f(mode):
    """Train a linear model on the module-level ``parsedData`` RDD with the
    requested regularization and return the training-set MSE.

    mode: "no_reg" (plain SGD), "L1_reg" (lasso) or "L2_reg" (ridge).
    Raises ValueError for any other mode (previously the code only printed
    "ERROR Mode" and then crashed with UnboundLocalError).
    """
    if mode == "no_reg":
        model = LinearRegressionWithSGD.train(parsedData)
    elif mode == "L1_reg":
        model = LassoWithSGD.train(parsedData)
    elif mode == "L2_reg":
        model = RidgeRegressionWithSGD.train(parsedData)
    else:
        raise ValueError("ERROR Mode: %r" % (mode,))
    # Evaluate the model on training data: (label, prediction) pairs.
    valuesAndPreds = parsedData.map(
        lambda p: (p.label, model.predict(p.features)))
    # BUG FIX: ``lambda (v, p): ...`` is Python-2-only tuple unpacking
    # (removed by PEP 3113); index the pair instead.
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2) \
                        .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    return MSE
def test_regression(self):
    """Regressors trained on scipy sparse features must match label signs."""
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD,
        LassoWithSGD,
        RidgeRegressionWithSGD,
    )
    from pyspark.mllib.tree import DecisionTree
    data = [
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0})),
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    def _assert_signs(model):
        # Prediction must land on the same side of zero as the label.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    _assert_signs(LinearRegressionWithSGD.train(rdd))
    _assert_signs(LassoWithSGD.train(rdd))
    _assert_signs(RidgeRegressionWithSGD.train(rdd))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    _assert_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
def exercise_2(self):
    """Train linear, ridge and lasso SGD regressors on the Carat
    context-factor dataset and print each model plus the three
    test-set MSEs.

    BUG FIXES: converts Python-2-only ``print`` statements to the print
    function and replaces the tuple-unpacking lambda (removed by PEP 3113).
    """
    sc = self.spark_context
    file = sc.textFile("./carat-context-factors-percom.csv")
    # Column indices in the semicolon-separated file.
    (energyRate, batteryHealth, batteryTemperature, batteryVoltage,
     cpuUsage, distanceTraveled, mobileDataActivity, mobileDataStatus,
     mobileNetworkType, networkType, roamingEnabled, screenBrightness,
     wifiLinkSpeed, wifiSignalStrength) = [i for i in range(0, 14)]
    data = file.map(lambda line: line.split(";")).map(
        lambda line: (float(line[energyRate]), line[batteryHealth],
                      float(line[batteryTemperature]), float(line[batteryVoltage]),
                      float(line[cpuUsage]), float(line[distanceTraveled]),
                      line[mobileDataActivity], line[mobileDataStatus],
                      line[mobileNetworkType], line[networkType],
                      float(line[roamingEnabled]), float(line[screenBrightness]),
                      float(line[wifiLinkSpeed]), float(line[wifiSignalStrength])))
    # Keep only physically plausible rows.
    data = data.filter(lambda x: (
        (x[screenBrightness] == -1 or
         (x[screenBrightness] >= 0 and x[screenBrightness] <= 255)) and
        (x[cpuUsage] >= 0 and x[cpuUsage] <= 1) and
        (x[distanceTraveled] >= 0) and
        (x[wifiSignalStrength] > -100 and x[wifiSignalStrength] < 0) and
        (x[batteryTemperature] >= 0)))
    data = data.map(lambda x: LabeledPoint(
        x[energyRate],
        [x[cpuUsage], x[screenBrightness],
         x[wifiSignalStrength], x[batteryTemperature]]))
    train, test = data.randomSplit([4, 1])
    lr = LinearRegressionWithSGD.train(train, iterations=100, step=1e-4,
                                       intercept=False)
    print(lr)
    rr = RidgeRegressionWithSGD.train(train, iterations=100, step=1e-4,
                                      intercept=False)
    print(rr)
    l = LassoWithSGD.train(train, iterations=100, step=1e-4, intercept=False)
    print(l)
    # (label, lr-pred, rr-pred, lasso-pred) per test sample.
    valuesAndPreds = test.map(lambda p: (p.label, lr.predict(p.features),
                                         rr.predict(p.features),
                                         l.predict(p.features)))
    count = valuesAndPreds.count()
    # One MSE per model, accumulated componentwise.
    MSE = valuesAndPreds.map(
        lambda t: ((t[0] - t[1]) ** 2 / count,
                   (t[0] - t[2]) ** 2 / count,
                   (t[0] - t[3]) ** 2 / count)) \
        .reduce(lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2]))
    print(MSE)
    return None
def main():
    """Train a ridge-regression model on the parsed records and print the
    first few (label, prediction) pairs plus the standard metrics.

    BUG FIX: the Python-2-only ``print`` statement is converted to the
    print function.
    """
    records = get_records()
    records.cache()  # reused for mapping construction and feature extraction
    # One categorical mapping per categorical column (2..9).
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len
    data = records.map(lambda r: LabeledPoint(
        extract_label(r), extract_features(r, cat_len, mappings)))
    rr_model = RidgeRegressionWithSGD.train(data, iterations=10, step=0.1,
                                            intercept=False)
    true_vs_predicted_rr = data.map(
        lambda p: (p.label, rr_model.predict(p.features)))
    print("Ridge Regression Model predictions: "
          + str(true_vs_predicted_rr.take(5)))
    calculate_print_metrics("Ridge Regression", true_vs_predicted_rr)
def test_regression(self):
    """Each regressor's prediction must share the sign of its label."""
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD,
        LassoWithSGD,
        RidgeRegressionWithSGD,
    )
    from pyspark.mllib.tree import DecisionTree
    data = [
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0})),
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    def _check(model):
        # Sign of each prediction must agree with the label.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    _check(LinearRegressionWithSGD.train(rdd))
    _check(LassoWithSGD.train(rdd))
    _check(RidgeRegressionWithSGD.train(rdd))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    _check(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
print(training_reg.take(5))


def evaluate_model_reg(test, model):
    """Return the mean squared error of ``model`` over the ``test`` RDD.

    BUG FIX: ``lambda (v, p): ...`` is Python-2-only tuple unpacking
    (removed by PEP 3113); the pair is indexed instead.
    """
    valuesAndPreds = test.map(lambda p: (p.label, model.predict(p.features)))
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2) \
                        .reduce(lambda x, y: x + y) / valuesAndPreds.count()
    return (MSE)


### LinearRegression with SGD
model_lreg_sgd_l2 = evaluate_model_reg(
    test_reg, LinearRegressionWithSGD.train(training_reg, iterations=1000,
                                            step=0.0001, regType="l2"))
model_lreg_sgd_l1 = evaluate_model_reg(
    test_reg, LinearRegressionWithSGD.train(training_reg, iterations=1000,
                                            step=0.0001, regType="l1"))
model_lreg_sgd_l0 = evaluate_model_reg(
    test_reg, LinearRegressionWithSGD.train(training_reg, iterations=1000,
                                            step=0.0001, regType=None))
### RidgeRegression
model_ridge = evaluate_model_reg(
    test_reg, RidgeRegressionWithSGD.train(training_reg, iterations=1000,
                                           step=0.0001))
### Lasso
model_lasso = evaluate_model_reg(
    test_reg, LassoWithSGD.train(training_reg, iterations=1000, step=0.0001))

#################### OUTPUTS #################################
print("Testing Error :" + "model_svm_l2 = " + str(model_svm_l2))
print("Testing Error :" + "model_svm_l1 = " + str(model_svm_l1))
print("Testing Error :" + "model_svm_l0 = " + str(model_svm_l0))
print("Testing Error :" + "model_log_lbfgs_l2 = " + str(model_log_lbfgs_l2))
print("Testing Error :" + "model_log_lbfgs_l1 = " + str(model_log_lbfgs_l1))
print("Testing Error :" + "model_log_lbfgs_l0 = " + str(model_log_lbfgs_l0))
print("Testing Error :" + "model_log_sgd_l2 = " + str(model_log_sgd_l2))
print("Testing Error :" + "model_log_sgd_l1 = " + str(model_log_sgd_l1))
def performRidgeRegression(training):
    """Fit and return a ridge-regression model (100 SGD iterations,
    step size 0.001) on the ``training`` RDD."""
    return RidgeRegressionWithSGD.train(training, iterations=100, step=0.001)
# Load and parse the data.
def parsePoint(line):
    """Parse a comma- or space-separated row: column 6 is the label,
    columns 0-5 the features.

    BUG FIX: ``np.float`` was deprecated in numpy 1.20 and removed in
    1.24; the builtin ``float`` is the documented replacement.
    """
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(values[6], values[0:6])


data = sc.textFile("/user/cloudera/hw1/train_nohead.csv")
wholedata = sc.textFile("/user/cloudera/hw1/wholedata.csv")
parsedData = data.map(parsePoint)
parsedWholeData = wholedata.map(parsePoint)

# Build the model.
model = RidgeRegressionWithSGD.train(parsedData, iterations=100, step=0.1,
                                     regParam=0.01)

# Evaluate the model: RMSE over the whole dataset.
valuesAndPreds = parsedWholeData.map(
    lambda p: (p.label, model.predict(p.features)))
# BUG FIX: ``lambda (v, p): ...`` is Python-2-only tuple unpacking
# (removed by PEP 3113); index the (label, prediction) pair instead.
RMSE = np.sqrt(
    valuesAndPreds
    .map(lambda vp: (vp[0] - vp[1]) ** 2)
    .reduce(lambda x, y: x + y) / valuesAndPreds.count()
)
print("ridge regression output : \n")
print("RMSE = {0}\n".format(RMSE))
# Save and load model
# NOTE(review): this chunk begins mid-statement — the opening
# `valuesAndPreds = testData.map(lambda p:` lies outside this view.
(p.label, model_least.predict(p.features)))
# Test-set RMSE for the least-squares model.
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
    lambda x, y: x + y) / valuesAndPreds.count()
test_cnt = valuesAndPreds.count()
least_RMSE_test = math.sqrt(MSE)
# Training-set RMSE for the least-squares model.
valuesAndPreds = trainData.map(lambda p:
                               (p.label, model_least.predict(p.features)))
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
    lambda x, y: x + y) / valuesAndPreds.count()
train_cnt = valuesAndPreds.count()
least_RMSE_train = math.sqrt(MSE)

# Ridge Regression
model_ridge = RidgeRegressionWithSGD.train(trainData, regParam=0.01,
                                           intercept=True)
# Test-set RMSE for the ridge model.
valuesAndPreds = testData.map(lambda p:
                              (p.label, model_ridge.predict(p.features)))
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
    lambda x, y: x + y) / valuesAndPreds.count()
test_cnt = valuesAndPreds.count()
ridge_RMSE_test = math.sqrt(MSE)
# Training-set RMSE for the ridge model.
valuesAndPreds = trainData.map(lambda p:
                               (p.label, model_ridge.predict(p.features)))
MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(
    lambda x, y: x + y) / valuesAndPreds.count()
train_cnt = valuesAndPreds.count()
ridge_RMSE_train = math.sqrt(MSE)
# Single prediction from a sparse feature vector.
print(sameModel.predict(SparseVector(2, {0: 100.0, 1: 150})))
# Batch prediction over an RDD returns an RDD of predicted values.
test_set = []
for i in range(100):
    for j in range(100):
        test_set.append(SparseVector(2, {0: i, 1: j}))
print(sameModel.predict(sc.parallelize(test_set)).collect())
print(sameModel.weights)  # fitted coefficients

# ----------------- Ridge regression ------------------
from pyspark.mllib.regression import RidgeRegressionWithSGD

data = [
    LabeledPoint(1.0, [1.0, 1.0]),
    LabeledPoint(4.0, [1.0, 3.0]),
    LabeledPoint(8.0, [2.0, 3.0]),
    LabeledPoint(10.0, [3.0, 4.0]),
]
train_set = sc.parallelize(data)
rrm = RidgeRegressionWithSGD.train(train_set, iterations=100,
                                   initialWeights=np.array([1.0, 1.0]))
# Predict on a 100x100 grid of dense feature vectors.
test_set = []
for i in range(100):
    for j in range(100):
        test_set.append(np.array([i, j]))
print(rrm.predict(sc.parallelize(test_set)).collect())
print(rrm.weights)  # fitted coefficients
def modelSelection(argv):
    """Pick, train and persist an MLlib model based on CLI arguments.

    argv layout: [script, dataset-path, supervised|unsupervised,
    classification|regression|clustering, target-parameter, extra params...].
    Writes results to ``results.txt`` and returns the trained model, or
    None on usage/argument errors.

    BUG FIX: ``GuassianMixture`` was a misspelling of ``GaussianMixture``
    and raised NameError whenever the gaussian branch was taken.
    """
    if len(argv) < 5:
        print("The arguments for this script require:\n" +
              "(hdfs or file):///path/to/filename of the dataset\n" +
              "supervised/unsupervised\n" +
              "classifier/regression/clustering\n" +
              "parameter trying to be guessed\n" +
              "other parameters\n")
    else:
        args = argv[1:]
        # Sets up the RDD.
        dataset = sc.textFile(args[0])
        params = args[3:]
        if args[0][-3:] == "csv":
            dataset = csvFilterAndMap(dataset, params)
        elif args[0][-4:] == "json":
            dataset = jsonFilterAndMap(dataset, params)
        else:
            print("This program only supports .csv and .json files")
            return
        # Model selection algorithm. Currently goes off of scikit learn's
        # cheat sheet.
        if args[1] == "supervised":
            labels = dataset.map(lambda x: x[0])
            values = dataset.map(lambda x: x[1:])
            zipped_data = labels.zip(values).map(
                lambda x: LabeledPoint(x[0], x[1:])).cache()
            datasetTraining, datasetTest = zipped_data.randomSplit([.8, .2])
            if args[2] == "classification":
                theModel = NaiveBayes.train(datasetTraining)
                test_preds = (datasetTest.map(lambda x: x.label)
                              .zip(theModel.predict(
                                  datasetTest.map(lambda x: x.features))))
                predictions = theModel.predict(
                    datasetTest.map(lambda x: x.features))
                test_metrics = MulticlassMetrics(
                    test_preds.map(lambda x: (x[0], float(x[1]))))
                testing_accuracy = test_metrics.precision()
                with open('results.txt', 'w+') as f:
                    f.write("accuracy: " + str(testing_accuracy) + "\n")
                    f.write("confusion matrix:\n" +
                            str(test_metrics.confusionMatrix().toArray()))
                return theModel
            elif args[2] == "regression":
                # Probe a sample to decide which regression family fits.
                sample = zipped_data.sample(False, .3)
                model = performRegression(sample, params)
                if model == "lasso":
                    theModel = LassoWithSGD.train(datasetTraining,
                                                  iterations=1000, step=0.001)
                elif model == "linear":
                    theModel = LinearRegressionWithSGD.train(
                        datasetTraining, iterations=1000, step=0.001)
                else:
                    theModel = RidgeRegressionWithSGD.train(
                        datasetTraining, iterations=1000, step=0.001)
                test = (datasetTest.map(lambda x: x.label)
                        .zip(theModel.predict(
                            datasetTest.map(lambda x: x.features))))
                metrics = RegressionMetrics(
                    test.map(lambda x: (x[0], float(x[1]))))
                value = metrics.rootMeanSquaredError
                with open('results.txt', 'w+') as f:
                    f.write(model + " root mean squared error: ")
                    f.write(str(value))
                return theModel
            else:
                print("Please use rather classification or regression for supervised learning")
                return
        elif args[1] == "unsupervised":
            sample = dataset.sample(False, .3)
            with open('datapoints.txt', 'w+') as f:
                f.write("dataset: " + str(dataset.take(10)))
                f.write('\n\n')
            if args[2] == "clustering":
                model = performClustering(sample, params)
                if model[0] == "gaussian":
                    theModel = GaussianMixture.train(dataset, model[1])
                else:
                    theModel = KMeans.train(dataset, model[1])
                with open('results.txt', 'w+') as f:
                    f.write(str(model))
                return theModel
            else:
                print("Currently this model selection algorithm only supports clustering for unsupervised algorithms")
                return
from pyspark.mllib.regression import LabeledPoint, RidgeRegressionWithSGD
from numpy import array
from pyspark import SparkContext
from pyspark.mllib.classification import LogisticRegressionWithLBFGS


# Load and parse the data.
def parsePoint(line):
    """Parse a semicolon-separated wine-quality row: column 11 is the
    label, columns 0-9 the features."""
    values = [float(x) for x in line.split(";")]
    return LabeledPoint(values[11], values[0:10])


sc = SparkContext("local", "Simple App")
data = sc.textFile("../winequality.csv")
parsedData = data.map(parsePoint)

# Build the model.
model = RidgeRegressionWithSGD.train(parsedData)

# Evaluating the model on training data.
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
# BUG FIX: ``lambda (v, p): ...`` is Python-2-only tuple unpacking
# (removed by PEP 3113); index the (label, prediction) pair instead.
# NOTE(review): exact float inequality is an odd error metric for a
# regressor — a squared-error measure is probably intended; confirm.
trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() \
    / float(parsedData.count())
print("Training Error = " + str(trainErr))
import sys
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import RidgeRegressionWithSGD, RidgeRegressionModel
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="PythonWordCount")

data = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum.txt')
traindata = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/train_para.txt')
data_720 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_720.txt')
data_540 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_540.txt')
data_360 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_360.txt')

model = RidgeRegressionWithSGD.train(traindata)


def _label_pred_pairs(dataset):
    """Zip each sample's true label with the model's prediction."""
    preds = model.predict(dataset.map(lambda x: x.features))
    return dataset.map(lambda lp: lp.label).zip(preds)


def _mse(pairs, count):
    """Mean squared error over an RDD of (label, prediction) pairs.

    BUG FIX: ``lambda (v, p): ...`` is Python-2-only tuple unpacking
    (removed by PEP 3113); the pair is indexed instead.
    """
    return pairs.map(
        lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(count)


labelsandpredictions = _label_pred_pairs(data)
MSE = _mse(labelsandpredictions, data.count())
print("training MSE = " + str(MSE))
labelsandpredictions.saveAsTextFile("/usr/hadoop/hf_Ridge")

labelsandpredictions_720 = _label_pred_pairs(data_720)
MSE_720 = _mse(labelsandpredictions_720, data_720.count())
print("training MSE_720 = " + str(MSE_720))
labelsandpredictions_720.saveAsTextFile("/usr/hadoop/hf_720_Ridge")

labelsandpredictions_540 = _label_pred_pairs(data_540)
MSE_540 = _mse(labelsandpredictions_540, data_540.count())
print("training MSE_540 = " + str(MSE_540))
labelsandpredictions_540.saveAsTextFile("/usr/hadoop/hf_540_Ridge")

predictions_360 = model.predict(data_360.map(lambda x: x.features))
import sys
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import RidgeRegressionWithSGD, RidgeRegressionModel
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="PythonWordCount")

data = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim.txt')
traindata = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/train_ssim.txt')
data_720 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_720.txt')
data_540 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_540.txt')
data_360 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/ssim_360.txt')

# Train on the training split, then score every resolution variant.
model = RidgeRegressionWithSGD.train(traindata)

predictions = model.predict(data.map(lambda x: x.features))
labelsandpredictions = data.map(lambda lp: lp.label).zip(predictions)
# BUG FIX: ``lambda (v, p): ...`` is Python-2-only tuple unpacking
# (removed by PEP 3113); index the (label, prediction) pair instead.
MSE = labelsandpredictions.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(data.count())
print("training MSE = " + str(MSE))
labelsandpredictions.saveAsTextFile("/usr/hadoop/ssim_Ridge")

predictions_720 = model.predict(data_720.map(lambda x: x.features))
labelsandpredictions_720 = data_720.map(lambda lp: lp.label).zip(
    predictions_720)
MSE_720 = labelsandpredictions_720.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(
        data_720.count())
print("training MSE_720 = " + str(MSE_720))
labelsandpredictions_720.saveAsTextFile("/usr/hadoop/ssim_720_Ridge")

predictions_540 = model.predict(data_540.map(lambda x: x.features))
labelsandpredictions_540 = data_540.map(lambda lp: lp.label).zip(
    predictions_540)
def test_regression(self):
    """All regressors must match label signs; GBT must honor maxBins."""
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD,
        LassoWithSGD,
        RidgeRegressionWithSGD,
    )
    from pyspark.mllib.tree import (
        DecisionTree,
        RandomForest,
        GradientBoostedTrees,
    )
    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2]),
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def _check(model):
        # Each prediction must fall on the same side of zero as its label.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    _check(LinearRegressionWithSGD.train(rdd, iterations=10))
    _check(LassoWithSGD.train(rdd, iterations=10))
    _check(RidgeRegressionWithSGD.train(rdd, iterations=10))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    _check(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4))
    _check(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo,
        numTrees=10, maxBins=4, seed=1))
    _check(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo,
        numIterations=4))

    # Explicit initial weights must be accepted without raising.
    try:
        LinearRegressionWithSGD.train(
            rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        LassoWithSGD.train(
            rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        RidgeRegressionWithSGD.train(
            rdd, initialWeights=array([1.0, 1.0]), iterations=10)
    except ValueError:
        self.fail()

    # Verify that maxBins is being passed through.
    GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo,
        numIterations=4, maxBins=32)
    with self.assertRaises(Exception) as cm:
        GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo,
            numIterations=4, maxBins=1)
y_predict = []
if not con.HDF:
    sc = SparkContext(appName="VolEstimation")
    # Post-process raw input with pandas before Spark ingestion.
    postProcess(con.DATA_PATH + con.FILE_NM, vol=7)
    xy_test_points = pd.read_csv(
        os.path.abspath(os.curdir) + '/data/' + con.TEST_FN)
    # BUG FIX: DataFrame.ix was removed from pandas (1.0); these are
    # positional slices, so .iloc is the documented replacement.
    x_test_points = xy_test_points.iloc[:, 1:11].values.tolist()  # omit y output col
    y_test_points = xy_test_points.iloc[:, 0].values.tolist()
    xy_train_points = sc.textFile(
        os.path.abspath(os.curdir) + '/data/' + con.TRAIN_FN).map(parsePoint)
    model = RidgeRegressionWithSGD.train(xy_train_points, iterations=5000)
    for x in x_test_points:
        y_predict.append(model.predict(x))
    r2_knn = r2_score(y_test_points, y_predict)
    # BUG FIX: Python-2-only ``print`` statements converted to the
    # print function.
    print("Final Out of Sample R^2 of Regression" + str(r2_knn))
    print("Final weights: " + str(model.weights))
    print("Final intercept: " + str(model.intercept))
    # Kill Spark context gracefully.
    sc.stop()