def main():
    records = get_records()
    records.cache()
    # extract all the categorical mappings
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    data = records.map(lambda r: LabeledPoint(extract_label(r),
                                              extract_features(r, cat_len, mappings)))
    data_dt = records.map(lambda r: LabeledPoint(extract_label(r),
                                                 extract_features_dt(r)))
    dt_model = DecisionTree.trainRegressor(data_dt, {})
    preds = dt_model.predict(data_dt.map(lambda p: p.features))
    actual = data.map(lambda p: p.label)
    true_vs_predicted_dt = actual.zip(preds)
    # train on log-transformed targets, then map predictions back with exp
    data_dt_log = data_dt.map(lambda lp: LabeledPoint(np.log(lp.label), lp.features))
    dt_model_log = DecisionTree.trainRegressor(data_dt_log, {})
    preds_log = dt_model_log.predict(data_dt_log.map(lambda p: p.features))
    actual_log = data_dt_log.map(lambda p: p.label)
    true_vs_predicted_dt_log = actual_log.zip(preds_log).map(
        lambda (t, p): (np.exp(t), np.exp(p)))
    calculate_print_metrics("Decision Tree Log", true_vs_predicted_dt_log)
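# The helpers above (get_mapping, extract_label, extract_features, extract_features_dt)
# are not defined in this snippet. A minimal sketch, assuming a bike-sharing-style
# CSV where columns 2-9 are categorical and 11-14 are numeric (mirroring the slices
# the snippet itself uses); all names and column indices here are assumptions.
import numpy as np

def get_mapping(rdd, idx):
    # map each distinct category value in column idx to an integer index
    return rdd.map(lambda fields: fields[idx]).distinct().zipWithIndex().collectAsMap()

def extract_label(record):
    # the target is assumed to be the last field
    return float(record[-1])

def extract_features_dt(record):
    # decision trees can consume the raw values directly (slice assumed)
    return np.array([float(f) for f in record[2:15]])

def extract_features(record, cat_len, mappings):
    # binary-encode the categorical columns, then append the numeric ones
    cat_vec = np.zeros(cat_len)
    step = 0
    for i, field in enumerate(record[2:10]):
        m = mappings[i]
        cat_vec[m[field] + step] = 1
        step += len(m)
    num_vec = np.array([float(f) for f in record[11:15]])
    return np.concatenate((cat_vec, num_vec))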
def decisionTreeRegression(trainingData, testData, trainingSize, testSize):
    '''decision tree for regression'''
    # parameter range
    maxDepthValList = [10, 20, 30]
    maxBinsValList = [16, 24, 32]
    # best parameters
    bestMaxDepthVal = 5
    bestMaxBinsVal = 16
    bestTrainingRMSE = 1e10
    for maxDepthVal, maxBinsVal in itertools.product(maxDepthValList, maxBinsValList):
        model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                            impurity='variance', maxDepth=maxDepthVal,
                                            maxBins=maxBinsVal)
        predictions = model.predict(trainingData.map(lambda x: x.features))
        ValsAndPreds = trainingData.map(lambda x: x.label).zip(predictions)
        trainingRMSE = math.sqrt(ValsAndPreds.map(lambda (v, p): (v - p)**2)
                                 .reduce(lambda x, y: x + y) / trainingSize)
        if trainingRMSE and trainingRMSE < bestTrainingRMSE:
            bestMaxDepthVal = maxDepthVal
            bestMaxBinsVal = maxBinsVal
            bestTrainingRMSE = trainingRMSE
        print maxDepthVal, maxBinsVal, trainingRMSE
    print bestMaxDepthVal, bestMaxBinsVal, bestTrainingRMSE
    # retrain with the best parameters
    model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                        impurity='variance', maxDepth=bestMaxDepthVal,
                                        maxBins=bestMaxBinsVal)
    # evaluating the model on training data
    predictions = model.predict(trainingData.map(lambda x: x.features))
    ValsAndPreds = trainingData.map(lambda x: x.label).zip(predictions)
    trainingRMSE = math.sqrt(ValsAndPreds.map(lambda (v, p): (v - p)**2)
                             .reduce(lambda x, y: x + y) / trainingSize)
    print trainingRMSE
    # evaluating the model on test data
    predictions = model.predict(testData.map(lambda x: x.features))
    ValsAndPreds = testData.map(lambda x: x.label).zip(predictions)
    testRMSE = math.sqrt(ValsAndPreds.map(lambda (v, p): (v - p)**2)
                         .reduce(lambda x, y: x + y) / testSize)
    print testRMSE
def main():
    records = get_records()
    records.cache()
    # extract all the categorical mappings
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len
    print "Feature vector length for categorical features: %d" % cat_len
    print "Feature vector length for numerical features: %d" % num_len
    print "Total feature vector length: %d" % total_len
    data = records.map(lambda r: LabeledPoint(extract_label(r),
                                              extract_features(r, cat_len, mappings)))
    data_dt = records.map(lambda r: LabeledPoint(extract_label(r),
                                                 extract_features_dt(r)))
    first_point_dt = data_dt.first()
    print "Decision Tree feature vector: " + str(first_point_dt.features)
    print "Decision Tree feature vector length: " + str(len(first_point_dt.features))
    dt_model = DecisionTree.trainRegressor(data_dt, {})
    preds = dt_model.predict(data_dt.map(lambda p: p.features))
    actual = data.map(lambda p: p.label)
    true_vs_predicted_dt = actual.zip(preds)
    print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
    print "Decision Tree depth: " + str(dt_model.depth())
    print "Decision Tree number of nodes: " + str(dt_model.numNodes())
    calculate_print_metrics("Decision Tree", true_vs_predicted_dt)
def trainEvaluateModel(trainData, validationData, impurityParm, maxDepthParm, maxBinsParm):
    '''
    Trains a model with one parameter combination. The DecisionTree parameters
    impurity, maxDepth and maxBins all affect both accuracy and training time,
    so we chart parameter values against RMSE and run time. We evaluate one
    parameter at a time, e.g. maxDepth over the values [3, 5, 10, 15, 20, 25]:
    (1) call DecisionTree.trainRegressor with trainData and one value of the parameter;
    (2) once the model is built, evaluate its RMSE on validationData;
    (3) repeat training and evaluation for each value, collecting the RMSE and
        run time of every combination into metricsRDD;
    (4) when all runs are done, convert metricsRDD into a pandas DataFrame;
    (5) plot RMSE and run time from the DataFrame to show how accuracy and
        execution time vary with the parameter.
    :param trainData:
    :param validationData:
    :param impurityParm:
    :param maxDepthParm:
    :param maxBinsParm:
    :return:
    '''
    print('======================= train and evaluate model =======================')
    startTime = time()
    model = DecisionTree.trainRegressor(trainData, categoricalFeaturesInfo={},
                                        impurity=impurityParm, maxDepth=maxDepthParm,
                                        maxBins=maxBinsParm)
    RMSE = evaluateModel(model, validationData)
    duration = time() - startTime
    print('========== [trainEvaluateModel] >>>> trained and evaluated with parameters: impurity=' +
          str(impurityParm) + ', maxDepth=' + str(maxDepthParm) + ', maxBins=' + str(maxBinsParm) +
          '\n' + '\t\t==>> time taken=' + str(duration) + ', resulting RMSE=' + str(RMSE))
    return (RMSE, duration, impurityParm, maxDepthParm, maxBinsParm, model)
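# evaluateModel is called above but not defined in the snippet. A minimal sketch,
# assuming it returns the root mean squared error of the model over a validation
# RDD of LabeledPoint; the name and metric come from the surrounding code, the
# body is an assumption.
from pyspark.mllib.evaluation import RegressionMetrics

def evaluateModel(model, validationData):
    predictions = model.predict(validationData.map(lambda p: p.features))
    preds_and_labels = predictions.zip(validationData.map(lambda p: p.label))
    return RegressionMetrics(preds_and_labels).rootMeanSquaredError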
def evaluate_dt(train, test, maxDepth, maxBins):
    model = DecisionTree.trainRegressor(train, {}, impurity='variance',
                                        maxDepth=maxDepth, maxBins=maxBins)
    preds = model.predict(test.map(lambda p: p.features))
    actual = test.map(lambda p: p.label)
    tp = actual.zip(preds)
    rmsle = np.sqrt(tp.map(lambda (t, p): squared_log_error(t, p)).mean())
    return rmsle
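# A sketch of how evaluate_dt is typically driven: sweep one tree parameter and
# compare the resulting RMSLE values (squared_log_error is defined further down
# in this file). The RDD names and parameter values are illustrative assumptions.
params = [1, 2, 3, 4, 5, 10, 20]
metrics = [evaluate_dt(train_data, test_data, param, 32) for param in params]
print params
print metrics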
def main():
    records = get_records()
    records.cache()
    print "Mapping of first categorical feature column: %s" % get_mapping(records, 2)
    # extract all the categorical mappings
    mappings = [get_mapping(records, i) for i in range(2, 10)]
    cat_len = sum(map(len, mappings))
    num_len = len(records.first()[11:15])
    total_len = num_len + cat_len
    data = records.map(lambda r: LabeledPoint(extract_label(r),
                                              extract_features(r, cat_len, mappings)))
    data_dt = records.map(lambda r: LabeledPoint(extract_label(r),
                                                 extract_features_dt(r)))
    cat_features = dict([(i - 2, len(get_mapping(records, i)) + 1) for i in range(2, 10)])
    print "Categorical feature size mapping %s" % cat_features
    # train the model again, this time telling it which features are categorical
    dt_model = DecisionTree.trainRegressor(data_dt, categoricalFeaturesInfo=cat_features)
    preds = dt_model.predict(data_dt.map(lambda p: p.features))
    actual = data.map(lambda p: p.label)
    true_vs_predicted_dt = actual.zip(preds)
    calculate_print_metrics("Decision Tree Categorical Features", true_vs_predicted_dt)
def evaluate_final(description, data, maxDepth, maxBins):
    data_with_idx = data.zipWithIndex().map(lambda (k, v): (v, k))
    # a 0.13 sample fraction was yielding roughly a 10% test split; 63 is the seed
    test = data_with_idx.sample(False, 0.13, 63)
    train = data_with_idx.subtractByKey(test)
    train_data = train.map(lambda (idx, p): p)
    # train_size = train_data.count()
    test_data = test.map(lambda (idx, p): p)
    # test_size = test_data.count()
    dt_model = DecisionTree.trainRegressor(train_data, {}, maxDepth=maxDepth, maxBins=maxBins)
    preds = dt_model.predict(test_data.map(lambda p: p.features))
    actual = test_data.map(lambda p: p.label)
    true_vs_predicted_dt = actual.zip(preds)
    print '\r\n-------- ' + description + ' ---------'
    print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
    print "Decision Tree depth: " + str(dt_model.depth())
    print "Decision Tree number of nodes: " + str(dt_model.numNodes())
    mse_dt = true_vs_predicted_dt.map(lambda (t, p): squared_error(t, p)).mean()
    mae_dt = true_vs_predicted_dt.map(lambda (t, p): abs_error(t, p)).mean()
    rmsle_dt = np.sqrt(true_vs_predicted_dt.map(lambda (t, p): squared_log_error(t, p)).mean())
    print 'Decision Tree - Mean Squared Error: {0:2.4f}'.format(mse_dt)
    print 'Decision Tree - Root Mean Squared Error: {0:2.4f}'.format(np.sqrt(mse_dt))
    print 'Decision Tree - Mean Absolute Error: {0:2.4f}'.format(mae_dt)
    print 'Decision Tree - Root Mean Squared Log Error: {0:2.4f}'.format(rmsle_dt)
def trainEvaluateModel(trainData, validationData, impurityParm, maxDepthParm, maxBinsParm):
    model = DecisionTree.trainRegressor(trainData, categoricalFeaturesInfo={},
                                        impurity=impurityParm, maxDepth=maxDepthParm,
                                        maxBins=maxBinsParm)
    return model
def test_regression(self):
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    lasso_model = LassoWithSGD.train(rdd, iterations=10)
    self.assertTrue(lasso_model.predict(features[0]) <= 0)
    self.assertTrue(lasso_model.predict(features[1]) > 0)
    self.assertTrue(lasso_model.predict(features[2]) <= 0)
    self.assertTrue(lasso_model.predict(features[3]) > 0)

    rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
    self.assertTrue(rr_model.predict(features[0]) <= 0)
    self.assertTrue(rr_model.predict(features[1]) > 0)
    self.assertTrue(rr_model.predict(features[2]) <= 0)
    self.assertTrue(rr_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    dt_model = DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)

    rf_model = RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)

    gbt_model = GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)

    try:
        LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
    except ValueError:
        self.fail()
def test_regression(self):
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LinearRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    lasso_model = LassoWithSGD.train(rdd)
    self.assertTrue(lasso_model.predict(features[0]) <= 0)
    self.assertTrue(lasso_model.predict(features[1]) > 0)
    self.assertTrue(lasso_model.predict(features[2]) <= 0)
    self.assertTrue(lasso_model.predict(features[3]) > 0)

    rr_model = RidgeRegressionWithSGD.train(rdd)
    self.assertTrue(rr_model.predict(features[0]) <= 0)
    self.assertTrue(rr_model.predict(features[1]) > 0)
    self.assertTrue(rr_model.predict(features[2]) <= 0)
    self.assertTrue(rr_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    dt_model = DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)

    rf_model = RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100, seed=1)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)

    gbt_model = GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)

    try:
        LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
        LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
        RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]))
    except ValueError:
        self.fail()
def Regression_Model(filename):
    open_price, close_price, open_price_train, close_price_train, True_price, True_price_train, Date = get_csv_data(filename)
    output = []
    for i in range(1, len(Date)):
        tmp = LabeledPoint(label=True_price_train[i], features=[close_price_train[i]])
        output.append(tmp)
    output_train_RDD = sc.parallelize(output).cache()
    lrm = LinearRegressionWithSGD.train(output_train_RDD, step=0.001, iterations=100000)
    tree = DecisionTree.trainRegressor(output_train_RDD, categoricalFeaturesInfo={},
                                       impurity='variance', maxDepth=5, maxBins=30)
    forest = RandomForest.trainRegressor(output_train_RDD, categoricalFeaturesInfo={},
                                         numTrees=3, featureSubsetStrategy="auto",
                                         impurity='variance', maxDepth=5, maxBins=30)
    gradient = GradientBoostedTrees.trainRegressor(output_train_RDD,
                                                   categoricalFeaturesInfo={},
                                                   numIterations=10)
    print("\n============MODEL Evaluation=============\n")
    model_name = ['LinearRegression', 'DecisionTree', 'RandomForest', 'GradientBoostedTrees']
    es_modelname = ['lrm', 'tree', 'forest', 'gradient']
    result = ''
    x = 0
    err = 1000
    test_model = 'LinearRegression'
    # swap in a different model here to change which one is returned
    output_model_RDD = lrm
    for model in [lrm, tree, forest, gradient]:
        predictions = model.predict(output_train_RDD.map(lambda x: x.features))
        labelsAndPredictions = output_train_RDD.map(lambda lp: lp.label).zip(predictions)
        # note the **0.5: this is the root mean squared error
        RMSE = (labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /
                float(output_train_RDD.count()))**0.5
        # print("Predictions: ", valuesAndPreds.take(10))
        result += model_name[x] + "\tRoot Mean Squared Error\t=" + str(RMSE) + "\n"
        if err > RMSE:
            err = RMSE
            output_model = model
            es_model = es_modelname[x]
        x += 1
    print(result)
    print(es_model)
    return Date, True_price, output_model_RDD, open_price, close_price, es_model
def trainEvaluateModel(trainData, validationData, impurityParam, maxDepthParam, maxBinsParam):
    startTime = time()
    model = DecisionTree.trainRegressor(trainData, categoricalFeaturesInfo={},
                                        impurity=impurityParam, maxDepth=maxDepthParam,
                                        maxBins=maxBinsParam)
    RMSE = evaluateModel(model, validationData)
    duration = time() - startTime
    print("train/evaluate: impurity->", impurityParam, ", maxDepth->", maxDepthParam,
          ", maxBins->", maxBinsParam)
    print("==> time taken:", duration, "s , RMSE=", RMSE)
    return (RMSE, duration, impurityParam, maxDepthParam, maxBinsParam, model)
def evaluate_dt(train, test, maxDepth, maxBins):
    dtModel = DecisionTree.trainRegressor(train, {}, impurity='variance',
                                          maxDepth=maxDepth, maxBins=maxBins)
    preds = dtModel.predict(test.map(lambda p: p.features))
    actual = test.map(lambda p: p.label)
    actual_vs_pred = actual.zip(preds)
    # print actual_vs_pred.take(10)
    # print "decision tree depth: %d" % dtModel.depth()
    # print "decision tree number of nodes: %d" % dtModel.numNodes()
    return actual_pred_error(actual_vs_pred)
def test_regression(self):
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LinearRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    lasso_model = LassoWithSGD.train(rdd)
    self.assertTrue(lasso_model.predict(features[0]) <= 0)
    self.assertTrue(lasso_model.predict(features[1]) > 0)
    self.assertTrue(lasso_model.predict(features[2]) <= 0)
    self.assertTrue(lasso_model.predict(features[3]) > 0)

    rr_model = RidgeRegressionWithSGD.train(rdd)
    self.assertTrue(rr_model.predict(features[0]) <= 0)
    self.assertTrue(rr_model.predict(features[1]) > 0)
    self.assertTrue(rr_model.predict(features[2]) <= 0)
    self.assertTrue(rr_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    dt_model = DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)

    rf_model = RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
    self.assertTrue(rf_model.predict(features[0]) <= 0)
    self.assertTrue(rf_model.predict(features[1]) > 0)
    self.assertTrue(rf_model.predict(features[2]) <= 0)
    self.assertTrue(rf_model.predict(features[3]) > 0)

    gbt_model = GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(gbt_model.predict(features[0]) <= 0)
    self.assertTrue(gbt_model.predict(features[1]) > 0)
    self.assertTrue(gbt_model.predict(features[2]) <= 0)
    self.assertTrue(gbt_model.predict(features[3]) > 0)
def trainEvaluationModel(trainData, validationData, impurityParm, maxDepthParm, maxBinsParm):
    startTime = time()
    model = DecisionTree.trainRegressor(trainData, categoricalFeaturesInfo={},
                                        impurity=impurityParm, maxDepth=maxDepthParm,
                                        maxBins=maxBinsParm)
    RMSE = evaluateModel(model, validationData)
    duration = time() - startTime
    print("train/evaluate: using parameters" + \
          " impurity = " + str(impurityParm) + \
          " maxDepth = " + str(maxDepthParm) + \
          " maxBins = " + str(maxBinsParm) + \
          " ==> time taken = " + str(duration) + " s"\
          " resulting RMSE = %f" % RMSE)
    return RMSE, duration, impurityParm, maxDepthParm, maxBinsParm, model
def trainEvaluateModel(trainData, validationData, impurityParam, maxDepthParam, maxBinsParam):
    starttime = time()
    model = DecisionTree.trainRegressor(data=trainData, categoricalFeaturesInfo={},
                                        impurity=impurityParam, maxDepth=maxDepthParam,
                                        maxBins=maxBinsParam)
    RMSE = evaluateModel(model, validationData)  # root mean squared error
    duration = time() - starttime
    print("train/evaluate with parameters:\n", "impurity=", impurityParam,
          "\n maxDepth=", maxDepthParam, "\n maxBins=", maxBinsParam,
          "====> time taken=", duration, "\n resulting RMSE=", RMSE)
    return (RMSE, duration, impurityParam, maxDepthParam, maxBinsParam, model)
def regression(sc, sample):
    traindata = sc.parallelize(sample)
    traindata = traindata.map(lambda x: LabeledPoint(x[1], x[0]))
    testdata = [8.2]
    #####
    # linear_model = LinearRegressionWithSGD.train(traindata, iterations=10)
    # prediction = linear_model.predict(testdata)
    # print prediction
    #####
    decision_model = DecisionTree.trainRegressor(traindata, {})
    prediction = decision_model.predict(testdata)
    print prediction
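# A sketch of calling the function above: each element of `sample` is expected
# to be a (features, label) pair, since x[0] feeds the feature vector and x[1]
# the label. The data values here are illustrative assumptions.
sample = [([1.0], 2.1), ([4.5], 5.0), ([8.0], 8.4), ([9.3], 9.1)]
regression(sc, sample)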
def TrainEvaluateModel(trainData, validationData, impurityParm, maxDepthParm, maxBinsParm):
    startTime = time()
    model = DecisionTree.trainRegressor(trainData, categoricalFeaturesInfo={},
                                        impurity=impurityParm, maxDepth=maxDepthParm,
                                        maxBins=maxBinsParm)
    RMSE = EvaluateModel(model, validationData)
    duration = time() - startTime
    print("Evaluate the model: use the params: " + \
          "impurity=" + str(impurityParm) + \
          " maxDepthParm=" + str(maxDepthParm) + \
          " maxBinsParm=" + str(maxBinsParm) + "\n" + \
          "====> duration time = " + str(duration) + \
          " result RMSE = " + str(RMSE))
    return (RMSE, duration, impurityParm, maxDepthParm, maxBinsParm, model)
def train_evaluate_model(train_data, valid_data, impurity, max_depth, max_bins):
    start_time = time()
    # train
    model = DecisionTree.trainRegressor(train_data, categoricalFeaturesInfo={},
                                        impurity=impurity, maxDepth=max_depth,
                                        maxBins=max_bins)
    # evaluate: compare y_pred against y_true
    RMSE = evaluate_model(model, valid_data)
    duration = time() - start_time
    print(f"train/evaluate: parameters impurity={impurity}, maxDepth={max_depth},"\
          f"maxBins={max_bins}, ==> time taken={duration}, resulting RMSE = {RMSE}")
    return RMSE, duration, impurity, max_depth, max_bins, model
def test_all(self, measure_columns=None, dimension_columns=None):
    measures = measure_columns[0]
    self._target_column = measures
    # dimension = dimension_columns[0]
    all_dimensions = self._dimension_columns
    all_measures = list(x for x in self._measure_columns if x != measures)
    cat_feature_info = []
    # columns_without_dimension = list(x for x in all_dimensions if x != dimension)
    columns_without_dimension = all_dimensions
    mapping_dict = {}
    masterMappingDict = {}
    decision_tree_result = DecisionTreeResult()
    for column in all_dimensions:
        mapping_dict[column] = dict(enumerate(
            self._data_frame.select(column).distinct().rdd.map(lambda x: str(x[0])).collect()))
    # for c in mapping_dict:
    #     name = c
    #     reverseMap = {v: k for k, v in mapping_dict[c].iteritems()}
    #     udf = UserDefinedFunction(lambda x: reverseMap[x], StringType())
    #     self._data_frame = self._data_frame.select(*[udf(column).alias(name) if column == name else column for column in self._data_frame.columns])
    # converting spark dataframe to pandas for transformation and then back to spark dataframe
    pandasDataFrame = self._data_frame.toPandas()
    for key in mapping_dict:
        pandasDataFrame[key] = pandasDataFrame[key].apply(lambda x: 'None' if x == None else x)
        reverseMap = {v: k for k, v in mapping_dict[key].items()}
        pandasDataFrame[key] = pandasDataFrame[key].apply(lambda x: reverseMap[x])
    # sqlCtx = SQLContext(self._spark)
    self._data_frame = self._spark.createDataFrame(pandasDataFrame)
    self._mapping_dict = mapping_dict
    for c in columns_without_dimension:
        cat_feature_info.append(self._data_frame.select(c).distinct().count())
    if len(cat_feature_info) > 0:
        max_length = max(cat_feature_info)
    else:
        max_length = 32
    cat_feature_info = dict(enumerate(cat_feature_info))
    # dimension_classes = self._data_frame.select(dimension).distinct().count()
    self._data_frame = self._data_frame[[measures] + columns_without_dimension + all_measures]
    data = self._data_frame.rdd.map(lambda x: LabeledPoint(x[0], x[1:]))
    (trainingData, testData) = data.randomSplit([1.0, 0.0])
    # TO DO: set maxBins at least equal to the max level of categories in dimension column
    model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo=cat_feature_info,
                                        impurity='variance', maxDepth=3, maxBins=max_length)
    output_result = model.toDebugString()
    decision_tree = self.tree_json(output_result, self._data_frame)
    self.generate_probabilities(decision_tree, measures)
    decision_tree_result.set_params(self._new_tree, self._new_rules, self._total,
                                    self._success, self._probability)
    return decision_tree_result
def trainEvaluateModel(trainData, validationData, impurityParm, maxDepthParm, maxBinsParm):
    startTime = time()
    model = DecisionTree.trainRegressor(trainData, categoricalFeaturesInfo={},
                                        impurity=impurityParm, maxDepth=maxDepthParm,
                                        maxBins=maxBinsParm)
    RMSE = evaluateModel(model, validationData)
    duration = time() - startTime
    print "train/evaluate: using parameters" + \
          " impurityParm= %s" % impurityParm + \
          " maxDepthParm= %s" % maxDepthParm + \
          " maxBinsParm = %d." % maxBinsParm + \
          " time taken=%d" % duration + \
          " resulting RMSE = %f " % RMSE
    return (RMSE, duration, impurityParm, maxDepthParm, maxBinsParm, model)
def dealData(path):
    rawData = sc.textFile(path + 'hour.csv')
    header = rawData.first()
    rData = rawData.filter(lambda x: x != header)
    lines = rData.map(lambda x: x.split(","))
    labelpointRDD = lines.map(lambda r: LabeledPoint(process_label(r), process_features(r)))
    print(labelpointRDD.first())
    # split into training, validation and test sets
    (trainData, validationData, testData) = labelpointRDD.randomSplit([7, 1, 2])
    print("training set size: " + str(trainData.count()) +
          " validation set size: " + str(validationData.count()) +
          " test set size: " + str(testData.count()))
    # keep the data in memory to speed up subsequent computation
    trainData.persist()
    validationData.persist()
    testData.persist()
    model = DecisionTree.trainRegressor(trainData, categoricalFeaturesInfo={},
                                        impurity="variance", maxDepth=5, maxBins=32,
                                        minInstancesPerNode=1, minInfoGain=0.0)
    rmse = RMSE(model, validationData)
    print("root mean squared error RMSE=" + str(rmse))
    # evaluate the maxDepth parameter
    maxDepthList = [3, 5, 10, 15, 20, 25]
    maxBinsList = [10]
    minInstancesPerNodeList = [1]
    minInfoGainList = [0.0]
    # collect the results into metrics
    metrics = [trainEvaluateModel(trainData, validationData, maxDepth, maxBins,
                                  minInstancesPerNode, minInfoGain)
               for maxDepth in maxDepthList
               for maxBins in maxBinsList
               for minInstancesPerNode in minInstancesPerNodeList
               for minInfoGain in minInfoGainList]
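# The RMSE helper called above is not defined in the snippet. A minimal sketch,
# assuming it computes root mean squared error over a LabeledPoint RDD; the
# signature comes from the call sites, the body is an assumption.
import math

def RMSE(model, data):
    predictions = model.predict(data.map(lambda p: p.features))
    labels_and_preds = data.map(lambda p: p.label).zip(predictions)
    mse = labels_and_preds.map(lambda (v, p): (v - p) ** 2).sum() / float(data.count())
    return math.sqrt(mse)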
def test_regression(self):
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD,
        LassoWithSGD,
        RidgeRegressionWithSGD,
    )
    from pyspark.mllib.tree import DecisionTree
    data = [
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0})),
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    lr_model = LinearRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    lasso_model = LassoWithSGD.train(rdd)
    self.assertTrue(lasso_model.predict(features[0]) <= 0)
    self.assertTrue(lasso_model.predict(features[1]) > 0)
    self.assertTrue(lasso_model.predict(features[2]) <= 0)
    self.assertTrue(lasso_model.predict(features[3]) > 0)

    rr_model = RidgeRegressionWithSGD.train(rdd)
    self.assertTrue(rr_model.predict(features[0]) <= 0)
    self.assertTrue(rr_model.predict(features[1]) > 0)
    self.assertTrue(rr_model.predict(features[2]) <= 0)
    self.assertTrue(rr_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    dt_model = DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
def trainEvaluateModel(trainData, validationData, maxDepthParm, maxBinsParm,
                       minInstancesPerNodeParm, minInfoGainParm):
    startTime = time.time()
    # build and train the model
    model = DecisionTree.trainRegressor(trainData, categoricalFeaturesInfo={},
                                        impurity="variance", maxDepth=maxDepthParm,
                                        maxBins=maxBinsParm,
                                        minInstancesPerNode=minInstancesPerNodeParm,
                                        minInfoGain=minInfoGainParm)
    # compute the RMSE
    rmse = RMSE(model, validationData)
    duration = time.time() - startTime  # elapsed time
    print("train/evaluate: parameters" + ", maxDepth=" + str(maxDepthParm) +
          ", maxBins=" + str(maxBinsParm) +
          ", minInstancesPerNode=" + str(minInstancesPerNodeParm) +
          ", minInfoGainParm=" + str(minInfoGainParm) + "\n"
          "===> time taken=" + str(duration) + ", RMSE=" + str(rmse))
    return rmse, duration, maxDepthParm, maxBinsParm, minInstancesPerNodeParm, minInfoGainParm, model
def train_model():
    data = get_dataset()
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    metrics_combos = []
    bins = [x for x in range(50, 500, 10)]
    depths = [x for x in range(4, 12)]
    for numBin in bins:
        for depth in depths:
            model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                                impurity='variance', maxDepth=depth,
                                                maxBins=numBin)
            predictions = model.predict(testData.map(lambda x: x.features))
            labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
            metrics = RegressionMetrics(labelsAndPredictions)
            metrics_combos.append(((numBin, depth), metrics.meanSquaredError))
    print(sorted(metrics_combos, key=lambda s: s[1]))
def test_regression(self):
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree
    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    lr_model = LinearRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    lasso_model = LassoWithSGD.train(rdd)
    self.assertTrue(lasso_model.predict(features[0]) <= 0)
    self.assertTrue(lasso_model.predict(features[1]) > 0)
    self.assertTrue(lasso_model.predict(features[2]) <= 0)
    self.assertTrue(lasso_model.predict(features[3]) > 0)

    rr_model = RidgeRegressionWithSGD.train(rdd)
    self.assertTrue(rr_model.predict(features[0]) <= 0)
    self.assertTrue(rr_model.predict(features[1]) > 0)
    self.assertTrue(rr_model.predict(features[2]) <= 0)
    self.assertTrue(rr_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    dt_model = \
        DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
def test_regression(self):
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree
    data = [
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    lr_model = LinearRegressionWithSGD.train(rdd)
    self.assertTrue(lr_model.predict(features[0]) <= 0)
    self.assertTrue(lr_model.predict(features[1]) > 0)
    self.assertTrue(lr_model.predict(features[2]) <= 0)
    self.assertTrue(lr_model.predict(features[3]) > 0)

    lasso_model = LassoWithSGD.train(rdd)
    self.assertTrue(lasso_model.predict(features[0]) <= 0)
    self.assertTrue(lasso_model.predict(features[1]) > 0)
    self.assertTrue(lasso_model.predict(features[2]) <= 0)
    self.assertTrue(lasso_model.predict(features[3]) > 0)

    rr_model = RidgeRegressionWithSGD.train(rdd)
    self.assertTrue(rr_model.predict(features[0]) <= 0)
    self.assertTrue(rr_model.predict(features[1]) > 0)
    self.assertTrue(rr_model.predict(features[2]) <= 0)
    self.assertTrue(rr_model.predict(features[3]) > 0)

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    dt_model = DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
    self.assertTrue(dt_model.predict(features[0]) <= 0)
    self.assertTrue(dt_model.predict(features[1]) > 0)
    self.assertTrue(dt_model.predict(features[2]) <= 0)
    self.assertTrue(dt_model.predict(features[3]) > 0)
def getPredictionsLabels(model, test):
    predictions = model.predict(test.map(lambda r: r.features))
    return predictions.zip(test.map(lambda r: r.label))

def printMetrics(predictions_and_labels):
    metrics = RegressionMetrics(predictions_and_labels)
    f.write('Explained Variance:{0}\n'.format(metrics.explainedVariance))
    f.write('Mean Absolute Error:{0}\n'.format(metrics.meanAbsoluteError))
    f.write('Mean Squared Error:{0}\n'.format(metrics.meanSquaredError))
    f.write('Root Mean Squared Error:{0}\n'.format(metrics.rootMeanSquaredError))
    f.write('R^2 :{0}\n'.format(metrics.r2))

for j in range(numModels):
    regp = paramGrid[j]['regParam']
    iters = paramGrid[j]['iterations']
    regt = paramGrid[j]['regType']
    con = paramGrid[j]['convergenceTol']
    # f.write('Model{0}: regParam = {1}, iterations = {2}, regType = {3}, convergenceTol = {4}\n'.format(str(j), regp, iters, regt, con))
    # train a decision tree regression model for this hyperparameter set
    # (note: the tree parameters below are fixed; the paramGrid values configure
    # the commented-out SGD models)
    model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={}, impurity='variance',
                                        maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0)
    predictions_and_labels = getPredictionsLabels(model, test)
    printMetrics(predictions_and_labels)

f.close()
sc.stop()
    # ArrDelay is our response
    # ArrDelay becomes the 8th column now, and total columns in the data = 12
    label = clean_line_split[0]
    nonLable = clean_line_split[1:]
    return LabeledPoint(label, nonLable)

parsedData = raw_data.map(parsePoint)
# divide training and test data by 70-30 rule
(training, test) = parsedData.randomSplit([0.7, 0.3])

# start timer at this point
startTime = datetime.now()
# build the model
# empty categoricalFeaturesInfo indicates all features are continuous
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() /\
    float(test.count())

print ('Time consumed = '), (datetime.now() - startTime)
print ('Test Mean Squared Error = ' + str(testMSE))
print ('Learned regression tree model:')
print (model.toDebugString())

# save and load model
model.save(sc, "DTR-Narrow-2008")
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

sc = SparkContext(appName="PythonWordCount")
data = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum.txt')
traindata = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/train_para.txt')
data_720 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_720.txt')
data_540 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_540.txt')
data_360 = MLUtils.loadLibSVMFile(sc, '/usr/hadoop/para_avg_halfsqrsum_360.txt')

model = DecisionTree.trainRegressor(traindata, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

predictions = model.predict(data.map(lambda x: x.features))
labelsandpredictions = data.map(lambda lp: lp.label).zip(predictions)
MSE = labelsandpredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(data.count())
print("training MSE = " + str(MSE))
# labelsandpredictions.saveAsTextFile("/usr/hadoop/hf_dt")

predictions_720 = model.predict(data_720.map(lambda x: x.features))
labelsandpredictions_720 = data_720.map(lambda lp: lp.label).zip(predictions_720)
MSE_720 = labelsandpredictions_720.map(lambda (v, p): (v - p) * (v - p)).sum() / float(data_720.count())
print("training MSE_720 = " + str(MSE_720))
data_dt = records.map(lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))
first_point_dt = data_dt.first()
first_point_dt.label
first_point_dt.features
len(first_point_dt.features)

from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.tree import DecisionTree

linear_model = LinearRegressionWithSGD.train(data, iterations=10, step=0.1, intercept=False)
true_vs_predicted = data.map(lambda p: (p.label, linear_model.predict(p.features)))
true_vs_predicted.take(5)

dt_model = DecisionTree.trainRegressor(data_dt, {})
preds = dt_model.predict(data_dt.map(lambda p: p.features))
actual = data_dt.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
true_vs_predicted_dt.take(5)
dt_model.depth()
dt_model.numNodes()

def squared_error(actual, pred):
    return (pred - actual) ** 2

def abs_error(actual, pred):
    return np.abs(pred - actual)

def squared_log_error(actual, pred):
    return (np.log(pred + 1) - np.log(actual + 1)) ** 2

true_vs_predicted.map(lambda t: squared_error(t[0], t[1])).mean()
def learn(examples, depth, bin):
    global model
    model = DecisionTree.trainRegressor(examples, categoricalFeaturesInfo={},
                                        impurity='variance', maxDepth=depth, maxBins=bin)
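# A sketch of driving learn(): train on a LabeledPoint RDD, then predict through
# the global `model` it sets. The RDD name and parameter values are assumptions.
learn(training_examples, 5, 32)  # trains and stores the tree in the global `model`
preds = model.predict(training_examples.map(lambda p: p.features))
print preds.take(5)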
def evaluate_dt(train, test, maxDepth, maxBins):
    dt_model = DecisionTree.trainRegressor(train, categoricalFeaturesInfo={0: 4},
                                           impurity='variance', maxDepth=maxDepth,
                                           maxBins=maxBins)
    dt_predictions = dt_model.predict(test.map(lambda x: x.features))
    dt_labelsAndPredictions = test.map(lambda lp: lp.label).zip(dt_predictions)
    # divide by the count of the test split passed in, not an outer testData
    dt_testMSE = dt_labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(test.count())
    return dt_testMSE
# get 90% train and 10% test data
data_with_idx = data_dt.zipWithIndex().map(lambda (k, v): (v, k))
test = data_with_idx.sample(False, 0.1)
train = data_with_idx.subtractByKey(test)
train_data = train.map(lambda (idx, p): p)
test_data = test.map(lambda (idx, p): p)
train_size = train_data.count()
test_size = test_data.count()
print "Training data size: %d" % train_size
print "Test data size: %d" % test_size
print "Total data size: %d " % num_data
print "Train + Test size : %d" % (train_size + test_size)

# make decision tree model
dt_model = DecisionTree.trainRegressor(train_data, {})

# make predictions and measure error
preds = dt_model.predict(test_data.map(lambda p: p.features))
actual = test_data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
print "Decision Tree depth: " + str(dt_model.depth())
print "Decision Tree number of nodes: " + str(dt_model.numNodes())

def squared_error(actual, pred):
    return (pred - actual)**2

def squared_log_error(pred, actual):
    return (np.log(pred + 1) - np.log(actual + 1))**2
# MAGIC %md DecisionTree performs best when it is told which features are categorical. We construct a map categoricalFeaturesInfo to pass this information to DecisionTree.
# MAGIC If DecisionTree is not given this info, then it will treat all features as continuous.

# COMMAND ----------

# Construct a map for categorical features:
#   categoricalFeaturesInfo[column index] = number of categories
categoricalFeaturesInfo = {}
for j in xrange(numFeatures):
    col = featureCols[j]
    if col in categoryIndexes:
        categoricalFeaturesInfo[j] = len(categoryIndexes[col])

# COMMAND ----------

initialModel = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo)
initialModel

# COMMAND ----------

# We can print the full model, but it can be hard to parse when the tree is large.
print initialModel.toDebugString()

# COMMAND ----------

# MAGIC %md We now compute the error of the DecisionTreeModel on the training dataset. We use Root Mean Squared Error (RMSE) as our error metric.
# MAGIC
# MAGIC Denote (y_i, x_i) as the (label, feature vector) for instance i, and write model.predict(x_i) as our model's predicted label for instance i. RMSE is defined as:
# MAGIC
# MAGIC %[ RMSE(dataset) = \left[ \mathbf{avg}_{(y_i, x_i) \in dataset} \left( y_i - model.predict(x_i) \right)^2 \right]^{1/2} ]%
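# COMMAND ----------

# A sketch of the RMSE computation the cell above describes, assuming trainingData
# is an RDD of LabeledPoint; the variable names below are assumptions.
import math

predictions = initialModel.predict(trainingData.map(lambda lp: lp.features))
labelsAndPredictions = trainingData.map(lambda lp: lp.label).zip(predictions)
trainMSE = labelsAndPredictions.map(lambda (y, p): (y - p) ** 2).mean()
trainRMSE = math.sqrt(trainMSE)
print 'Training RMSE = %g' % trainRMSE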
def run_decision_tree(userid):
    conf = SparkConf().setMaster("local[1]").setAppName("heart-disease-prediction-descision-tree")
    sc = SparkContext(conf=conf)
    print "Running Spark Version %s" % (sc.version)
    # https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data
    path = "/home/raju/Documents/hdp_proj"
    heartdf_tr = pd.read_csv(path + "processed.cleveland.data.csv", header=None)
    heartdf_test = pd.read_csv(path + "testdata.csv", header=None)
    print "Original training Dataset (Rows:Columns): "
    print heartdf_tr.shape
    print heartdf_test.shape
    print "Categories of Diagnosis of heart disease (angiographic disease status) that we are predicting"
    print "-- Value 0: < 50% diameter narrowing"
    print "-- Value 1: > 50% diameter narrowing "
    print heartdf_tr.ix[:, 13].unique()    # column containing the diagnosis of heart disease
    print heartdf_test.ix[:, 13].unique()  # column containing the diagnosis of heart disease
    newheartdf = pd.concat([heartdf_tr.ix[:, 13], heartdf_tr.ix[:, 0:12]], axis=1,
                           join_axes=[heartdf_tr.index])
    newheartdf_test = pd.concat([heartdf_test.ix[:, 13], heartdf_test.ix[:, 0:12]], axis=1,
                                join_axes=[heartdf_test.index])
    newheartdf.replace('?', np.nan, inplace=True)       # replace ? values
    newheartdf_test.replace('?', np.nan, inplace=True)  # replace ? values
    print "After dropping rows with any empty value (Rows:Columns): "
    ndf2 = newheartdf.dropna()
    ndf_test = newheartdf_test.dropna()
    ndf2.to_csv(path + "new-heart-disease-cleaveland.txt", sep=",", index=False,
                header=None, na_rep=np.nan)
    ndf_test.to_csv(path + "new-heart-disease-cleaveland-test.txt", sep=",", index=False,
                    header=None, na_rep=np.nan)
    print ndf2.shape
    print ndf_test.shape
    print ndf2.ix[:5, :]
    print ndf_test.ix[:5, :]
    print "Create a Labeled point which is a local vector, associated with a label/response"
    points = sc.textFile(path + 'new-heart-disease-cleaveland.txt')
    points_test = sc.textFile(path + 'new-heart-disease-cleaveland-test.txt')
    parsed_data = points.map(parsePoint)
    parsed_data_test = points_test.map(parsePoint)
    print 'After parsing, number of training lines: %s' % parsed_data.take(5)
    print 'After parsing, number of test data lines: %s' % parsed_data_test.take(5)
    ##### Perform Classification using a Decision Tree #####
    # randomSplit([1, 0]) keeps all rows; the separate test file provides the held-out data
    (trainingData, trainingData1) = parsed_data.randomSplit([1, 0])
    (testData, testData1) = parsed_data_test.randomSplit([1, 0])
    # Train a DecisionTree model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
print "+++++++++++++++++++++++++++++++++ Perform Classification using a Decision Tree +++++++++++++++++++++++++++++++++" model = DecisionTree.trainClassifier(trainingData, numClasses=5, categoricalFeaturesInfo={}, impurity='gini', maxDepth=4, maxBins=32) predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count()) print('Test Error = ' + str(testErr)) print('=================== Learned classification tree model ====================') print(model.toDebugString()) print "+++++++++++++++++++++++++++++++++ Perform Regression using a Decision Tree +++++++++++++++++++++++++++++++++" model1 = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={}, impurity='variance', maxDepth=4, maxBins=32) ####### Evaluate model on test instances and compute test error######## predictions = model1.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions) testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(testData.count()) print('Test Mean Squared Error = ' + str(testMSE)) print('================== Learned regression tree model ====================') print(model1.toDebugString()) print(userid) input_data = get_input_data(userid[-20:-2]) #features = vector.dense(result) prediction_value = model1.predict(input_data) print(prediction_value) post_prediction(userid[-20:-2],prediction_value)
from pyspark.mllib.tree import DecisionTree

def loadRecord(line):
    input = StringIO.StringIO(line)
    reader = csv.reader(input)
    row = map(float, reader.next())
    return LabeledPoint(row[-1], row[:-1])

chf = open('data/CAhousing.csv', 'r')
header = chf.next().rstrip("\n").split(",")
for i, j in enumerate(header):
    print "%d: %s" % (i, j)

chrdd = sc.parallelize(chf).map(lambda line: loadRecord(line))
chrdd.persist()

(trainingData, testData) = chrdd.randomSplit([0.7, 0.3])
model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    impurity='variance', minInstancesPerNode=2500)

predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(testData.count())

with open("trunk.txt", "w") as f:
    f.write('Test Mean Squared Error = ' + str(testMSE) + '\n')
    f.write('Learned regression tree model:\n')
    f.write(model.toDebugString())
data = records.map(lambda r: LabeledPoint(extract_label(r), extract_features(r)))
first_point = data.first()
print("Raw data :")
print(records.first()[2:])
print("Label")
print(first_point.label)
print("decision tree model feature vector :")
print(first_point.features)
print("decision tree model feature vector length :" + str(len(first_point.features)))

#
dt_model = DecisionTree.trainRegressor(data, {})
preds = dt_model.predict(data.map(lambda d: d.features))
actual = data.map(lambda d: d.label)
true_vs_predicted = actual.zip(preds)
print("decision tree prediction :" + str(true_vs_predicted.take(5)))
print("decision tree depth :" + str(dt_model.depth()))
print("decision tree number of nodes :" + str(dt_model.numNodes()))

#
def squared_error(actual, prediction):
    return (actual - prediction)**2

def abs_error(actual, prediction):
    return np.abs(actual - prediction)
from sklearn.cross_validation import LeaveOneOut
from sklearn.cross_validation import KFold

# K-fold cross-validation
if __name__ == "__main__":
    sc = SparkContext('local', appName="Prediction")
    import fileinput
    data_y1, data_y2 = [], []
    for line in fileinput.input("data/feature_extracted_class3.txt"):
        data_y1.append(LabeledPoint(float(1 if int(line.split("\t")[2]) != 0 else 0),
                                    [float(i) for i in line.split("\t")[3:]]))
        data_y2.append(LabeledPoint(int(line.split("\t")[2]),
                                    [float(i) for i in line.split("\t")[3:]]))
    total, right, mse = 0, 0, []
    for t in xrange(10):
        kf = KFold(32 * 40, n_folds=10)
        for train, test in kf:
            data_train_y1, data_train_y2 = [], []
            for i in train:
                data_train_y1.append(data_y1[i])
                data_train_y2.append(data_y2[i])
            clf1 = DecisionTree.trainClassifier(sc.parallelize(data_train_y1), numClasses=2,
                                                categoricalFeaturesInfo={}, impurity='gini',
                                                maxDepth=5, maxBins=100)
            clf2 = DecisionTree.trainRegressor(sc.parallelize(data_train_y2),
                                               categoricalFeaturesInfo={}, impurity='variance',
                                               maxDepth=5, maxBins=100)
            for i in test:
                data_test_y1, data_test_y2 = data_y1[i], data_y2[i]
                r1 = clf1.predict(data_test_y1.features)
                r2 = clf2.predict(data_test_y2.features)
                if r1 == data_test_y1.label:
                    right += 1
                # note: this accumulates absolute errors, so the final figure is a mean absolute error
                mse.append(abs(r2 - data_test_y2.label))
                total += 1
    print float(right) / total, sum(mse) / len(mse)
print "Decision Tree feature vector length: " + str( len(first_point_tree.features)) # In[167]: from pyspark.mllib.tree import DecisionTree #from the RDD sample 20% for training and rest for test records_tree_with_idx = data_tree.zipWithIndex().map(lambda (k, v): (v, k)) test_tree_idx = records_tree_with_idx.sample(False, 0.2, 42) training_tree_idx = records_tree_with_idx.subtractByKey(test_tree_idx) test_tree = test_tree_idx.map(lambda (idx, p): p) training_tree = training_tree_idx.map(lambda (idx, p): p) model_tree = DecisionTree.trainRegressor(training_tree, {}) preds_tree = model_tree.predict(test_tree.map(lambda p: p.features)) actual_tree = test_tree.map(lambda p: p.label) true_vs_predicted_tree = actual_tree.zip(preds_tree) print "Decision Tree predictions: " + str(true_vs_predicted_tree.take(5)) print "Decision Tree depth: " + str(model_tree.depth()) print "Decision Tree number of nodes: " + str(model_tree.numNodes()) # In[177]: mse_tree = true_vs_predicted_tree.map(lambda (t, p): squared_error(t, p)).mean() mae_tree = true_vs_predicted_tree.map(lambda (t, p): abs_error(t, p)).mean()
help(DecisionTree.trainRegressor)

# ## Train a Regression Model on the Bike Sharing Dataset

# In[9]:

linear_model = LinearRegressionWithSGD.train(data, iterations=10, step=0.1, intercept=False)
true_vs_predicted = data.map(lambda p: (p.label, linear_model.predict(p.features)))
print "Linear Model predictions: " + str(true_vs_predicted.take(5))

# In[10]:

# we pass in an empty mapping {} for the categorical feature sizes
dt_model = DecisionTree.trainRegressor(data_dt, {})
preds = dt_model.predict(data_dt.map(lambda p: p.features))
actual = data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
print "Decision Tree depth: " + str(dt_model.depth())
print "Decision Tree number of nodes: " + str(dt_model.numNodes())

# ## Performance Metrics

# In[11]:

# set up performance metrics functions
def squared_error(actual, pred):
def extract_label(record):
    return float(record[17])

data_dt = RDD.map(lambda r: LabeledPoint(extract_label(r), extract_features_dt(r)))
first_point_dt = data_dt.first()
print "Decision Tree feature vector: " + str(first_point_dt.features)
print "Decision Tree feature vector length: " + str(len(first_point_dt.features))

training_dt, test_dt = data_dt.randomSplit([0.9, 0.1])
print "training_dt count = ", training_dt.count()
print "test_dt count = ", test_dt.count()

print "########### Start decision tree using Spark MLlib ################"
from pyspark.mllib.tree import DecisionTree
dt_model = DecisionTree.trainRegressor(training_dt, {})
preds = dt_model.predict(test_dt.map(lambda p: p.features))
actual = test_dt.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print "Decision Tree predictions: " + str(true_vs_predicted_dt.collect())
print "Decision Tree depth: " + str(dt_model.depth())
print "Decision Tree number of nodes: " + str(dt_model.numNodes())
# data_dt.saveAsTextFile("file:///home/cloudera/MMZ/FinalProject/temp/temp_training_data_dt")

def squared_error(actual, pred):
    # squared error, averaged later to give the Mean Squared Error (MSE)
    return (pred - actual)**2

def abs_error(actual, pred):
    # absolute error, averaged later to give the Mean Absolute Error (MAE)
    return np.abs(pred - actual)

def squared_log_error(pred, actual):
    # squared log error, used for the Root Mean Squared Log Error (RMSLE)
    return (np.log(pred + 1) - np.log(actual + 1))**2

mse_dt = true_vs_predicted_dt.map(lambda (t, p): squared_error(t, p)).mean()
summary = Statistics.colStats(testvecData)
variance = summary.variance()[0]
# compute the pseudo R-square
test_Rsqr1 = 1 - testMSE1 / float(variance)

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
# Use variance as the impurity for regression.
# maxDepth is the maximum number of levels for each tree.
model2 = DecisionTree.trainRegressor(trainparsedData, categoricalFeaturesInfo={},
                                     impurity='variance', maxDepth=8, maxBins=32)

# evaluate the training error:
# first make the prediction and create a new "vector" of all the predictions
trainpredictions = model2.predict(trainparsedData.map(lambda x: x.features))
# then column-bind the prediction and actual values into a new RDD
trainlabelsAndPredictions = trainparsedData.map(lambda lp: lp.label).zip(trainpredictions)
# use a map operation to compute MSE
trainMSE2 = trainlabelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(trainparsedData.count())
# use the Statistics library to obtain the variance
summary = Statistics.colStats(trainvecData)
variance = summary.variance()[0]
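# Continuing the pattern above: with trainMSE2 and the training-set variance in
# hand, the pseudo R-square for model2 follows the same formula; the variable
# name train_Rsqr2 is an assumption.
train_Rsqr2 = 1 - trainMSE2 / float(variance)
print train_Rsqr2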
# In[22]:

for i, x in enumerate(features):
    print i, x

# In[23]:

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = d2.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                    impurity="variance", maxDepth=6, maxBins=12)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

# In[24]:

#
plt.xlabel("response")
plt.ylabel("prediction")
print 'Decision Tree feature vector length: ' + str(len(first_point_dt.features))

from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.tree import DecisionTree
help(LinearRegressionWithSGD.train)

linear_model = LinearRegressionWithSGD.train(data, iterations=10, step=0.1, intercept=False)
true_vs_predicted = data.map(lambda point: (point.label, linear_model.predict(point.features)))
print 'Linear regression predictions for the first 5 samples: ' + str(true_vs_predicted.take(5))

dt_model = DecisionTree.trainRegressor(data_dt, {})
preds = dt_model.predict(data_dt.map(lambda p: p.features))
actual = data.map(lambda p: p.label)
true_vs_predicted_dt = actual.zip(preds)
print 'Decision tree predictions for the first 5 samples: ' + str(true_vs_predicted_dt.take(5))
print 'Decision tree depth: ' + str(dt_model.depth())
print 'Decision tree number of nodes: ' + str(dt_model.numNodes())

def squared_error(actual, pred):
    return (pred - actual)**2

def abs_error(actual, pred):
    return np.abs(pred - actual)