def main():
    """Grid-search a RandomForest regressor and report validation/test RMSE.

    Reads the pickled train and test sets, scores every ``maxBins`` x
    ``numTrees`` combination by root mean squared error on a held-out slice
    of the training data, retrains on the full training set with the best
    pair, and prints the best validation RMSE and the test RMSE.

    Relies on module-level names: ``sc``, ``input`` (base path of the data),
    ``parseInput``, ``RandomForest`` and ``math``.
    """

    def has_content(point):
        # Predicate kept from the original filter applied to every parsed RDD.
        return len(point.features) != 0 or len(point.label) != 0

    def rmse(model, points):
        """Return the root mean squared error of ``model`` on ``points``."""
        predictions = model.predict(points.map(lambda lp: lp.features))
        pairs = points.map(lambda lp: lp.label).zip(predictions)
        # The original counted pairs with v != p, which is a mismatch rate and
        # not a squared error; for real-valued predictions it is almost
        # always 1, making model selection meaningless.
        squared_sum = pairs.map(lambda vp: (vp[0] - vp[1]) ** 2).sum()
        return math.sqrt(squared_sum / float(points.count()))

    # Reading the test and train files
    trainData = sc.pickleFile(input + '/Train_data.average/part-00000')
    testData = sc.pickleFile(input + '/Test_data.average/part-00000')
    parsedData = trainData.map(parseInput).filter(has_content)
    parsedTestData = testData.map(parseInput).filter(has_content)

    num_trees_grid = [3, 5, 10]
    max_bins_grid = [5, 10, 15]
    best_error = float('inf')
    # Start from the first grid point so both names are always bound.
    best_max_bins = max_bins_grid[0]
    best_num_trees = num_trees_grid[0]

    # The split uses a fixed seed, so it is the same for every candidate and
    # can be taken once. Weights [1, 2] are kept from the original: one third
    # of the data trains, two thirds validate.
    train_rdd, valid_rdd = trainData.randomSplit([1, 2], 10)
    parsed_input = train_rdd.map(parseInput).filter(has_content)
    parsed_valid = valid_rdd.map(parseInput).filter(has_content)

    # Cross validation
    for max_bins in max_bins_grid:
        for num_trees in num_trees_grid:
            model = RandomForest.trainRegressor(parsed_input,
                                                categoricalFeaturesInfo={},
                                                numTrees=num_trees,
                                                featureSubsetStrategy="auto",
                                                impurity='variance',
                                                maxDepth=4,
                                                maxBins=max_bins)
            candidate_error = rmse(model, parsed_valid)
            if candidate_error < best_error:
                best_error = candidate_error
                best_max_bins = max_bins
                best_num_trees = num_trees

    # Finding Test error
    model = RandomForest.trainRegressor(parsedData,
                                        categoricalFeaturesInfo={},
                                        numTrees=best_num_trees,
                                        featureSubsetStrategy="auto",
                                        impurity='variance',
                                        maxDepth=4,
                                        maxBins=best_max_bins)
    RMSE_test = rmse(model, parsedTestData)

    # Reporting validation and test error
    print("Best Root Mean Squared Error Validation= " + str(best_error))
    print("Best Root Mean Squared Error Test= " + str(RMSE_test))
def _set_rddModel(self, _type, _SLA, data):
    """Train the model selected by ``_type``/``_SLA`` and store it on ``self._rddModel``.

    Only ``_SLA == 'randomForest'`` is handled; for any other value the
    attribute is set to an empty string. ``_type == 'regression'`` trains a
    regressor, every other ``_type`` trains a classifier.

    Hyper-parameters come from ``self.sparkOptions``: index 1 is maxDepth,
    index 2 is the impurity (classifier only), index 4 is numTrees and
    index 5 is featureSubsetStrategy.
    """
    if _SLA != 'randomForest':
        self._rddModel = ""
        return

    if _type == 'regression':
        self._rddModel = RandomForest.trainRegressor(
            data,
            categoricalFeaturesInfo={},
            numTrees=int(self.sparkOptions[4]),
            featureSubsetStrategy=self.sparkOptions[5],
            impurity='variance',
            maxDepth=int(self.sparkOptions[1]),
            maxBins=32)
        return

    # classification
    print(self.numClasses)
    self._rddModel = RandomForest.trainClassifier(
        data,
        numClasses=self.numClasses,
        categoricalFeaturesInfo={},
        numTrees=int(self.sparkOptions[4]),
        maxDepth=int(self.sparkOptions[1]),
        featureSubsetStrategy=self.sparkOptions[5],
        impurity=self.sparkOptions[2])
def _version_tuple(version):
    """Return the leading numeric components of a dotted version string as ints."""
    parts = []
    for piece in version.split('.'):
        digits = ''
        for ch in piece:
            if not ch.isdigit():
                break
            digits += ch
        if not digits:
            # Stop at the first component with no leading digits.
            break
        parts.append(int(digits))
    return tuple(parts)


def main():
    """Train a RandomForest regressor on the training file and print the test RMSE.

    ``sys.argv[1]`` is the training input path and ``sys.argv[2]`` the test
    input path; both are read with ``sc.textFile`` and converted with
    ``to_labeledpoint``.

    Raises:
        AssertionError: if the running Spark version is older than 1.5.1.
    """
    input_train = sys.argv[1]
    input_test = sys.argv[2]

    conf = SparkConf().setAppName('Sentiment Analysis with Random Forest')
    sc = SparkContext(conf=conf)
    # Compare numerically: as strings, '1.10.0' sorts before '1.5.1'.
    assert _version_tuple(sc.version) >= (1, 5, 1)

    train = sc.textFile(input_train).cache()
    test = sc.textFile(input_test).cache()

    # sbaronia - get training and testing labeled points
    train_lp = train.map(to_labeledpoint).cache()
    test_lp = test.map(to_labeledpoint).cache()

    # sbaronia - run RandomForest regression on our training data with
    # default options except numTrees = 5
    rf_model = RandomForest.trainRegressor(train_lp,
                                           categoricalFeaturesInfo={},
                                           numTrees=5,
                                           featureSubsetStrategy="auto",
                                           impurity='variance',
                                           maxDepth=4,
                                           maxBins=32)

    # sbaronia - run predictions on testing data and calculate RMSE value
    predictions = rf_model.predict(test_lp.map(lambda x: x.features))
    labelsAndPredictions = test_lp.map(lambda lp: lp.label).zip(predictions)
    squared_sum = labelsAndPredictions.map(
        lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y)
    rmse = math.sqrt(squared_sum / float(test_lp.count()))

    print("RMSE = " + str(rmse))
def test_regression(self):
    """Check that each MLlib regressor predicts the sign of the label on a toy set.

    Points with label -1.0 must predict <= 0 and points with label 1.0 must
    predict > 0. Also checks that the SGD trainers accept ``initialWeights``
    without raising ``ValueError``.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def check_signs(model):
        # Same four assertions, in the same order, for every model.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    # Linear models: train one, check it, then move to the next.
    for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
        check_signs(trainer.train(rdd))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories

    check_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
    check_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100, seed=1))
    check_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))

    try:
        for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
            trainer.train(rdd, initialWeights=array([1.0, 1.0]))
    except ValueError:
        self.fail()
def test_regression(self):
    """Verify sign-correct predictions for every MLlib regressor on four points.

    Each model is trained with small settings (10 SGD iterations, 4 bins,
    10 trees, 4 boosting iterations) and must predict <= 0 for the points
    labelled -1.0 and > 0 for those labelled 1.0. Finally, the SGD trainers
    must accept ``initialWeights`` without raising ``ValueError``.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def assert_sign_pattern(model):
        # Points 0 and 2 are the negative examples, 1 and 3 the positive ones.
        for idx, positive in enumerate((False, True, False, True)):
            value = model.predict(features[idx])
            if positive:
                self.assertTrue(value > 0)
            else:
                self.assertTrue(value <= 0)

    lr_model = LinearRegressionWithSGD.train(rdd, iterations=10)
    assert_sign_pattern(lr_model)

    lasso_model = LassoWithSGD.train(rdd, iterations=10)
    assert_sign_pattern(lasso_model)

    rr_model = RidgeRegressionWithSGD.train(rdd, iterations=10)
    assert_sign_pattern(rr_model)

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories

    dt_model = DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
    assert_sign_pattern(dt_model)

    rf_model = RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
        maxBins=4, seed=1)
    assert_sign_pattern(rf_model)

    gbt_model = GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
    assert_sign_pattern(gbt_model)

    start_weights = array([1.0, 1.0])
    try:
        LinearRegressionWithSGD.train(rdd, initialWeights=start_weights, iterations=10)
        LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
    except ValueError:
        self.fail()
def Regression_Model(filename):
    """Train four regressors on the CSV data and report their training error.

    Builds one ``LabeledPoint`` per row (label from ``True_price_train``,
    single feature from ``close_price_train``), fits linear regression, a
    decision tree, a random forest and gradient-boosted trees, prints the
    error of each on the training RDD, and prints the short name of the model
    with the lowest error.

    Args:
        filename: path handed to ``get_csv_data``.

    Returns:
        ``(Date, True_price, model, open_price, close_price, es_model)``,
        where ``model`` is the linear regression model and ``es_model`` is
        the short name of the lowest-error model.
    """
    (open_price, close_price, open_price_train, close_price_train,
     True_price, True_price_train, Date) = get_csv_data(filename)

    # Index 0 is skipped, exactly as in the original loop.
    output = [
        LabeledPoint(label=True_price_train[i], features=[close_price_train[i]])
        for i in range(1, len(Date))
    ]
    output_train_RDD = sc.parallelize(output).cache()

    lrm = LinearRegressionWithSGD.train(output_train_RDD,
                                        step=0.001,
                                        iterations=100000)
    tree = DecisionTree.trainRegressor(output_train_RDD,
                                       categoricalFeaturesInfo={},
                                       impurity='variance',
                                       maxDepth=5,
                                       maxBins=30)
    forest = RandomForest.trainRegressor(output_train_RDD,
                                         categoricalFeaturesInfo={},
                                         numTrees=3,
                                         featureSubsetStrategy="auto",
                                         impurity='variance',
                                         maxDepth=5,
                                         maxBins=30)
    gradient = GradientBoostedTrees.trainRegressor(output_train_RDD,
                                                   categoricalFeaturesInfo={},
                                                   numIterations=10)

    print("\n============MODEL Evaluation=============\n")
    candidates = [
        ('LinearRegression', 'lrm', lrm),
        ('DecisionTree', 'tree', tree),
        ('RandomForest', 'forest', forest),
        ('GradientBoostedTrees', 'gradient', gradient),
    ]

    # Swap in a different model here to change what is returned.
    # NOTE(review): the returned model is always `lrm`, even when `es_model`
    # names another one - confirm this is intended.
    output_model_RDD = lrm

    result = ''
    # An infinite sentinel (instead of 1000) guarantees a model is selected
    # even when every error is large; `es_model` is pre-bound so it can never
    # be undefined at the print/return below (e.g. if every error is NaN).
    err = float('inf')
    es_model = candidates[0][1]
    # The RDD does not change between models, so count it once.
    sample_count = float(output_train_RDD.count())
    feature_rdd = output_train_RDD.map(lambda x: x.features)
    label_rdd = output_train_RDD.map(lambda lp: lp.label)

    for display_name, short_name, model in candidates:
        predictions = model.predict(feature_rdd)
        labelsAndPredictions = label_rdd.zip(predictions)
        # Despite the "Mean Squared Error" label in the report, this value is
        # the square root of the mean squared error.
        MSE = (labelsAndPredictions.map(
            lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /
            sample_count) ** 0.5
        result += display_name + "\tMean Squared Error\t=" + str(MSE) + "\n"
        if err > MSE:
            err = MSE
            es_model = short_name

    print(result)
    print(es_model)
    return Date, True_price, output_model_RDD, open_price, close_price, es_model
def trainRandomForestModel(data):
    """
    Fit a random forest regressor on the given data and hand it back.

    All features are treated as continuous (empty ``categoricalFeaturesInfo``).

    :param data: RDD[LabeledPoint]
    :return: random forest regression model
    """
    from pyspark.mllib.tree import RandomForest

    forest_settings = {
        'categoricalFeaturesInfo': {},
        'numTrees': 2000,
        'featureSubsetStrategy': "auto",
        'impurity': "variance",
        'maxDepth': 4,
        'maxBins': 32,
    }
    return RandomForest.trainRegressor(data, **forest_settings)
def build_regressors(self, split_dataset, split_kmeans_dataset, feature_keys):
    """Fit one KMeans model and one RandomForest regressor per dataset pair.

    For every ``(dataset, kmeans_dataset)`` pair, a 100-cluster KMeans model
    is trained on the ``feature_keys`` columns of ``kmeans_dataset``, and a
    RandomForest regressor is trained on a random 70% of ``dataset`` (labels
    taken from ``self.target_key`` and divided by 10000). The remaining 30%
    is used to compute a test MSE.

    Args:
        split_dataset: iterable of record collections for the regressors.
        split_kmeans_dataset: iterable of record collections for KMeans,
            paired positionally with ``split_dataset``.
        feature_keys: keys of the feature columns to read from each record.

    Returns:
        A list of ``(model, clusters, testMSE)`` tuples; ``testMSE`` is -1
        when it could not be computed.
    """
    self.logger.info('building regressors')
    mce_tuples = []

    def preprocess(observation):
        # Rescale the label; the observation is updated in place and returned.
        observation.label = float(observation.label / 10000)
        return observation

    for dataset, kmeans_dataset in zip(split_dataset, split_kmeans_dataset):
        kmeans_train_set = sc.parallelize([
            array([item[column] for column in feature_keys])
            for item in kmeans_dataset
        ])
        del kmeans_dataset
        clusters = KMeans.train(kmeans_train_set,
                                100,
                                maxIterations=200,
                                runs=10,
                                initializationMode="random")
        del kmeans_train_set

        data = sc.parallelize([
            LabeledPoint(item[self.target_key],
                         [item[column] for column in feature_keys])
            for item in dataset
        ])
        del dataset
        data = data.map(preprocess)

        (trainingData, testData) = data.randomSplit([0.7, 0.3])
        model = RandomForest.trainRegressor(
            trainingData,
            categoricalFeaturesInfo={},
            numTrees=self.rfr_config['num_trees'],
            featureSubsetStrategy=self.rfr_config['feature_subset_strategy'],  # "all",
            impurity='variance',
            maxDepth=self.rfr_config['max_depth'])

        predictions = model.predict(testData.map(lambda x: x.features))
        labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)

        testMSE = -1
        try:
            testMSE = labelsAndPredictions.map(
                lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / \
                float(testData.count())
        except Exception as exc:
            # Still best-effort (e.g. an empty test split divides by zero),
            # but the failure is recorded instead of vanishing, and
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            self.logger.info('test MSE unavailable, keeping -1: %s' % exc)

        mce_tuples.append((model, clusters, testMSE))

    self.logger.info('regressors build finished')
    return mce_tuples
def train_model(filename='final_tip_all.txt',
                test_portion=0.2,
                cat_var=cat_var_dic,
                n_tree=250,
                mode_feature_strat='auto',
                max_deep=5,
                max_bin=32):
    """Train a RandomForest regressor on a LibSVM file and print its test RMSE.

    Args:
        filename: LibSVM-format file loaded with ``MLUtils.loadLibSVMFile``.
        test_portion: fraction of the data held out for testing.
        cat_var: ``categoricalFeaturesInfo`` mapping passed to the trainer.
        n_tree: number of trees.
        mode_feature_strat: ``featureSubsetStrategy``; "auto" lets the
            algorithm choose.
        max_deep: maximum tree depth.
        max_bin: ``maxBins`` passed to the trainer.
    """
    # Reuse a running context: a bare SparkContext() raises when one already
    # exists, e.g. on a second call to this function.
    sc = SparkContext.getOrCreate()
    sqlContext = SQLContext(sc)
    spark = SparkSession.builder.appName("RandomForestRegressor").getOrCreate()

    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, filename)

    # Split the data into training and test sets (test_portion held out).
    (trainingData, testData) = data.randomSplit([1 - test_portion, test_portion])

    ##### TREAT TEMP AS CONTINUOUS ####
    model = RandomForest.trainRegressor(
        trainingData,
        categoricalFeaturesInfo=cat_var,
        numTrees=n_tree,
        featureSubsetStrategy=mode_feature_strat,
        impurity='variance',
        maxDepth=max_deep,
        maxBins=max_bin)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testMSE = labelsAndPredictions.map(
        lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(testData.count())
    testRMSE = math.sqrt(testMSE)

    # convert the rdd object to dataframe as follows
    df_predictions = predictions.map(lambda x: (x, )).toDF()
    df_predictions.cache()

    print('Test Root Mean Squared Error on ' + filename + ' = ' + str(testRMSE))
def getRandomForestRMSE(trees_array):
    """Return ``(numTrees, validation RMSE)`` for each tree count in ``trees_array``.

    Every model is trained on the module-level ``train_featureScoreTimeRDD``
    and scored on ``val_featureScoreTimeRDD``.
    """
    def squared_error(pair):
        label, prediction = pair
        return (label - prediction) * (label - prediction)

    scores = []
    for tree_count in trees_array:
        forest = RandomForest.trainRegressor(train_featureScoreTimeRDD,
                                             categoricalFeaturesInfo={},
                                             numTrees=tree_count,
                                             featureSubsetStrategy="auto",
                                             impurity='variance',
                                             maxDepth=4,
                                             maxBins=32)
        val_features = val_featureScoreTimeRDD.map(lambda lp: lp.features)
        val_predictions = forest.predict(val_features)
        val_labels = val_featureScoreTimeRDD.map(lambda lp: lp.label)
        paired = val_labels.zip(val_predictions)
        mean_squared = paired.map(squared_error).sum() / float(
            val_featureScoreTimeRDD.count())
        scores.append((tree_count, mean_squared ** 0.5))
    return scores
def test_regression(self):
    """Train each MLlib regressor with default settings and check prediction signs.

    On the four toy points, predictions must be <= 0 where the label is -1.0
    and > 0 where the label is 1.0.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories

    # Each entry trains one model; models are built lazily so that training
    # and checking stay interleaved in the original order.
    builders = [
        lambda: LinearRegressionWithSGD.train(rdd),
        lambda: LassoWithSGD.train(rdd),
        lambda: RidgeRegressionWithSGD.train(rdd),
        lambda: DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo),
        lambda: RandomForest.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100),
        lambda: GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo),
    ]
    for build in builders:
        model = build()
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)
def testRegression(trainingData, testData):
    """Fit a 3-tree RandomForest regressor and print its test MSE and structure.

    Args:
        trainingData: RDD of labeled points used for fitting.
        testData: RDD of labeled points used for evaluation.
    """
    def squared_error(pair):
        diff = pair[0] - pair[1]
        return diff * diff

    # Empty categoricalFeaturesInfo means every feature is continuous;
    # featureSubsetStrategy="auto" lets the algorithm choose.
    # Note: use a larger numTrees in practice.
    model = RandomForest.trainRegressor(trainingData,
                                        categoricalFeaturesInfo={},
                                        numTrees=3,
                                        featureSubsetStrategy="auto",
                                        impurity='variance',
                                        maxDepth=4,
                                        maxBins=32)

    # Score the held-out instances.
    test_features = testData.map(lambda x: x.features)
    predictions = model.predict(test_features)
    test_labels = testData.map(lambda lp: lp.label)
    labelsAndPredictions = test_labels.zip(predictions)
    total_squared_error = labelsAndPredictions.map(squared_error).sum()
    testMSE = total_squared_error / float(testData.count())

    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression forest model:')
    print(model.toDebugString())
def train(self):
    """
    Trains the Random Forest model with the optimal parameters.

    The number of trees and the depth come from ``find_rf_parameters``; the
    model is fitted on ``self._train_data`` with all features treated as
    continuous and ``maxBins=54``.

    @return: The trained RF model
    """
    # The unused `target_test` RDD that was built here has been removed.
    hyper_params = self.find_rf_parameters()
    # NOTE(review): find_rf_parameters tunes with a non-empty
    # categoricalFeaturesInfo while this call passes {} - confirm which one
    # is intended.
    rf_model = RandomForest.trainRegressor(self._train_data,
                                           categoricalFeaturesInfo={},
                                           numTrees=hyper_params['trees'],
                                           featureSubsetStrategy="auto",
                                           impurity="variance",
                                           maxDepth=hyper_params['depth'],
                                           maxBins=54)
    return rf_model
def testRegression(trainingData, testData):
    """Train a small RandomForest regressor, then print test MSE and the forest.

    Args:
        trainingData: RDD of labeled points to train on.
        testData: RDD of labeled points to evaluate on.
    """
    # All features continuous (empty categoricalFeaturesInfo); "auto" lets
    # the algorithm pick the feature subset strategy. Three trees is only a
    # demo value - use more in practice.
    forest_options = dict(categoricalFeaturesInfo={},
                          numTrees=3,
                          featureSubsetStrategy="auto",
                          impurity='variance',
                          maxDepth=4,
                          maxBins=32)
    model = RandomForest.trainRegressor(trainingData, **forest_options)

    # Evaluate on the test instances.
    predictions = model.predict(testData.map(lambda point: point.features))
    labelsAndPredictions = testData.map(lambda point: point.label).zip(predictions)
    squared_errors = labelsAndPredictions.map(
        lambda pair: (pair[0] - pair[1]) * (pair[0] - pair[1]))
    testMSE = squared_errors.sum() / float(testData.count())

    for line in ('Test Mean Squared Error = ' + str(testMSE),
                 'Learned regression forest model:',
                 model.toDebugString()):
        print(line)
def mllib_rf_regress(lp_train_rdd, lp_test_rdd, trees, depth, bins):
    '''
    Fit a RandomForest regressor and return its mean squared error on the test set.

    lp_train_rdd / lp_test_rdd are train/test LabeledPoint rdds; trees, depth
    and bins are passed to the trainer as numTrees, maxDepth and maxBins.
    '''
    def squared_error(pair):
        return (pair[0] - pair[1]) * (pair[0] - pair[1])

    forest = RandomForest.trainRegressor(lp_train_rdd,
                                         categoricalFeaturesInfo={},
                                         numTrees=trees,
                                         featureSubsetStrategy="auto",
                                         impurity='variance',
                                         maxDepth=depth,
                                         maxBins=bins)

    # Score the model on the test instances.
    test_features = lp_test_rdd.map(lambda x: x.features)
    test_labels = lp_test_rdd.map(lambda lp: lp.label)
    labels_with_predictions = test_labels.zip(forest.predict(test_features))
    return labels_with_predictions.map(squared_error).sum() / float(
        lp_test_rdd.count())
def find_rf_parameters(self):
    """
    Iterates through a set of numbers corresponding to numTrees and maxDepth
    to search for the optimal hyperparameters.

    Every ``(trees, depth)`` pair from ``self._trees`` x ``self._depths`` is
    trained on ``self._train_data`` and scored by mean squared error on that
    same training data.

    @return: The best hyperparameters, that minimise MSE
    """
    # Feature index -> number of categories.
    categorical_info = {3: 153, 4: 4, 5: 80}
    # MLlib requires maxBins to be at least the largest category count of any
    # categorical feature; the previous value of 54 is below the 153
    # categories declared for feature 3.
    max_bins = max(54, max(categorical_info.values()))

    min_error = float('inf')
    num_trees = self._trees[0]
    depth = self._depths[0]

    # These RDDs are the same for every candidate, so define them once.
    train_features = self._train_data.map(lambda x: x.features)
    target_train = self._train_data.map(lambda p: p.label)

    for i in self._trees:
        for j in self._depths:
            rf_model = RandomForest.trainRegressor(
                self._train_data,
                categoricalFeaturesInfo=categorical_info,
                numTrees=i,
                featureSubsetStrategy="auto",
                impurity="variance",
                maxDepth=j,
                maxBins=max_bins)
            predictions = rf_model.predict(train_features)
            rf_values = target_train.zip(predictions.map(lambda x: float(x)))
            metrics_rf = RegressionMetrics(rf_values)
            mse = metrics_rf.meanSquaredError
            if mse < min_error:
                min_error = mse
                num_trees = i
                depth = j

    self._log.info('Estimating Parameters for Random Forests:\n=====')
    self._log.info('MSE = {}, trees = {}, depth = {}'.format(
        min_error, num_trees, depth))
    return {'trees': num_trees, 'depth': depth}
def train_amount_model(self, model, data, i):
    """Train the amount model with the method selected on this instance.

    Args:
        model: a previously trained model or ``None``. It is passed on to the
            neural-network trainer, and its ``weights`` seed the linear
            regression when it is not ``None``.
        data: local collection of training points, parallelized via ``self.sc``.
        i: not used by this method.

    Returns:
        The newly trained model.

    Raises:
        ValueError: if ``self.amount_prediction_method`` is not one of the
            known methods.
    """
    rdd_data = self.sc.parallelize(data)
    self.logger.info('Start to train the amount model')
    method = self.amount_prediction_method

    if method == self.ARTIFICIAL_NEURAL_NETWORK:
        input_num = self.feature_num
        # Floor division keeps the layer sizes integral on Python 3 as well
        # (plain `/` would yield floats there).
        layers = [input_num, input_num // 3 * 2, input_num // 3, 1]
        neural_network = NeuralNetworkSpark(layers=layers, bias=0)
        model = neural_network.train(rdd_data,
                                     method=neural_network.BP,
                                     seed=1234,
                                     learn_rate=0.0001,
                                     iteration=15,
                                     model=model)
    elif method == self.RANDOM_FOREST:
        model = RandomForest.trainRegressor(rdd_data,
                                            categoricalFeaturesInfo={},
                                            numTrees=40,
                                            featureSubsetStrategy="auto",
                                            impurity='variance',
                                            maxDepth=20,
                                            maxBins=32)
    elif method == self.LINEAR_REGRESSION:
        model = LinearRegressionWithSGD.train(
            rdd_data,
            iterations=10000,
            step=0.001,
            initialWeights=model.weights if model is not None else None)
    else:
        message = "Unknown training method {}".format(method)
        self.logger.error(message)
        raise ValueError(message)
    return model
features_modeled_train, features_categorical_indexed_vec_train) ## select the one-hot-encoded categorical features along with numerical features as well as label to contrust the modeling dataset df_train_modeling = df_train.select(features_modeled_train) ## df_train_modeling_rdd for mllib package df_train_modeling_rdd = df_train_modeling.rdd.map( lambda p: convert_sparsevec_to_vec_df( p, features_categorical_indexed_vec_index_train)) df_train_modeling_rdd = df_train_modeling_rdd.map( lambda l: LabeledPoint(l[0], l[1:])) ################################################## 5: train random forest regression model ## random forest ## train model rfModel = RandomForest.trainRegressor(df_train_modeling_rdd, categoricalFeaturesInfo={}, numTrees=100, featureSubsetStrategy="auto", impurity='variance', maxDepth=10, maxBins=32) # Predict on train data predictions = rfModel.predict( df_train_modeling_rdd.map(lambda l: l.features)) ## Evaluation of the model predictionAndObservations = predictions.zip( df_train_modeling_rdd.map(lambda l: l.label)) testMetrics = RegressionMetrics(predictionAndObservations) model_time = str(model_time[0][0]) df_model_performance = spark.createDataFrame( sc.parallelize( [[model_time, testMetrics.rootMeanSquaredError, testMetrics.r2]]), ["model_time", "RMSE", "R2"])
# Local Spark setup for the "BeijingTomorrow" job.
sparkConf = SparkConf().setAppName("BeijingTomorrow").setMaster("local")
sc = SparkContext(conf=sparkConf)
sqlContext = SQLContext(sc)

(visual_training_image_array,
 visual_training_outcome_array) = loadVisualTrainingDataToArray()

# Pair every outcome with its image so each element is one observation.
visual_training_data = []
for i, outcome in enumerate(visual_training_outcome_array):
    visual_training_data.append((outcome, visual_training_image_array[i]))

visual_training_rdd = sc.parallelize(visual_training_data)
visual_data_flattened = visual_training_rdd.map(
    lambda obs: (obs[0], averageBrightness4By4(obs[1])))
visual_data_labeled_points = visual_data_flattened.map(
    lambda obs: varsToLabeledPoint(obs))

toprint = visual_data_labeled_points.take(1)
print(str(toprint))

visual_model = RandomForest.trainRegressor(visual_data_labeled_points,
                                           categoricalFeaturesInfo={},
                                           numTrees=1000,
                                           featureSubsetStrategy="auto",
                                           impurity='variance',
                                           maxDepth=5,
                                           maxBins=100)
#visual_model = LinearRegressionWithSGD.train(visual_data_labeled_points, iterations=3,intercept=True)

visual_training_vectors = visual_data_flattened.map(
    lambda obs: featuresToVectors(obs[1]))
toprint = visual_training_vectors.take(1)
print(str(toprint))

# In-sample predictions, paired with their labels.
visual_in_sample_predictions = visual_model.predict(visual_training_vectors)
visual_in_sample_labels_and_predictions = visual_data_labeled_points.map(
    lambda lp: lp.label).zip(visual_in_sample_predictions)
visual_in_sample_labels_and_predictions.foreach(printline)

# Per-observation terms: (y, y^2, residual, residual^2, 1), summed component-wise.
squaresdf = visual_in_sample_labels_and_predictions.map(
    lambda p: (p[0],
               p[0] * p[0],
               p[0] - p[1],
               (p[0] - p[1]) * (p[0] - p[1]),
               1))
squares = squaresdf.reduce(
    lambda a, b: tuple(left + right for left, right in zip(a, b)))

tss = float(squares[1]) - float(squares[0] * squares[0]) / float(squares[4])
rss = float(squares[3]) - float(squares[2] * squares[2]) / float(squares[4])
r2 = 1 - rss / tss
print("Training set:")
def rf(userID, n):
    """Recommend games for a user via a RandomForest regressor on their library.

    Builds the game-profile frame, runs ``cross_validate`` and prints its
    accuracies, fetches the user's owned games and writes them to a per-user
    file, trains a RandomForest regressor on the owned games, predicts for
    every game in the profile frame, drops the games already in the library,
    and returns the first ``n`` rows sorted by predicted value, descending.

    Returns ``None`` (after printing a message) when the owned-games list is
    empty. Uses Python 2 constructs (print statements, ``iteritems``,
    indexable ``dict.keys()``/``values()``).

    NOTE(review): the block nesting below was reconstructed from a flattened
    source - confirm it against the original file.
    """
    ### CREATING GAME PROFILE DF ####
    game_profiles = get_game_profiles()
    df = pd.DataFrame(game_profiles)
    df_clean = preprocess(df)

    # Full df for games only, no playtimes (for prediction later)
    df_games = df_clean.drop('genres', 1)
    #df_games = df_games.drop('name', 1)
    df_games = df_games.drop('appID', 1)
    df_games = df_games.drop('cat', 1)
    df_games = df_games.drop('tags', 1)
    df_games = df_games.drop('type', 1)

    games = get_games('/media/sf_AdvancedML/Final/gameData.txt')
    # Names of owned games that are not present in `games`.
    missing = set()

    ### CROSS VALIDATING ###
    all_accur, avg_accur = cross_validate(df_clean, games, 10)
    print "Accuracies, Average Accuracy"
    print all_accur, avg_accur

    ### TRAIN ON INCOMING USER ###
    ownedGames = build_user_dataset.get_ownedGames(userID) #json object
    with open('/media/sf_AdvancedML/Final/userData'+str(userID)+'.txt', 'w') as outFile:
        if len(ownedGames) == 0:
            print "This user's library is empty or unreachable."
            return
        json.dump({'user': userID, 'ownedGames':ownedGames}, outFile)

    # initialize empty frame with appropriate columns
    df = pd.DataFrame(columns = list(df_clean.columns.values)+['playtime'])

    # Randomly select user's library
    gamesOwned = get_gamesOwned('/media/sf_AdvancedML/Final/userData'+str(userID)+'.txt')
    user = random.choice(gamesOwned.values())
    gamesList = gamesOwned[gamesOwned.keys()[0]].keys()

    # Connect playtime to game df for games owned
    if len(user.values()) > 0:
        #print user.values()[0]
        for k, v in user.values()[0].iteritems():
            if k in games:
                row = df_clean.loc[df_clean['name'] == k]
                # The stored value is the natural log of the raw value.
                row['playtime'] = np.log(v)
                df = df.append(row)
            else:
                missing.add(k)

    df = df.drop('genres', 1)
    df = df.drop('name', 1)
    df = df.drop('appID', 1)
    df = df.drop('cat', 1)
    df = df.drop('tags', 1)
    df = df.drop('type', 1)

    # Pass User DF to Spark
    df.to_csv('/media/sf_AdvancedML/Final/RF.csv')
    data = sc.textFile('/media/sf_AdvancedML/Final/RF.csv')
    header = data.first()
    data = data.filter(lambda x: x != header)
    data = data.map(lambda line: convertUni(line))
    data = data.map(lambda line: line.split(','))

    # RDD of (label, features) pairs
    # NOTE(review): to_csv above is called with its defaults, so the first
    # CSV column may be the DataFrame index rather than playtime - confirm
    # that line[0] is the intended label.
    data = data.map(lambda line: LabeledPoint(line[0], line[1:]))
    model = RandomForest.trainRegressor(data, categoricalFeaturesInfo = {}, numTrees = 3, featureSubsetStrategy = "auto", impurity = 'variance', maxDepth = 4)

    ### PREDICT ###
    # for every game in Steam library
    # df_games.to_csv('/media/sf_AdvancedML/Final/RF_games_names.csv')
    # NOTE(review): RF_games_names.csv is read below, but the line writing it
    # appears to be commented out - confirm the file is produced elsewhere.
    df_games.drop('name', 1).to_csv('/media/sf_AdvancedML/Final/RF_games.csv')
    data_games = sc.textFile('/media/sf_AdvancedML/Final/RF_games.csv')
    header = data_games.first()
    data_games = data_games.filter(lambda x: x != header)
    data_games = data_games.map(lambda line: convertUni(line))
    data_games = data_games.map(lambda line: line.split(','))

    data_test = sc.textFile('/media/sf_AdvancedML/Final/RF_games_names.csv')
    header2 = data_test.first()
    data_test = data_test.filter(lambda x: x != header2)
    data_test = data_test.map(lambda line: convertUni(line))
    data_test = data_test.map(lambda line: line.split(','))

    predictions = model.predict(data_games)
    # Column 6 of the names file is used as the identifier paired with each
    # prediction (presumably the game name - verify against the CSV layout).
    idPredictions = data_test.map(lambda x: x[6]).zip(predictions)

    # Filter predictions for games owned or trailers/apps
    idPredictions = idPredictions.filter(lambda x: x[0] not in gamesList)

    # Export predictions to pandas df
    predDF = idPredictions.toDF()
    predDF = predDF.toPandas()
    # Name, Prediction
    predDF.columns = ['Name', 'PredictedPlaytime']

    # Returning top n not in library
    sorted_predDF = predDF.sort_values(by = 'PredictedPlaytime', ascending = False)
    recs = []
    #while len(recs) <= n:
    # check if rec in library
    #game =
    # check if game or trailer/app
    return sorted_predDF[:n]
return LabeledPoint(loss, array(line_split[1:len(line_split) - 1])) train_data_labeled_point = train_data_csv.map(parse_labled_point) test_data_labeled_point = test_data_csv.map(parse_labled_point) # ======================= TRAIN MODEL ================================================= t0 = time() # smaller MSE generally indicates a better estimate # after tweak round parameters # FeatureSubsetStrargety=auto => it will help us analyse and choose best algorithm base on dataset # larger numTrees and maxDepth will be more accurate but it will take long time to train # so I think 10 wound be balance model = RandomForest.trainRegressor(train_data_labeled_point, categoricalFeaturesInfo={}, numTrees=10, maxDepth=10, featureSubsetStrategy="auto") tt = time() - t0 print("RandomForest trained in {} seconds".format(round(tt, 3))) # ======================= TEMPORARY TEST PREDICT MODEL ================================================= t0 = time() predictions = model.predict(test_data_labeled_point.map(lambda x: x.features)) labels_preds = test_data_labeled_point.map(lambda x: x.label).zip(predictions) testMSE = labels_preds.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() / float( test_data_labeled_point.count()) tt = time() - t0 print("Prediction made in {} seconds.".format(round(tt, 3))) print('Test Mean Squared Error = ' + str(testMSE))
# sc.setLogLevel("ERROR")
t1 = datetime.datetime.now()

# Each input line is space-separated: feature columns first, label last.
data = (sc.textFile('hdfs://node1:9000/input/checkerboard2x2_train.txt')
        .map(lambda line: line.split(' '))
        .map(lambda fields: LabeledPoint(fields[-1], fields[:-1])))
#print(data .take(20))

train, test = data.randomSplit([95.0, 5.0])

# training model
reg = RandomForest.trainRegressor(train,
                                  numTrees=100,
                                  categoricalFeaturesInfo={})

# Report the elapsed wall-clock time in minutes.
t2 = datetime.datetime.now()
time_difference = t2 - t1
time_difference_in_minutes = time_difference / timedelta(minutes=1)
print('Time elapsed = ', time_difference_in_minutes, ' minutes')

test_features = test.map(lambda x: x.features)
predictions = reg.predict(test_features)

# creating RDD of pairs of (true_label, predicted_label)
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
squared_error_total = labelsAndPredictions.map(
    lambda pair: (pair[0] - pair[1]) ** 2).sum()
testMSE = squared_error_total / float(test.count())
# Data points as LabeledPoints # (crime count, [beat, week]) predArrayLP = joinedData.map(lambda x: LabeledPoint(x[ 0], [weekDict[x[1][0]], beatsDict[x[1][1]], x[1][2]])) # Split into training and testing set. 70-30 split. (train, test) = predArrayLP.randomSplit([0.7, 0.3]) # Feature categories : featuresCat = {0: len(beatsDict), 1: 53} maxBins = max(len(beatsDict), len(weekDict)) model = RandomForest.trainRegressor(train, categoricalFeaturesInfo=featuresCat, numTrees=10, featureSubsetStrategy="auto", impurity='variance', maxDepth=5, maxBins=maxBins) # Evaluate model on test instances and compute test error predictions = model.predict(test.map(lambda x: x.features)) #rschoolCountBeats = schoolCount.map(lambda x: x[0]) predOutput = predictions.collect() labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions) testMSE = labelsAndPredictions.map(lambda (v, p): (v - p) * (v - p)).sum() / float(test.count()) print('Test Mean Squared Error = ' + str(testMSE)) ### Write output to file ### with open("predictions.txt", 'wb') as f:
continue #print(item[1]) #print(matched.first()) labeledArray.append(createLabeledPoint(item, matched.first())) print("Appended DataSets") #Convert the array of labelled points into an RDD dataSet = sc.parallelize(labeledArray) #Split the data into testing and training (trainingData, testData) = dataSet.randomSplit([.7, .3]) #Create the Random Forest model using the training data and generate predictions using the test data modelRF = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={}, numTrees=5, impurity='variance', maxDepth=4, maxBins=32) predictionsRF = modelRF.predict(testData.map(lambda x: x.features)) #Gradient Boosted Model modelGB = GradientBoostedTrees.trainRegressor(trainingData, categoricalFeaturesInfo={}, numIterations=3) predictionsGB = modelGB.predict(testData.map(lambda x: x.features)) #Linear Regression Model modelLin = LinearRegressionWithSGD.train(trainingData, iterations=100, step=0.00000001) predictionsLin = modelLin.predict(testData.map(lambda x: x.features))
# Column 7 is the label, columns 0-6 are the features.
val_data = truetestData.map(
    lambda fields: LabeledPoint(fields[7], fields[0:7]))

# debug
print(data.take(1))
print(val_data.take(1))

# for holdout validation
(trData, tData) = data.randomSplit([0.7, 0.3])

# random forest training model
# Features 0, 1 and 2 are declared categorical with 13, 1499 and 2 values.
mod = RandomForest.trainRegressor(
    trData,
    categoricalFeaturesInfo={0: 13, 1: 1499, 2: 2},
    numTrees=4,
    featureSubsetStrategy="auto",
    impurity='variance',
    maxDepth=8,
    maxBins=1500)

# prediction and evaluation
holdout_features = tData.map(lambda x: x.features)
predictions = mod.predict(holdout_features)
validation_features = val_data.map(lambda x: x.features)
pred = mod.predict(validation_features)

labelsAndPredictions = tData.map(lambda lp: lp.label).zip(predictions)
truePred = val_data.map(lambda lp: lp.label).zip(pred)

metrics = RegressionMetrics(labelsAndPredictions)
met2 = RegressionMetrics(truePred)

# Squared Error
print("Validation MSE = %s" % metrics.meanSquaredError)
# Assemble the selected training columns into a single "features" vector column.
assembler_train = VectorAssembler(inputCols=newlist_train,
                                  outputCol="features")
transformed_train = assembler_train.transform(merged_train)

# Build the training set as labeled points: label = stars, features = assembled vector.
features_and_stars = transformed_train.select("features", "stars")
data_train = features_and_stars.map(
    lambda row: LabeledPoint(row.stars, row.features))

# Fit the random forest regressor.
model_train = RandomForest.trainRegressor(data_train,
                                          categoricalFeaturesInfo={},
                                          numTrees=10,
                                          featureSubsetStrategy="auto",
                                          impurity='variance',
                                          maxDepth=8,
                                          maxBins=32)

########################################################################################################
# PREDICTIONS ON FINAL (TEST) DATASET USING DEVELOPED MODEL 'model_train'
########################################################################################################

# Columns used for prediction: everything except the excluded names.
removelist_final = {'business_id', 'review_id', 'u_review_count'}
newlist_final = []
for i, v in enumerate(merged_final_ku_kb.columns):
    if v not in removelist_final:
        newlist_final.append(v)
def test_regression(self):
    """Train each MLlib regressor on four points and check the prediction signs.

    Also checks that the SGD-based trainers accept ``initialWeights`` and
    that ``maxBins`` reaches ``GradientBoostedTrees.trainRegressor``.
    """
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD,
        LassoWithSGD,
        RidgeRegressionWithSGD,
    )
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    # The label has the same sign as the second feature; the first is constant.
    data = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2]),
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def check_signs(model):
        # Points 0 and 2 carry negative labels, points 1 and 3 positive ones.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    sgd_trainers = (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD)
    for trainer in sgd_trainers:
        check_signs(trainer.train(rdd, iterations=10))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    check_signs(
        DecisionTree.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4
        )
    )
    check_signs(
        RandomForest.trainRegressor(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numTrees=10,
            maxBins=4,
            seed=1,
        )
    )
    check_signs(
        GradientBoostedTrees.trainRegressor(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4
        )
    )

    # Explicit initial weights must be accepted by every SGD trainer.
    try:
        for trainer in sgd_trainers:
            trainer.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
    except ValueError:
        self.fail()

    # Verify that maxBins is being passed through
    GradientBoostedTrees.trainRegressor(
        rdd,
        categoricalFeaturesInfo=categoricalFeaturesInfo,
        numIterations=4,
        maxBins=32,
    )
    with self.assertRaises(Exception):
        GradientBoostedTrees.trainRegressor(
            rdd,
            categoricalFeaturesInfo=categoricalFeaturesInfo,
            numIterations=4,
            maxBins=1,
        )
# Imports hoisted above their first use.
from numpy import array

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest, RandomForestModel

# De-duplicate once and cache: the clustering input and the regression input
# are both derived from the same distinct rows.
distinct_rows = df_valid.distinct().cache()

# Clustering on (hotness, loudness).
kmeans_features = distinct_rows.map(
    lambda x: np.array([x.song_hotttnesss, x.loudness]))
clusters = KMeans.train(kmeans_features, 4, maxIterations=10, runs=10,
                        initializationMode="random")

# Regression: predict hotness from loudness.
regression_features = distinct_rows.map(
    lambda x: LabeledPoint(x.song_hotttnesss, [x.loudness]))

training_data, test_data = regression_features.randomSplit([0.8, 0.2])
model = RandomForest.trainRegressor(
    training_data,
    categoricalFeaturesInfo={},
    numTrees=3,
    featureSubsetStrategy="auto",
    impurity="variance",
    maxDepth=4,
    maxBins=32,
)
print(model.toDebugString())

# Prediction error on the held-out 20%.
predictions = model.predict(test_data.map(lambda x: x.features))
labelsAndPredictions = test_data.map(lambda lp: lp.label).zip(predictions)
# Index the (label, prediction) pair instead of using a tuple-parameter
# lambda, which is a syntax error on Python 3.
testMSE = labelsAndPredictions.map(
    lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(test_data.count())
print("Test Mean Squared Error = " + str(testMSE))
testvecData = testdata.map(parseVec)
# parsePoint is applied to every test row to build the labelled test set.
testparsedData = testdata.map(parsePoint)


def _squared_error(label_and_prediction):
    # Squared difference between a label and its prediction.
    label, prediction = label_and_prediction
    return (label - prediction) * (label - prediction)


# Train a RandomForest regression model.
#  - categoricalFeaturesInfo={} : every feature is continuous
#  - numTrees                   : number of trees in the forest
#  - featureSubsetStrategy      : "auto" lets the algorithm choose per tree
#  - impurity                   : variance, the regression criterion
#  - maxDepth                   : maximum depth of each tree
model1 = RandomForest.trainRegressor(
    trainparsedData,
    categoricalFeaturesInfo={},
    numTrees=1000,
    featureSubsetStrategy="auto",
    impurity='variance',
    maxDepth=13,
    maxBins=32,
)

# Training error: predict on the training features ...
trainpredictions = model1.predict(trainparsedData.map(lambda x: x.features))
# ... pair every label with its prediction ...
trainlabelsAndPredictions = trainparsedData.map(
    lambda lp: lp.label).zip(trainpredictions)
# ... and average the squared differences.
trainMSE1 = (trainlabelsAndPredictions.map(_squared_error).sum()
             / float(trainparsedData.count()))

# Column statistics of the training vectors, via the Statistics library.
summary = Statistics.colStats(trainvecData)
def _build_labeled_points(word_vectors, indexed_reviews):
    """Turn indexed reviews into an RDD of labelled points.

    ``word_vectors`` is an RDD of (word, vector) pairs and
    ``indexed_reviews`` an RDD of ((rating, review_text), review_index).
    Each review's feature vector is the average of the vectors of its words
    (as produced by ``myf``) that are present in ``word_vectors``.
    """
    ratings = indexed_reviews.map(lambda rec: (rec[1], rec[0][0]))
    words = (indexed_reviews
             .map(lambda rec: (rec[1], rec[0][1]))
             .flatMapValues(myf)
             .map(lambda kv: (kv[1], kv[0])))  # -> (word, review_index)
    # The join yields (word, (vector, review_index)); re-key by review and sum.
    summed = (word_vectors.join(words)
              .map(lambda joined: (joined[1][1], (joined[1][0], 1)))
              .reduceByKey(lambda a, b: (np.sum([a[0], b[0]], axis=0),
                                         a[1] + b[1])))
    averaged = summed.map(
        lambda kv: (kv[0], np.array(kv[1][0]) / float(kv[1][1])))
    return ratings.join(averaged).map(get_lp)


def main():
    """Train random forests on word2vec review features and save the test RMSE.

    Reviews dated before 2014 are the training set and reviews from 2014 the
    test set. One forest is trained per tree count in ``range(2, 10)`` and a
    line per forest is written to ``output``.
    """
    text = sc.textFile(inputs)
    nltk_data_path = "[change to your own nltk_data location]"  # maybe changed to the sfu server path
    nltk.data.path.append(nltk_data_path)

    cleaned_review = text.map(clean_reviewf).cache()
    reviews_txt = cleaned_review.map(lambda review: review['reviewText'])
    reviews = cleaned_review.map(
        lambda review: (review['overall'], review['reviewText'],
                        review['reviewTime'])).cache()

    # Records are (rating, review_text, review_date); split on the year and
    # keep ((rating, review_text), review_index).
    training_data = (reviews
                     .filter(lambda r: r[2].tm_year < 2014)
                     .map(lambda r: (r[0], r[1]))
                     .zipWithIndex().cache())
    testing_data = (reviews
                    .filter(lambda r: r[2].tm_year == 2014)
                    .map(lambda r: (r[0], r[1]))
                    .zipWithIndex().cache())

    word2vec_model = generate_word2vec_model(reviews_txt)
    mv = word2vec_model.getVectors()
    # this step seems redundant but necessary
    mvdct = [(k, [f for f in v]) for k, v in mv.items()]
    dct_rdd = sc.parallelize(mvdct)

    # Cache both labelled sets: they are reused by every iteration below, and
    # rebuilding them would re-run the joins for each forest.
    training_lps = _build_labeled_points(dct_rdd, training_data).cache()
    testing_lps = _build_labeled_points(dct_rdd, testing_data).cache()
    testing_count = float(testing_lps.count())  # loop-invariant

    results = []
    for trees_num in range(2, 10):
        rf_model = RandomForest.trainRegressor(training_lps,
                                               categoricalFeaturesInfo={},
                                               numTrees=trees_num,
                                               featureSubsetStrategy="auto",
                                               impurity='variance',
                                               maxDepth=10,
                                               maxBins=32)
        predictions = rf_model.predict(testing_lps.map(lambda x: x.features))
        labelsAndPredictions = testing_lps.map(lambda lp: lp.label).zip(
            predictions)
        MSE = labelsAndPredictions.map(
            lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / testing_count
        RMSE = math.sqrt(MSE)
        results.append('tree nums: ' + str(trees_num) + ', RMSE: ' + str(RMSE))

    outdata = sc.parallelize(results)
    outdata.saveAsTextFile(output)
def _to_labeled_point(tup):
    # tup is (key, (data_value, target_value)) as produced by the join below;
    # the label is the first target element and the features are the
    # comma-separated fields of the first data element.
    data_value, target_value = tup[1]
    return LabeledPoint(target_value[0], data_value[0].split(','))


def _squared_error(pair):
    label, prediction = pair
    return (label - prediction) * (label - prediction)


def _within_ten(pair):
    # 1 when the prediction is less than 10 away from the label, else 0.
    label, prediction = pair
    return 1 if (abs(label - prediction) < 10) else 0


target_data = keyed_data.join(keyed_target)
labled_point_data = target_data.map(_to_labeled_point)
#map(lambda line: line.split(",")).map(lambda line: tuple((feature for feature in line)))

# Split the data into training and test sets (30% held out for testing)
print("Creating Training and Test Data Split")
(trainingData, testData) = labled_point_data.randomSplit([0.7, 0.3])

# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Note: Use larger numTrees in practice.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
model = RandomForest.trainRegressor(
    trainingData,
    categoricalFeaturesInfo={},
    numTrees=5,
    featureSubsetStrategy="auto",
    impurity='variance',
    maxDepth=8,
    maxBins=32,
)

# Evaluate the model on the test instances and compute the test error.
test_features = testData.map(lambda x: x.features)
predictions = model.predict(test_features)
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)

testMSE = (labelsAndPredictions.map(_squared_error).sum()
           / float(testData.count()))
print('Test Mean Squared Error = ' + str(testMSE))

testAccuracy = (labelsAndPredictions.map(_within_ten).sum()
                / float(testData.count()))
print('Total Accuracy = ' + str(testAccuracy))
# print('Learned regression forest model:')
# print(model.toDebugString())
#
# Save and load model
def _build_labeled_points(word_vectors, indexed_reviews):
    """Turn indexed reviews into an RDD of labelled points.

    ``word_vectors`` is an RDD of (word, vector) pairs and
    ``indexed_reviews`` an RDD of ((rating, review_text), review_index).
    Each review's feature vector is the average of the vectors of its words
    (as produced by ``myf``) that are present in ``word_vectors``.
    """
    ratings = indexed_reviews.map(lambda rec: (rec[1], rec[0][0]))
    words = (indexed_reviews
             .map(lambda rec: (rec[1], rec[0][1]))
             .flatMapValues(myf)
             .map(lambda kv: (kv[1], kv[0])))  # -> (word, review_index)
    # The join yields (word, (vector, review_index)); re-key by review and sum.
    summed = (word_vectors.join(words)
              .map(lambda joined: (joined[1][1], (joined[1][0], 1)))
              .reduceByKey(lambda a, b: (np.sum([a[0], b[0]], axis=0),
                                         a[1] + b[1])))
    averaged = summed.map(
        lambda kv: (kv[0], np.array(kv[1][0]) / float(kv[1][1])))
    return ratings.join(averaged).map(get_lp)


def main():
    """Train random forests on word2vec review features and save the test RMSE.

    Reviews dated before 2014 are the training set and reviews from 2014 the
    test set. One forest is trained per tree count in ``range(2, 10)`` and a
    line per forest is written to ``output``.
    """
    text = sc.textFile(inputs)
    nltk_data_path = "[change to your own nltk_data location]"  # maybe changed to the sfu server path
    nltk.data.path.append(nltk_data_path)

    cleaned_review = text.map(clean_reviewf).cache()
    reviews_txt = cleaned_review.map(lambda review: review['reviewText'])
    reviews = cleaned_review.map(
        lambda review: (review['overall'], review['reviewText'],
                        review['reviewTime'])).cache()

    # Records are (rating, review_text, review_date); split on the year and
    # keep ((rating, review_text), review_index).
    training_data = (reviews
                     .filter(lambda r: r[2].tm_year < 2014)
                     .map(lambda r: (r[0], r[1]))
                     .zipWithIndex().cache())
    testing_data = (reviews
                    .filter(lambda r: r[2].tm_year == 2014)
                    .map(lambda r: (r[0], r[1]))
                    .zipWithIndex().cache())

    word2vec_model = generate_word2vec_model(reviews_txt)
    mv = word2vec_model.getVectors()
    # this step seems redundant but necessary
    mvdct = [(k, [f for f in v]) for k, v in mv.items()]
    dct_rdd = sc.parallelize(mvdct)

    # Cache both labelled sets: they are reused by every iteration below, and
    # rebuilding them would re-run the joins for each forest.
    training_lps = _build_labeled_points(dct_rdd, training_data).cache()
    testing_lps = _build_labeled_points(dct_rdd, testing_data).cache()
    testing_count = float(testing_lps.count())  # loop-invariant

    results = []
    for trees_num in range(2, 10):
        rf_model = RandomForest.trainRegressor(training_lps,
                                               categoricalFeaturesInfo={},
                                               numTrees=trees_num,
                                               featureSubsetStrategy="auto",
                                               impurity='variance',
                                               maxDepth=10,
                                               maxBins=32)
        predictions = rf_model.predict(testing_lps.map(lambda x: x.features))
        labelsAndPredictions = testing_lps.map(lambda lp: lp.label).zip(
            predictions)
        MSE = labelsAndPredictions.map(
            lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / testing_count
        RMSE = math.sqrt(MSE)
        results.append('tree nums: ' + str(trees_num) + ', RMSE: ' + str(RMSE))

    outdata = sc.parallelize(results)
    outdata.saveAsTextFile(output)
"Root Mean Squared Error (RMSE) for Decision Tree on test data = %g" % rmse) r2_dt = ecisionTree_model_evaluator = RegressionEvaluator( labelCol="MPG", predictionCol="prediction", metricName="r2") print("R Squared (R2) for Decision Tree on test data = %g" % r2_dt.evaluate(decisionTree_model_predictions)) ############################---RANDOM FOREST REGRESSION---################################## train_rdd_rf = train_df.rdd.map( lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1]))) test_rdd_rf = test_df.rdd.map( lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1]))) RandomForest_model = RandomForest.trainRegressor( train_rdd_rf, categoricalFeaturesInfo={}, numTrees=50, featureSubsetStrategy="auto", maxDepth=10, maxBins=100) predictions = RandomForest_model.predict( test_rdd_rf.map(lambda x: x.features)) labelsAndPredictions = test_rdd_rf.map(lambda lp: lp.label).zip( predictions) metrics = RegressionMetrics(labelsAndPredictions) print("RMSE of randomForest on Test data = %s" % metrics.rootMeanSquaredError) print("R-squared of randomForest on Test data = %s" % metrics.r2)
.join( avgTemperature ) \ .map( lambda row: [ item for sublist in row for item in sublist ] ) \ .map( lambda row: LabeledPoint( row[ 2 ][ 1 ], [ row[ 2 ][ 0 ], row[ 1 ], row[ 3 ] ] ) ) \ .cache( ); crimeCounts.unpersist( ); # Split the crime counts into training and test datasets ( training, test ) = joinedData.randomSplit( ( 0.7, 0.3 ) ); # Categorical features dictionary featuresInfo = { 0: len( beatsDict ), 1: 53 }; # Train a Random Forest model to predict crimes model = RandomForest.trainRegressor( training, categoricalFeaturesInfo = featuresInfo, numTrees = 5, featureSubsetStrategy = "auto", impurity = 'variance', maxDepth = 10, maxBins = len( beatsDict ) ); # Measure the model performance on test dataset predictions = model.predict( test.map( lambda x: x.features ) ) \ .cache( ); meanCrimes = test.map( lambda x: x.label ).mean( ); labelsAndPredictions = test.map( lambda x: x.label ).zip( predictions ); testMSE = labelsAndPredictions.map( lambda ( v, p ): ( v - p ) * ( v - p ) ).sum( ) / float( test.count( ) ); testSSE = labelsAndPredictions.map( lambda ( v, p ): ( v - p ) * ( v - p ) ).sum( ); testSST = labelsAndPredictions.map( lambda ( v, p ): ( v - meanCrimes ) * ( v - meanCrimes ) ).sum( ); Rsq = 1 - testSSE / testSST; #### Predicting crimes for next week ####
############# ############# ############# ############# #############
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, Tokenizer, HashingTF, VectorIndexer

#feature_cols = ["int_"+x for x in char_col_toUse_names] + num_col_toUse_names

# Index every string column into an "int_<name>" column, then assemble the
# indexed and numeric columns into a single "features" vector.
string_indexers = [
    StringIndexer(inputCol=x, outputCol="int_{0}".format(x))
    for x in char_col_toUse_names
]
assembler = VectorAssembler(
    inputCols=["int_" + x for x in char_col_toUse_names] + num_col_toUse_names,
    outputCol="features"
)
pipeline = Pipeline(stages=string_indexers + [assembler])
model = pipeline.fit(taxi_df)
indexed = model.transform(taxi_df)

# "Tool Days" (cast to int) is the regression label.
ml_df = indexed.select(
    col("Tool Days").cast("int").alias("label"), col("features")
).map(lambda row: LabeledPoint(row.label, row.features))
training, test = ml_df.randomSplit([0.8, 0.2], seed=0)

# NOTE(review): the training set is collected to the driver and
# re-parallelized before training; kept as-is in case it is a deliberate
# workaround, but it will not scale to large inputs.
rfm = RandomForest.trainRegressor(
    sc.parallelize(training.collect()),
    categoricalFeaturesInfo={0: 24, 1: 3, 2: 4, 3: 5, 4: 107},
    numTrees=10,
    featureSubsetStrategy="auto",
    impurity='variance',
    maxDepth=5,
    maxBins=120)

predictions = rfm.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda x: x.label).zip(predictions)


def _abs_pct_error(pair):
    # Pairs are (label, prediction): the relative error is taken against the
    # actual label, not against the prediction.
    actual, predicted = pair
    return abs(actual - predicted) / abs(actual)


# Mean absolute percentage error; rows whose label is 0 contribute nothing
# to the sum (the denominator is still the full test count, as before).
error = (1.0
         * labelsAndPredictions.filter(lambda pair: pair[0] != 0)
                               .map(_abs_pct_error)
                               .reduce(lambda a, b: a + b)
         / test.count())
error
#We have to turn it into a list of observations visual_training_data = [] for i in range(0, len(visual_training_outcome_array)): visual_training_data.append( (visual_training_outcome_array[i], visual_training_image_array[i])) visual_training_rdd = sc.parallelize(visual_training_data) visual_data_flattened = visual_training_rdd.map( lambda x: (x[0], averageBrightness4By4(x[1]))) visual_data_labeled_points = visual_data_flattened.map( lambda x: varsToLabeledPoint(x)) toprint = visual_data_labeled_points.take(1) print(str(toprint)) visual_model = RandomForest.trainRegressor(visual_data_labeled_points, categoricalFeaturesInfo={}, numTrees=1000, featureSubsetStrategy="auto", impurity='variance', maxDepth=5, maxBins=100) #visual_model = LinearRegressionWithSGD.train(visual_data_labeled_points, iterations=3,intercept=True) visual_training_vectors = visual_data_flattened.map( lambda x: featuresToVectors(x[1])) toprint = visual_training_vectors.take(1) print(str(toprint)) visual_in_sample_predictions = visual_model.predict( visual_training_vectors) visual_in_sample_labels_and_predictions = visual_data_labeled_points.map( lambda lp: lp.label).zip(visual_in_sample_predictions) visual_in_sample_labels_and_predictions.foreach(printline) squaresdf = visual_in_sample_labels_and_predictions.map(lambda p: (p[0], p[
import sys

from pyspark.context import SparkContext
from pyspark.mllib.util import MLUtils
from pyspark.mllib.tree import RandomForest, RandomForestModel

sc = SparkContext('yarn', 'weather_predictor')

# The LibSVM file name (relative to the HDFS user directory) is the first
# command-line argument.
data = MLUtils.loadLibSVMFile(sc, 'hdfs:///users/wfvining/' + sys.argv[1])
(train, test) = data.randomSplit([0.7, 0.3])

# NOTE(review): the original bounds were range(654, 615), which is empty and
# so declared no categorical features at all. They are assumed to be
# reversed; confirm that features 615..653 are the intended binary columns.
model = RandomForest.trainRegressor(
    train,  # was `trainData`, a name that is never defined
    categoricalFeaturesInfo={x: 2 for x in range(615, 654)},
    numTrees=10,
    featureSubsetStrategy='auto',
    maxDepth=5)

predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
# Pairs are (label, prediction).
testErr = labelsAndPredictions.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(test.count())
print('Mean Squared Error: ' + str(testErr))
def cross_validate(df_clean, games, n):
    """Evaluate four random-forest configurations on randomly chosen users.

    For each of ``n`` randomly chosen users, the games they own are looked up
    in ``df_clean``, labelled with the log of the playtime, and split 80/20.
    Four forests are trained and their test MSE recorded.

    :param df_clean: game DataFrame; must contain a ``name`` column and the
        ``genres``/``appID``/``cat``/``tags``/``type`` columns dropped below
    :param games: container of game names that may be used (tested with ``in``)
    :param n: number of users for CV
    :return: (dict of per-model MSE lists, dict of per-model mean MSE)
    """
    train_csv = '/media/sf_AdvancedML/Final/RF_train.csv'
    # (name, numTrees, maxDepth) for every forest that is compared.
    model_specs = [('model1', 70, 4), ('model2', 100, 4),
                   ('model3', 120, 4), ('model4', 100, 6)]
    missing = set()

    ### COLLECTING LIBRARIES ###
    gamesOwned = get_gamesOwned('/media/sf_AdvancedML/Final/userData.txt')
    print("Done collecting ownedGames.")

    ### VALIDATING ###
    all_accur = {name: [] for name, _, _ in model_specs}
    for _ in range(n):
        # initialize empty frame with appropriate columns
        df = pd.DataFrame(columns=list(df_clean.columns.values) + ['playtime'])

        # Randomly select user's library
        user = random.choice(list(gamesOwned.values()))
        libraries = list(user.values())

        # Connect playtime to game df for games owned
        if libraries:
            rows = []
            for k, v in libraries[0].items():
                if k in games:
                    # assign() returns a new frame, so df_clean is never
                    # written through a slice.
                    rows.append(df_clean.loc[df_clean['name'] == k]
                                .assign(playtime=np.log(v)))
                else:
                    missing.add(k)
            if rows:
                df = pd.concat([df] + rows)

        df = df.drop(['genres', 'name', 'appID', 'cat', 'tags', 'type'],
                     axis=1)

        # Pass User DF to Spark
        # NOTE(review): to_csv also writes the frame index as the first
        # column, so it ends up among the features — confirm this is intended.
        df.to_csv(train_csv)
        data = sc.textFile(train_csv)
        header = data.first()
        data = (data.filter(lambda x: x != header)
                    .map(convertUni)
                    .map(lambda line: line.split(',')))
        # RDD of (label, features) pairs. The label is the last field and is
        # excluded from the features; the original slice, line[:len(line)],
        # leaked it into them.
        data = data.map(lambda line: LabeledPoint(line[-1], line[:-1]))

        # Split into training, test
        (trainingData, testData) = data.randomSplit([0.8, 0.2])

        try:
            # Train every model first, so a training failure records nothing
            # for this user.
            models = [
                (name, RandomForest.trainRegressor(
                    trainingData,
                    categoricalFeaturesInfo={},
                    numTrees=num_trees,
                    featureSubsetStrategy="auto",
                    impurity='variance',
                    maxDepth=max_depth))
                for name, num_trees, max_depth in model_specs
            ]
            test_count = float(testData.count())
            for name, m in models:
                # Evaluate on test data, compute error
                predictions = m.predict(testData.map(lambda x: x.features))
                labelsAndPredictions = testData.map(
                    lambda lp: lp.label).zip(predictions)
                testMSE = labelsAndPredictions.map(
                    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])
                ).sum() / test_count
                all_accur[name].append(testMSE)
        except Exception as exc:
            # Best effort: a user whose data cannot be trained or evaluated
            # is skipped, but the reason is reported rather than swallowed.
            print("Skipping user, training/evaluation failed: %s" % exc)

    avgDict = {k: np.mean(v) for k, v in all_accur.items()}
    return all_accur, avgDict
sc = spark.sparkContext

# Load the LibSVM file as an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'data/diamonds_price.data')

# 70% of the rows train the model, 30% are held out; the split is seeded.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=123)

# Train a RandomForest model.
#  An empty categoricalFeaturesInfo means every feature is continuous.
#  featureSubsetStrategy="auto" lets the algorithm choose.
model = RandomForest.trainRegressor(
    trainingData,
    categoricalFeaturesInfo={},
    numTrees=25,
    featureSubsetStrategy="auto",
    impurity='variance',
    maxDepth=20,
    maxBins=32,
    seed=123,
)

# Evaluate the model on the held-out rows: root of the mean squared error.
test_features = testData.map(lambda x: x.features)
predictions = model.predict(test_features)
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
squared_error_sum = labelsAndPredictions.map(
    lambda lp: (lp[0] - lp[1])**2).sum()
testRMSE = math.sqrt(squared_error_sum / float(testData.count()))

# Every test point paired with its prediction, collected to the driver.
result = testData.zip(predictions).collect()
# Print the predictions to output file
inputCols=["tf_idf", "gilded", "distinguished", "controversiality"], outputCol="features") mergedDF = assembler.transform(mergedDF) mergedDF.show() scoreFeaturesPair = mergedDF.map(lambda x: (x[7], x[0])).repartition(500) features = scoreFeaturesPair.map(lambda x: x[0]) scores = scoreFeaturesPair.map(lambda x: int(x[1])) zipped_data = ( scores.zip(features).map(lambda x: LabeledPoint(x[0], x[1])).cache()) # Do a random split so we can test our model on non-trained data training, test = zipped_data.randomSplit([0.7, 0.3]) # Train our model model = RandomForest.trainRegressor(training, {1048577: 4, 1048578: 2}, 10) #model = LinearRegressionWithSGD.train(training) # Use our model to predict train_preds = (training.map(lambda x: x.label).zip( model.predict(training.map(lambda x: x.features)))) test_preds = (test.map(lambda x: x.label).zip( model.predict(test.map(lambda x: x.features)))) # Ask PySpark for some metrics on how our model predictions performed trained_metrics = RegressionMetrics( train_preds.map(lambda x: (float(x[1]), x[0]))) test_metrics = RegressionMetrics(test_preds.map(lambda x: (float(x[1]), x[0]))) with open('reSampleResult2.txt', 'w+') as f: f.write(str(trained_metrics.explainedVariance) + '\n')
filtered_car_data = car_data.map( lambda d: [toInteger(d["prc"]), toAge(d["fr"]), toFuel(d["fl"]), toInteger(d["ma"]), d["pk"], d["po"], d["ei"]] ) filtered_car_data.first() labeled_car_data = filtered_car_data.map(lambda row: LabeledPoint(row[0], row[1:])) labeled_car_data.first() labeled_car_data.collect() """ (3) Run the Random Forest. """ model = RandomForest.trainRegressor( labeled_car_data, numTrees=750, categoricalFeaturesInfo={}, impurity="variance", maxDepth=5, maxBins=32 ) predictions = model.predict(labeled_car_data.map(lambda x: x.features)) labelsAndPredictions = labeled_car_data.map(lambda lp: [lp.label, lp.features]).zip(predictions) labelsAndPredictions.first() model_error = labelsAndPredictions.map(lambda row: (row[1] - row[0][0], row)) """ (4) Get the extremes! Best & Worst deal.
# Feature columns, selected by position from every input row.
features = rdd.map(lambda t: (t[0], t[1], t[2], t[5], t[6], t[9], t[10],
                              t[11], t[12], t[15], t[16]))

# Standardize the features. The fitted scaler gets its own name instead of
# sharing `model` with the forest trained below.
standardizer = StandardScaler()
scaler_model = standardizer.fit(features)
features_transform = scaler_model.transform(features)

#select value we want to predict
#lab = rdd.map(lambda row: row[8])#time
lab = rdd.map(lambda row: row[7])#fare
transformedData = lab.zip(features_transform)
# NOTE(review): the feature vector is wrapped in a one-element list here;
# kept as-is — confirm that LabeledPoint receives the intended shape.
transformedData = transformedData.map(
    lambda row: LabeledPoint(row[0], [row[1]]))

#split into training and testing datasets
trainingData, testingData = transformedData.randomSplit([0.9, 0.1], seed=1234)

#do the training and get predictions
model = RandomForest.trainRegressor(trainingData,
                                    categoricalFeaturesInfo={},
                                    impurity='variance',
                                    numTrees=25,
                                    seed=42,
                                    maxDepth=8)
predictions = model.predict(testingData.map(lambda x: x.features))
valuesAndPreds = testingData.map(lambda lp: lp.label).zip(predictions)

# Relative fare error in percent, over rows with a positive true fare.
results = valuesAndPreds.toDF().toPandas()
results.columns = ['truth', 'pred']
results = results[results['truth'] > 0]
truth = np.array(results["truth"].tolist())
pred = np.array(results["pred"].tolist())
diff_fare = 100 * (truth - pred) / truth
print('mean = ' + str(diff_fare.mean()))

#R-squared
# RegressionMetrics expects (prediction, observation) pairs; valuesAndPreds
# holds (label, prediction), and r2 is not symmetric, so swap them first.
metrics = RegressionMetrics(valuesAndPreds.map(lambda vp: (vp[1], vp[0])))
print("R-squared = %s" % metrics.r2)
from __future__ import print_function

from pyspark import SparkContext
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

if __name__ == "__main__":
    sc = SparkContext(appName="PythonRandomForestRegressionExample")

    # Load and parse the data file into an RDD of LabeledPoint.
    # (The earlier load of 'data/mllib/sample_libsvm_data.txt' was removed:
    # its result was overwritten immediately and never used.)
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/newborn2013.txt')
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Features 0, 1 and 2 are categorical with 3, 4 and 2 categories.
    model = RandomForest.trainRegressor(
        trainingData,
        categoricalFeaturesInfo={0: 3, 1: 4, 2: 2},
        numTrees=4,
        featureSubsetStrategy="auto",
        impurity='variance',
        maxDepth=4,
        maxBins=12)

    # Evaluate the model on the test instances and compute the test error.
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # Pairs are (label, prediction); indexing replaces the tuple-parameter
    # lambda, which is a syntax error on Python 3.
    testMSE = labelsAndPredictions.map(
        lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / \
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression forest model:')
    print(model.toDebugString())

    # Save the model and load it back.
    model.save(sc, "target/tmp/myRandomForestRegressionModel")
    sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestRegressionModel")
# Dictionary mapping each beat to an index. Useful when converting to LabeledPoint. Otherwise converts to numeric.
beatsDict = dict(beatList.zipWithIndex().map(lambda x: (x[0], x[1])).collect())

# Data points as LabeledPoints
# (crime count, [week index, beat index, x[1][2]])
predArrayLP = joinedData.map(
    lambda x: LabeledPoint(x[0], [weekDict[x[1][0]], beatsDict[x[1][1]], x[1][2]]))

# Split into training and testing set. 70-30 split.
(train, test) = predArrayLP.randomSplit([0.7, 0.3])

# Feature categories. Feature 0 is looked up in weekDict and feature 1 in
# beatsDict, so the arities must follow that order; the original mapping
# ({0: len(beatsDict), 1: 53}) assigned the beat count to the week feature.
# NOTE(review): assumes weekDict maps every week to an index below
# len(weekDict) — confirm against where weekDict is built.
featuresCat = {0: len(weekDict), 1: len(beatsDict)}
maxBins = max(len(beatsDict), len(weekDict))

model = RandomForest.trainRegressor(train,
                                    categoricalFeaturesInfo=featuresCat,
                                    numTrees=10,
                                    featureSubsetStrategy="auto",
                                    impurity='variance',
                                    maxDepth=5,
                                    maxBins=maxBins)

# Evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
#rschoolCountBeats = schoolCount.map(lambda x: x[0])
predOutput = predictions.collect()
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
# Pairs are (label, prediction).
testMSE = labelsAndPredictions.map(
    lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / float(test.count())
print('Test Mean Squared Error = ' + str(testMSE))

### Write output to file ###
# Binary mode is the csv convention on Python 2, which this script targets.
with open("predictions.txt", 'wb') as f:
    writer = csv.writer(f)
    # writerows needs one iterable per row; predOutput is a flat list of
    # numbers, so each prediction is wrapped in its own single-column row.
    writer.writerows([p] for p in predOutput)
def run(jobNm, sc, sqlContext, inputFile, lPolygon, dictFile,
        nDataType=0, inputPartitions=-1, sNum=30, modelSavePath=None,
        bWriteMonitor=False, writeFileOutput=True, strStop=''):
    """Train a random forest on in-region records and score out-of-region ones.

    Records are loaded with ``aggregatedComparison.loadPoint`` and split with
    the ``inRegionOfInterest`` SQL function into those inside the target
    polygons (label 1.0) and those outside (label -1.0).  A forest regressor
    is trained on the in-region records plus a random sample of the
    out-of-region ones, applied to the remaining out-of-region records, and
    the scored records are handed to ``clustering.locationBasedOutputV2``.

    Args:
        jobNm: Job name; used for the monitor plot, as the model
            sub-directory and passed on to the output step.
        sc: Spark context (used for broadcast, text-file reads, model save).
        sqlContext: SQL context used to register the UDF and run the queries.
        inputFile: Input location handed to ``aggregatedComparison.loadPoint``.
        lPolygon: Target polygons; broadcast and passed to ``fspLib.inROI``.
        dictFile: Tab-separated dictionary file.  Paths starting with ``s3:``
            or ``hdfs:`` are read through Spark, anything else from local
            disk.  Column 0 goes into the reverse lookup; column 1 is kept
            for every row whose column 0 is a stop term.
        nDataType: Not used by this function.
        inputPartitions: Passed to ``aggregatedComparison.loadPoint``.
        sNum: Passed to ``clustering.locationBasedOutputV2``.
        modelSavePath: When not None, the model is saved under
            ``modelSavePath/jobNm``.
        bWriteMonitor: When true, import ``plotting`` and refresh the monitor
            plot after every stage.
        writeFileOutput: Passed to ``clustering.locationBasedOutputV2``.
        strStop: Comma-separated stop terms; empty string means none.

    Returns:
        Whatever ``clustering.locationBasedOutputV2`` returns.
    """
    if bWriteMonitor:
        import plotting

    def _emit(*parts):
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the print function: the parts are
        # joined by single spaces, exactly like ``print a, b``.
        print(" ".join("%s" % (part,) for part in parts))

    def _record_stage(stage, seconds):
        # Store one stage's wall time and redraw the monitor plot.
        if bWriteMonitor:
            mPY[stage] = seconds
            plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    bc_lTargetPolygons = sc.broadcast(lPolygon)
    stopSet = set(strStop.split(',')) if strStop != '' else set()

    # Monitoring plot vectors: one slot per stage label below.
    mPX = range(7)
    mPY = [0.] * 7
    mSL = [
        "Initial Read", "Calculate IDF", "Partition for M.L.",
        "Create Training Vector", "Train Model", "Apply Model",
        "Prepare Output Data"
    ]
    t0 = time.time()

    # Stage 0: read in data and filter out entries with no valid words.
    t1 = time.time()
    _emit('inputFile ', inputFile)
    _emit('inputPartitions ', inputPartitions)
    records = aggregatedComparison.loadPoint(sc, sqlContext, inputFile,
                                             inputPartitions)
    nGoodTweets = records.count()
    diff = time.time() - t1
    _emit("Number of good tweets:", nGoodTweets)
    _emit("Time to read in and filter nonscorable words", diff)
    _record_stage(0, diff)

    # Stage 1: read the dictionary file (reverse lookup and stop entries).
    t1 = time.time()
    revLookup = []
    lStop = []
    if dictFile[:3] == 's3:' or dictFile[:5] == 'hdfs:':
        # read dict file from hdfs
        dictLines = sc.textFile(dictFile).collect()
    else:
        # read from local file; the handle is closed as soon as it is read
        with open(dictFile, "r") as fDict:
            dictLines = fDict.readlines()
    for line in dictLines:
        # Local lines still carry their line terminator while Spark's
        # textFile lines do not; strip it so both sources give the same
        # column values.
        terms = line.rstrip("\r\n").split("\t")
        revLookup.append(terms[0])
        if terms[0] in stopSet:
            lStop.append(terms[1])
    nVecLen = len(revLookup)
    diff = time.time() - t1
    _emit("Time to read dict:", diff)
    _record_stage(1, diff)

    # Stage 2: split data into training and apply samples.
    # Training data is 2 parts: inside the r.o.i., and a sample of the areas
    # outside the r.o.i.
    t1 = time.time()
    sqlContext.registerFunction(
        "inRegionOfInterest",
        lambda lat, lon: fspLib.inROI(lat, lon, bc_lTargetPolygons),
        returnType=BooleanType())
    df1 = sqlContext.sql(
        "SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon)"
    ).cache()
    df1.registerTempTable("df1")
    nIn = df1.count()
    dfn1 = sqlContext.sql(
        "SELECT * from records WHERE NOT inRegionOfInterest(records.lat,records.lon)"
    ).cache()
    dfn1.registerTempTable("dfn1")
    nOut = dfn1.count()
    modelDict = aggregatedComparison.exemplarDict(df1, revLookup)
    diff = time.time() - t1
    _emit("Time to find in and out of ROI", diff)
    _emit("N in:", nIn, ", N out:", nOut)
    _record_stage(2, diff)

    # Stage 3: create training vectors from in-region data and a sample of
    # the out-of-region data.
    t1 = time.time()
    groupedIn = df1.map(lambda x: (x.key, [
        LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize
    ])).cache()
    groupedOut = dfn1.map(lambda x: (x.key, [
        LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize
    ])).cache()
    # Aim for ten out-of-region training records per in-region record.  The
    # fraction is capped at 1 so the complementary split weight below can
    # never go negative when the out-of-region set is small.
    scaleFactor = min(1.0, (10. * nIn) / float(nOut))
    (mlApply, groupedUse) = groupedOut.randomSplit(
        [1 - scaleFactor, scaleFactor])
    mlTrain = groupedIn.union(groupedUse).cache()
    if len(lStop) != 0:
        mlTrain = mlTrain.map(
            lambda x: aggregatedComparison.removeStopWords(x, lStop))
    nTotTrain = mlTrain.count()
    mlApply.cache()
    # Run an action so the apply set is computed (and cached) inside this
    # stage; the count itself is not needed.
    mlApply.count()
    diff = time.time() - t1
    _emit(nTotTrain, "entries for training")
    _emit("Time to get data ready for model by time", diff)
    _record_stage(3, diff)

    # Stage 4: train model.
    t1 = time.time()
    model_Tree = RandomForest.trainRegressor(
        mlTrain.map(lambda x: x[1][0]),
        categoricalFeaturesInfo={},
        numTrees=100,
        featureSubsetStrategy="auto",
        impurity="variance",
        maxDepth=4,
        maxBins=32)
    if modelSavePath is not None:
        if modelSavePath[-1] != "/":
            modelSavePath = modelSavePath + "/"
        model_Tree.save(sc, modelSavePath + jobNm)
    diff = time.time() - t1
    _emit("Time to train model", diff)
    _record_stage(4, diff)

    # Stage 5: apply model.
    t1 = time.time()
    predictions_Tree = model_Tree.predict(
        mlApply.map(lambda x: x[1][0].features))
    vecAndPredictions = mlApply.zip(predictions_Tree)
    vecAndPredictions.cache()
    vecAndPredictions.count()
    diff = time.time() - t1
    _emit("Time to apply model: ", diff)
    _record_stage(5, diff)

    # Stage 6: get the results.
    t1 = time.time()
    resultSet = clustering.locationBasedOutputV2(False, jobNm,
                                                 vecAndPredictions, sNum,
                                                 revLookup, writeFileOutput,
                                                 modelDict)
    diff = time.time() - t1
    _emit("Time to create json objects for output: ", diff)
    _record_stage(6, diff)

    diff = time.time() - t0
    _emit("<----------BOOM GOES THE DYNOMITE!---------->")
    _emit("< total number of tweets:,", nGoodTweets)
    _emit("< total process Time:", diff)
    _emit("< total idf vector length:", nVecLen)
    _emit("<------------------------------------------->")
    return resultSet
def randomForestRegression(trainingData, testData, trainingSize, testSize):
    """Grid-search random-forest regression settings and print the RMSEs.

    Every (maxDepth, maxBins, numTrees) combination from the lists below is
    trained on ``trainingData`` and scored by its RMSE on that same data.
    The combination with the lowest training RMSE is then trained again and
    its RMSE on the training and test data is printed.  Nothing is returned.

    Args:
        trainingData: RDD whose elements expose ``.features`` and ``.label``.
        testData: RDD of the same element type, used for the final score.
        trainingSize: Number of training records (mean-squared-error divisor).
        testSize: Number of test records (mean-squared-error divisor).
    """

    def _rmse(model, data, size):
        # Root mean squared error of `model` on `data`, which holds `size`
        # records.
        predictions = model.predict(data.map(lambda x: x.features))
        valsAndPreds = data.map(lambda x: x.label).zip(predictions)
        # Pairs are indexed rather than unpacked in the lambda signature:
        # tuple parameters are Python-2-only syntax.
        squaredErrorSum = valsAndPreds.map(
            lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y)
        # float() keeps this a true division whatever the operand types are.
        return math.sqrt(squaredErrorSum / float(size))

    def _train(maxDepthVal, maxBinsVal, numTreesVal):
        # Train one forest with the given hyper-parameters.
        return RandomForest.trainRegressor(trainingData,
                                           categoricalFeaturesInfo={},
                                           numTrees=numTreesVal,
                                           featureSubsetStrategy="auto",
                                           impurity='variance',
                                           maxDepth=maxDepthVal,
                                           maxBins=maxBinsVal)

    # parameter range
    maxDepthValList = [30]
    maxBinsValList = [16, 24, 32]
    numTreesValList = [10, 20]

    # best parameters (these defaults are used when no candidate qualifies)
    bestMaxDepthVal = 10
    bestMaxBinsVal = 16
    bestNumTreesVal = 10
    bestTrainingRMSE = 1e10

    for maxDepthVal, maxBinsVal, numTreesVal in itertools.product(
            maxDepthValList, maxBinsValList, numTreesValList):
        model = _train(maxDepthVal, maxBinsVal, numTreesVal)
        trainingRMSE = _rmse(model, trainingData, trainingSize)
        # Compare directly: the former truthiness guard skipped a candidate
        # whose RMSE was exactly 0.0, i.e. the best possible result.  A NaN
        # still never wins because every comparison with NaN is false.
        if trainingRMSE < bestTrainingRMSE:
            bestMaxDepthVal = maxDepthVal
            bestMaxBinsVal = maxBinsVal
            bestNumTreesVal = numTreesVal
            bestTrainingRMSE = trainingRMSE
        print("%s %s %s %s" % (maxDepthVal, maxBinsVal, numTreesVal,
                               trainingRMSE))
    print("%s %s %s %s" % (bestMaxDepthVal, bestMaxBinsVal, bestNumTreesVal,
                           bestTrainingRMSE))

    model = _train(bestMaxDepthVal, bestMaxBinsVal, bestNumTreesVal)

    # evaluating the model on training data
    trainingRMSE = _rmse(model, trainingData, trainingSize)
    print(trainingRMSE)

    # evaluating the model on test data
    testRMSE = _rmse(model, testData, testSize)
    print(testRMSE)
def run(jobNm, sc, sqlContext, inputFile, lPolygon, dictFile,
        nDataType=0,
        inputPartitions=-1,
        sNum=30,
        modelSavePath=None,
        bWriteMonitor=False,
        writeFileOutput=True,
        strStop=''):
    """Train a random forest on in-event records and score out-of-region ones.

    Records are loaded with ``aggregatedComparison.loadPoint``.  Records that
    pass both the ``inRegionOfInterest`` and ``inEventOfInterest`` SQL
    functions get label 1.0; records outside the region get label -1.0.  A
    forest regressor is trained on the positive records plus a random sample
    of the out-of-region ones, applied to the remaining out-of-region
    records, and the scored records are handed to
    ``clustering.locationBasedOutputV2``.

    Args:
        jobNm: Job name; used for the monitor plot, as the model
            sub-directory and passed on to the output step.
        sc: Spark context (used for broadcast, text-file reads, model save).
        sqlContext: SQL context used to register the UDFs and run the queries.
        inputFile: Input location handed to ``aggregatedComparison.loadPoint``.
        lPolygon: Target polygons; broadcast and passed to the ``fspLib``
            region/event predicates.
        dictFile: Tab-separated dictionary file.  Paths starting with ``s3:``
            or ``hdfs:`` are read through Spark, anything else from local
            disk.  Column 0 goes into the reverse lookup; column 1 is kept
            for every row whose column 0 is a stop term.
        nDataType: Not used by this function.
        inputPartitions: Passed to ``aggregatedComparison.loadPoint``.
        sNum: Passed to ``clustering.locationBasedOutputV2``.
        modelSavePath: When not None, the model is saved under
            ``modelSavePath/jobNm``.
        bWriteMonitor: When true, import ``plotting`` and refresh the monitor
            plot after every stage.
        writeFileOutput: Passed to ``clustering.locationBasedOutputV2``.
        strStop: Comma-separated stop terms; empty string means none.

    Returns:
        Whatever ``clustering.locationBasedOutputV2`` returns.
    """
    if bWriteMonitor:
        import plotting

    def _emit(*parts):
        # A single parenthesised argument prints identically under the
        # Python 2 print statement and the print function: the parts are
        # joined by single spaces, exactly like ``print a, b``.
        print(" ".join("%s" % (part,) for part in parts))

    def _record_stage(stage, seconds):
        # Store one stage's wall time and redraw the monitor plot.
        if bWriteMonitor:
            mPY[stage] = seconds
            plotting.updateMonitorPlot(mPX, mPY, mSL, jobNm)

    bc_lTargetPolygons = sc.broadcast(lPolygon)
    stopSet = set(strStop.split(',')) if strStop != '' else set()

    # Monitoring plot vectors: one slot per stage label below.
    mPX = range(7)
    mPY = [0.] * 7
    mSL = ["Initial Read", "Calculate IDF", "Partition for M.L.",
           "Create Training Vector", "Train Model", "Apply Model",
           "Prepare Output Data"]
    t0 = time.time()

    # Stage 0: read in data and filter out entries with no valid words.
    t1 = time.time()
    records = aggregatedComparison.loadPoint(sc, sqlContext, inputFile,
                                             inputPartitions)
    nGoodTweets = records.count()
    diff = time.time() - t1
    _emit("Number of good points:", nGoodTweets)
    _emit("Time to read in and filter nonscorable words", diff)
    _record_stage(0, diff)

    # Stage 1: read the dictionary file (reverse lookup and stop entries).
    t1 = time.time()
    revLookup = []
    lStop = []
    if dictFile[:3] == 's3:' or dictFile[:5] == 'hdfs:':
        # read dict file from hdfs
        dictLines = sc.textFile(dictFile).collect()
    else:
        # read from local file; the handle is closed as soon as it is read
        with open(dictFile, "r") as fDict:
            dictLines = fDict.readlines()
    for line in dictLines:
        # Local lines still carry their line terminator while Spark's
        # textFile lines do not; strip it so both sources give the same
        # column values.
        terms = line.rstrip("\r\n").split("\t")
        revLookup.append(terms[0])
        if terms[0] in stopSet:
            lStop.append(terms[1])
    nVecLen = len(revLookup)
    diff = time.time() - t1
    _emit("Time to read dict: ", diff)
    _record_stage(1, diff)

    # Stage 2: split data into training and apply samples.
    #   - in the region and in the time window: positive training records
    #   - out of region: negative training sample plus the data the model is
    #     applied to
    t1 = time.time()
    sqlContext.registerFunction(
        "inRegionOfInterest",
        lambda lat, lon: fspLib.inROI(lat, lon, bc_lTargetPolygons),
        returnType=BooleanType())
    sqlContext.registerFunction(
        "inEventOfInterest",
        lambda lat, lon, date: fspLib.inEOI(lat, lon, date,
                                            bc_lTargetPolygons),
        returnType=BooleanType())
    sqlContext.registerFunction(
        "outOfEventOfInterest",
        lambda lat, lon, dt: fspLib.outEOI(lat, lon, dt, bc_lTargetPolygons),
        returnType=BooleanType())
    df1 = sqlContext.sql(
        "SELECT * from records WHERE inRegionOfInterest(records.lat,records.lon)"
    ).cache()
    df1.registerTempTable("df1")
    df1_inTime = sqlContext.sql(
        "SELECT * from df1 WHERE inEventOfInterest(df1.lat,df1.lon,df1.dt)"
    ).cache()
    dfn1 = sqlContext.sql(
        "SELECT * from records WHERE NOT inRegionOfInterest(records.lat,records.lon)"
    )
    df1_inTime.registerTempTable("df1_inTime")
    exempDict = aggregatedComparison.exemplarDict(df1_inTime, revLookup)
    diff = time.time() - t1
    _emit("Time to partition by time", diff)
    _record_stage(2, diff)

    # Stage 3: create training vectors.
    t1 = time.time()
    groupedIn = df1_inTime.map(lambda x: (x.key, [
        LabeledPoint(1.0, x.vector), x.lat, x.lon, x.size, x.binSize
    ])).cache()
    groupedOut = dfn1.map(lambda x: (x.key, [
        LabeledPoint(-1.0, x.vector), x.lat, x.lon, x.size, x.binSize, x.dt
    ])).cache()
    nSignal = float(groupedIn.count())
    nBack = float(groupedOut.count())
    # Aim for ten background training records per signal record.  The
    # fraction is capped at 1 so the complementary split weight below can
    # never go negative when the background set is small.
    scaleFactor = min(1.0, 10. * nSignal / nBack)
    (mlApply, groupedUse) = groupedOut.randomSplit(
        [1 - scaleFactor, scaleFactor])
    mlApply.cache()
    mlTrain = groupedIn.union(groupedUse).cache()
    if len(lStop) != 0:
        mlTrain = mlTrain.map(
            lambda x: aggregatedComparison.removeStopWords(x, lStop))
    mlTrain.cache()
    nTotTrain = mlTrain.count()
    diff = time.time() - t1
    _emit(nTotTrain, "entries for training")
    _emit("Time to get data ready for model by time", diff)
    _record_stage(3, diff)

    # Stage 4: train model.
    t1 = time.time()
    model_Tree = RandomForest.trainRegressor(
        mlTrain.map(lambda x: x[1][0]),
        categoricalFeaturesInfo={},
        numTrees=2000,
        featureSubsetStrategy="auto",
        impurity="variance",
        maxDepth=4,
        maxBins=32)
    if modelSavePath is not None:
        if modelSavePath[-1] != "/":
            modelSavePath = modelSavePath + "/"
        model_Tree.save(sc, modelSavePath + jobNm)
    diff = time.time() - t1
    _emit("Time to train model", diff)
    _record_stage(4, diff)

    # Stage 5: apply model to out of region data.
    t1 = time.time()
    predictions_Tree = model_Tree.predict(
        mlApply.map(lambda x: x[1][0].features))
    vecAndPredictions = mlApply.zip(predictions_Tree)
    vecAndPredictions.cache()
    vecAndPredictions.count()
    diff = time.time() - t1
    _emit("Time aggregate and label points: ", diff)
    _record_stage(5, diff)

    # Stage 6: get the results.
    t1 = time.time()
    resultSet = clustering.locationBasedOutputV2(True, jobNm,
                                                 vecAndPredictions, sNum,
                                                 revLookup, writeFileOutput,
                                                 exempDict)
    diff = time.time() - t1
    _emit("Time to create json objects for output: ", diff)
    _record_stage(6, diff)

    diff = time.time() - t0
    _emit("<----------BOOM GOES THE DYNOMITE!---------->")
    _emit("< total number of tweets:,", nGoodTweets)
    _emit("< total process Time:", diff)
    _emit("< total idf vector length:", nVecLen)
    _emit("<------------------------------------------->")
    return resultSet
# Build one LabeledPoint per (weekday, row[0][0] value, temperature) group:
#   1. re-key each crime-count row by row[0][1] (the value that .weekday() is
#      later called on) so it can be joined with `temperature`;
#   2. after the join, key by (weekday, row[0][0] value, joined temperature
#      value) and sum the counts per key;
#   3. the summed count becomes the label, the key becomes the features.
# NOTE(review): feature 1 is presumably a PCT index, given the arity
# len(PCTsDict) declared for it below -- confirm against the caller.
joinedData = (
    allCrimeCounts
    .map(lambda row: ((row[0][1]), (row[0][0], row[1])))
    .join(temperature)
    .map(lambda row: ((row[0].weekday(), row[1][0][0], row[1][1]),
                      row[1][0][1]))
    .reduceByKey(lambda x, y: x + y)
    .map(lambda row: LabeledPoint(row[1],
                                  [row[0][0], row[0][1], row[0][2]]))
)
# Peek at two records.  take() is used instead of top(): top() has to order
# the elements, and LabeledPoint defines no ordering.
print(joinedData.take(2))

# Split the crime counts into training and test datasets
(training, test) = joinedData.randomSplit((0.9, 0.1))

# Train a Random Forest model to predict crimes
numWeekdays = 7
numPCTs = len(PCTsDict)
model = RandomForest.trainRegressor(
    training,
    categoricalFeaturesInfo={0: numWeekdays, 1: numPCTs},
    numTrees=7,
    featureSubsetStrategy="auto",
    impurity='variance',
    maxDepth=10,
    # maxBins must cover the largest categorical arity, which is the weekday
    # feature whenever there are fewer than 7 PCTs.
    maxBins=max(numWeekdays, numPCTs))

#### Predicting crimes for a day####
PCTsDictInverse = dict((v, k) for k, v in PCTsDict.items())
data = []
for weekday in range(numWeekdays):
    for tempForecast in range(10, 100, 5):
        # Test dataset for each beat with next week's info
        predictday = sc.parallelize(
            [(weekday, PCT, tempForecast) for PCT in range(numPCTs)])
from __future__ import print_function

# $example on$
from pyspark import SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils


if __name__ == "__main__":
    sc = SparkContext(appName="PythonRandomForestRegxample")
    try:
        # Load the LibSVM-format data set as an RDD of LabeledPoint.
        data = MLUtils.loadLibSVMFile(
            sc, "file:///home/yl408/yuhao_datasets/phishing")

        # Train on the full data set; no feature is declared categorical.
        model = RandomForest.trainRegressor(data,
                                            categoricalFeaturesInfo={},
                                            numTrees=3,
                                            featureSubsetStrategy="auto",
                                            impurity='variance',
                                            maxDepth=4,
                                            maxBins=32)
        model.save(sc, "file:///home/yl408/spark-ml/myrandomForestModel")
    finally:
        # Shut the context down even when loading, training or saving fails.
        sc.stop()
# Get file paths from arguments
if len(sys.argv) != 4:
    # sys.exit() with a string writes it to stderr and exits with status 1,
    # so a wrong invocation no longer looks like a successful run.
    sys.exit("Usage: random_forest.py FEATURES_FILE MODEL_FOLDER DISTRICTS_FILE")
features_file, model_folder, districts_file = sys.argv[1:]

spark_context, sql_context = create_spark_application("train_random_forest")
data_loader = DataLoader(spark_context, sql_context, features_file)
# For the random forest, scaling and one-hot encoding are disabled because
# they are a better fit for linear regression models.
data_loader.initialize(do_scaling=False, do_onehot=False)

# In the case of decision trees categorical features make more sense.
# maxBins starts at 32 and is raised to the largest categorical arity when
# that is larger.
maxBins = 32
categorical_features_info = data_loader.get_categorical_features_info()
if categorical_features_info:
    maxBins = max(maxBins, max(categorical_features_info.values()))

# train and store a model for each district in the districts file
for lat, lon in read_districts_file(districts_file):
    print("Training District: %f, %f" % (lat, lon))
    start = time.time()
    model = RandomForest.trainRegressor(
        data_loader.get_train_data((lat, lon)),
        categoricalFeaturesInfo=categorical_features_info,
        numTrees=5,
        maxDepth=15,
        maxBins=maxBins)
    # save the model in the specified model_folder
    model.save(spark_context,
               '%s/model_%s_%s' % (model_folder, str(lat), str(lon)))
    print("Done training district. Took %f s." % (time.time() - start))