def randomForestRegression(df, arguments):
    """Fit a Spark ML random forest regressor on ``df``.

    Defaults are maxDepth=5, minInstancesPerNode=1, numTrees=20 and
    impurity="variance". Any attribute of ``arguments`` with one of those
    names that is not ``None`` overrides the matching default.

    Args:
        df: Training DataFrame, handed unchanged to ``fit``.
        arguments: Object exposing ``maxDepth``, ``minInstancesPerNode``,
            ``numTrees`` and ``impurity`` attributes (presumably an argparse
            namespace -- confirm against the caller).

    Returns:
        The fitted model returned by ``RandomForestRegressor.fit``.

    Raises:
        ValueError: If a numeric override cannot be parsed as a number.
        TypeError: If a numeric override is not a whole number.
    """
    from pyspark.ml.regression import RandomForestRegressor

    def _as_int(name, value):
        # These three hyperparameters are integer-valued; parse through
        # float so inputs such as "5" or 5.0 keep working, but hand Spark a
        # real int and reject fractional values up front.
        number = float(value)
        if not number.is_integer():
            raise TypeError(
                "%s must be a whole number, got %r" % (name, value))
        return int(number)

    maxDepth = 5
    minInstancesPerNode = 1
    numTrees = 20
    impurity = "variance"

    if arguments.maxDepth is not None:
        maxDepth = _as_int("maxDepth", arguments.maxDepth)
    if arguments.minInstancesPerNode is not None:
        minInstancesPerNode = _as_int(
            "minInstancesPerNode", arguments.minInstancesPerNode)
    if arguments.numTrees is not None:
        numTrees = _as_int("numTrees", arguments.numTrees)
    if arguments.impurity is not None:
        impurity = arguments.impurity

    rf = RandomForestRegressor(numTrees=numTrees,
                               maxDepth=maxDepth,
                               minInstancesPerNode=minInstancesPerNode,
                               impurity=impurity)
    model = rf.fit(df)
    return model
def _getBasePredictor(self, randomSeed):
    """Run the SMAC search and fit a random forest on the recorded data.

    The file at ``self._baseDataPath`` is emptied, two samples of
    ``self._trainData`` are cached, SMAC is run with ``self._baseEval`` as
    the target runner, and a default ``RandomForestRegressor`` is fitted on
    whatever is then read back from ``self._baseDataPath`` in libsvm format
    (presumably written by ``self._baseEval`` -- confirm).

    Args:
        randomSeed: Seed used for both ``sample`` calls.

    Returns:
        The fitted random forest model.
    """
    # Opening in "w" mode already truncates; the context manager guarantees
    # the handle is closed even if the open itself misbehaves.
    with open(self._baseDataPath, "w"):
        pass

    self._lowTrainData = self._trainData.sample(
        fraction=self._lowRatio, seed=randomSeed).cache()
    self._midTrainData = self._trainData.sample(
        fraction=self._midRatio, seed=randomSeed).cache()

    try:
        cs = self.getPCS()
        scenario = Scenario({
            "run_obj": "quality",
            "runcount-limit": self._BPDS,
            "cs": cs,
            "deterministic": "true"
        })

        # Optimize, using a SMAC-object
        smac = SMAC(scenario=scenario,
                    rng=np.random.RandomState(42),
                    tae_runner=self._baseEval)
        smac.optimize()

        df = self._spark.read.format("libsvm").load(self._baseDataPath)
        rf = RandomForestRegressor()
        rfModel = rf.fit(df)
    finally:
        # Release the cached samples whether or not the search succeeded.
        self._lowTrainData.unpersist()
        self._midTrainData.unpersist()
    return rfModel
def main():
    """Tune a random forest with ``gp_minimize`` and score it on ``test``.

    Searches over number of trees, tree depth and bin count, trains a
    ``RandomForestRegressor`` with the best values on the module-level
    ``train`` data, and prints the RMSE obtained on ``test``.
    """
    # One (low, high) bound per hyperparameter, in the order they are read
    # back from the optimisation result below.
    search_space = [
        (2, 25),   # num_trees
        (2, 6),    # max_depth
        (15, 30),  # max_bins
    ]

    # Gaussian-process based hyperparameter optimisation.
    optim_results = gp_minimize(objective,
                                search_space,
                                n_calls=20,
                                verbose=True,
                                random_state=0)

    print('\nHyperparameter Optimization Results:')
    print('Best validation RMSE = {}'.format(optim_results.fun))

    best = optim_results.x
    num_trees, max_depth, max_bins = best[0], best[1], best[2]

    # Train the final model with the winning settings.
    forest = RandomForestRegressor(numTrees=num_trees,
                                   maxDepth=max_depth,
                                   maxBins=max_bins)
    fitted = forest.fit(train)

    # Score the held-out data.
    scored = fitted.transform(test)
    rmse = RegressionEvaluator(labelCol="label",
                               predictionCol="prediction",
                               metricName="rmse").evaluate(scored)

    print('\nFinal Results on Test Set with Optimized Hyperparameters:')
    print("Root Mean Squared Error on test set = %g" % rmse)
def main():
    """Train a default random forest on a libsvm file and print its RMSE.

    The data location comes from the ``--data`` command-line option. The
    data is split 70/30 into training and test sets, five example
    predictions are shown, and the test RMSE is printed.
    """
    cli = argparse.ArgumentParser(description='Pyspark Training')
    cli.add_argument('--data',
                     type=str,
                     default="../../../data/sample_linear_regression_data.txt",
                     help='Data location.')
    options = cli.parse_args()

    dataset = spark.read.format("libsvm").load(options.data)

    # 30% of the rows are held out for testing.
    train, test = dataset.randomSplit([0.7, 0.3])

    # Fit a forest with default hyperparameters.
    fitted = RandomForestRegressor().fit(train)

    scored = fitted.transform(test)

    # Show a few example rows.
    scored.select("prediction", "label", "features").show(5)

    # Compare prediction against the true label.
    rmse_evaluator = RegressionEvaluator(labelCol="label",
                                         predictionCol="prediction",
                                         metricName="rmse")
    rmse = rmse_evaluator.evaluate(scored)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
def test_multi_target_random_forest():
    """Check that SHAP values plus expected values sum to the predictions.

    Fits a multi-output sklearn random forest on the linnerud data and
    verifies, per output, that the TreeExplainer attributions add up to
    the model prediction within an absolute tolerance of 1e-4.
    """
    import numpy as np
    import shap
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import train_test_split

    dataset = shap.datasets.linnerud()
    X_train, X_test, Y_train, _ = train_test_split(*dataset,
                                                   test_size=0.2,
                                                   random_state=0)

    forest = RandomForestRegressor(random_state=202,
                                   n_estimators=10,
                                   max_depth=10)
    forest.fit(X_train, Y_train)
    predicted = forest.predict(X_test)

    explainer = shap.TreeExplainer(forest)
    expected_values = np.asarray(explainer.expected_value)
    assert len(expected_values) == forest.n_outputs_, \
        "Length of expected_values doesn't match n_outputs_"

    n_rows, n_features = X_test.shape[0], X_test.shape[1]

    # Stack the per-output attribution blocks on top of each other.
    shap_values = np.asarray(explainer.shap_values(X_test)).reshape(
        forest.n_outputs_ * n_rows, n_features)
    # One expected value per stacked row, appended as an extra column.
    base_column = np.repeat(expected_values, n_rows).reshape(-1, 1)
    phi = np.hstack((shap_values, base_column))

    assert np.allclose(phi.sum(1), predicted.flatten(order="F"), atol=1e-4)
def spark_ml():
    """Index, featurise and model the purchase data, then write a CSV.

    Works on the module-level ``train`` and ``test`` DataFrames: indexes
    ``Product_ID``, builds features with an R formula, evaluates a default
    random forest on a 70/30 split of the training data, refits on all of
    it and writes the test predictions to ``submission.csv``.
    """
    import numpy as np
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.feature import RFormula
    from pyspark.ml.feature import StringIndexer
    from pyspark.ml.regression import RandomForestRegressor

    # Product IDs present in test but not in train; the count is computed
    # but its value is not used.
    unseen_products = test.select('Product_ID').subtract(
        train.select('Product_ID'))
    unseen_products.distinct().count()

    product_indexer = StringIndexer(inputCol='Product_ID',
                                    outputCol='product_ID')
    fitted_indexer = product_indexer.fit(train)
    indexed_train = fitted_indexer.transform(train)
    indexed_test = fitted_indexer.transform(test)
    indexed_train.show()

    r_formula = RFormula(
        formula="Purchase ~ Age+ Occupation +City_Category+Stay_In_Current_City_Years+Product_Category_1+Product_Category_2+ Gender",
        featuresCol="features",
        labelCol="label")
    fitted_formula = r_formula.fit(indexed_train)
    featured_train = fitted_formula.transform(indexed_train)
    featured_test = fitted_formula.transform(indexed_test)
    featured_train.show()
    featured_train.select('features').show()
    featured_train.select('label').show()

    forest = RandomForestRegressor()

    # Hold-out evaluation on a split of the training data.
    train_cv, test_cv = featured_train.randomSplit([0.7, 0.3])
    holdout_model = forest.fit(train_cv)
    holdout_predictions = holdout_model.transform(test_cv)

    evaluator = RegressionEvaluator()
    mse = evaluator.evaluate(holdout_predictions,
                             {evaluator.metricName: "mse"})
    # Computed for inspection only; the result is discarded.
    _ = (np.sqrt(mse), mse)

    # Refit on the full training data and predict the real test set.
    final_model = forest.fit(featured_train)
    test_predictions = final_model.transform(featured_test)
    submission = test_predictions.selectExpr("User_ID as User_ID",
                                             "Product_ID as Product_ID",
                                             'prediction as Purchase')
    submission.toPandas().to_csv('submission.csv')
def traintest(data, outputfile):
    """Cross-validate a random forest pipeline and record r2 and RMSE.

    Splits ``data`` 70/30, grid-searches ``maxDepth``, ``minInfoGain`` and
    ``numTrees`` with 8-fold cross-validation on the training part, scores
    the best model on the test part, prints both metrics and writes them
    to ``outputfile``.

    Args:
        data: DataFrame holding the feature columns listed below and the
            ``sale`` label column.
        outputfile: Path of the text file that receives the two metrics.
    """
    print('in traintest ***********************************')
    print('\n')

    train, test = data.randomSplit([0.7, 0.3])
    train = train.cache()
    test = test.cache()

    # Cross-validation over the training data picks the hyperparameters
    # with the lowest error.
    rf_regressor = RandomForestRegressor(featuresCol="features",
                                         labelCol="sale",
                                         seed=40)
    myFeatures = [
        "pack", "bottlessold", "volumesoldl", "volumesoldg", "latitude",
        "longitude"
    ]
    assembler = VectorAssembler(inputCols=myFeatures, outputCol="features")
    pipeline = Pipeline(stages=[assembler, rf_regressor])

    paramGrid = ParamGridBuilder().addGrid(
        rf_regressor.maxDepth, [2, 5, 10, 15]).addGrid(
            rf_regressor.minInfoGain, [0.01]).addGrid(
                rf_regressor.numTrees, [20, 30, 100]).build()

    # Run cross-validation, and choose the best set of parameters.
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=RegressionEvaluator(
                                  predictionCol='prediction',
                                  labelCol='sale',
                                  metricName='rmse'),
                              numFolds=8)
    cvModel = crossval.fit(train)

    predictions = cvModel.transform(test)

    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol='sale',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)

    rmse_evaluator = RegressionEvaluator(predictionCol='prediction',
                                         labelCol='sale',
                                         metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)

    print('r2 =', r2)
    print('rmse =', rmse)

    # The context manager flushes and closes the report file; previously
    # the handle was left open.
    with open(outputfile, "w") as output:
        output.write("r2=" + str(r2) + "\n")
        output.write("rmse=" + str(rmse) + "\n")
def model_dev_rf(df_train, df_test, n_trees, max_bins, max_depth):
    """Train a random forest, score it and summarise the metrics.

    Args:
        df_train: Training DataFrame with ``features`` and ``label`` columns.
        df_test: DataFrame the fitted model is scored on.
        n_trees: Value passed as ``numTrees``.
        max_bins: Value passed as ``maxBins``.
        max_depth: Value passed as ``maxDepth``.

    Returns:
        Tuple ``(fitted_model, stats)`` where ``stats`` is a one-row pandas
        DataFrame with r2, MSE, RMSE, MAE (each rounded to 3 decimals) and
        the elapsed time in minutes.
    """
    started = time()

    # Configure the estimator.
    forest = RandomForestRegressor(labelCol='label',
                                   featuresCol='features',
                                   impurity='variance',
                                   featureSubsetStrategy='all',
                                   numTrees=n_trees,
                                   maxBins=max_bins,
                                   maxDepth=max_depth)

    # Fit, then score the test sample.
    fitted = forest.fit(df_train)
    scored = fitted.transform(df_test).select(['prediction', 'label'])

    evaluator = RegressionEvaluator(predictionCol="prediction",
                                    labelCol="label")

    def _metric(name):
        # Evaluate one metric on the scored sample, rounded to 3 decimals.
        return round(evaluator.evaluate(scored, {evaluator.metricName: name}),
                     3)

    rf_r2 = _metric("r2")
    rf_mse = _metric("mse")
    rf_rmse = _metric("rmse")
    rf_mae = _metric("mae")

    # Report the metrics.
    print("\n++++++ Printing Random Forest Model Accuracy ++++++\n")
    print("R Square: " + str(rf_r2 * 100) + "%")
    print("Mean Squared Error: " + str(rf_mse))
    print("Root Mean Squared Error: " + str(rf_rmse))
    print("Mean Absolute Error: " + str(rf_mae))

    elapsed_minutes = (time() - started) / 60

    stats = pd.DataFrame({
        "Model Name": ["Random Forest"],
        "R Square": rf_r2,
        "Mean Squared Error": rf_mse,
        "Root Mean Squared Error": rf_rmse,
        "Mean Absolute Error": rf_mae,
        "Time (Min.)": round(elapsed_minutes, 3)
    })
    return (fitted, stats)
def RF(trainingData, testData):
    """Fit a fixed-size random forest and predict on the test data.

    The forest uses 120 trees of maximum depth 20, with 100 bins,
    variance impurity and the "auto" feature subset strategy.

    :param trainingData: DataFrame the model is fitted on
    :param testData: DataFrame the fitted model transforms
    :return: Trained model, predictions, number of trees (int),
        maximum depth (int)
    """
    nt = 120
    md = 20
    forest = RandomForestRegressor(numTrees=nt,
                                   featureSubsetStrategy="auto",
                                   impurity='variance',
                                   maxDepth=md,
                                   maxBins=100)
    fitted = forest.fit(trainingData)
    scored = fitted.transform(testData)
    return fitted, scored, nt, md
def testRegression(train, test):
    """Fit a small random forest and print RMSE, r2 and MAE on ``test``.

    Args:
        train: Training DataFrame with ``features`` and ``indexedLabel``.
        test: DataFrame the fitted model is evaluated on.
    """
    # Train a RandomForest model.
    # Note: Use larger numTrees in practice.
    rf = RandomForestRegressor(labelCol="indexedLabel",
                               numTrees=3,
                               maxDepth=4)
    model = rf.fit(train)

    # RegressionMetrics consumes an RDD of (prediction, label) pairs.
    # DataFrame.map was dropped from PySpark in Spark 2.0, so map over the
    # underlying RDD explicitly; `.rdd` is available on older releases too.
    predictionAndLabels = model.transform(test) \
        .select("prediction", "indexedLabel") \
        .rdd \
        .map(lambda x: (x.prediction, x.indexedLabel))

    metrics = RegressionMetrics(predictionAndLabels)
    print("rmse %.3f" % metrics.rootMeanSquaredError)
    print("r2 %.3f" % metrics.r2)
    print("mae %.3f" % metrics.meanAbsoluteError)
def test_OutputNonNumericalGridSearch(self):
    """Grid search with a string-valued parameter reports all four metrics.

    Cross-validates a random forest over two depths and two
    ``featureSubsetStrategy`` values and checks every rounded metric row
    against the expected set.
    """
    stratifyCol = "foldID"
    feature_columns = self.data.columns[1:(-1)]
    assembler = VectorAssembler(inputCols=feature_columns,
                                outputCol="features")
    assembled = assembler.transform(self.data).select(
        "y", "features", stratifyCol)

    forest = RandomForestRegressor(featuresCol="features",
                                   labelCol="y",
                                   minInstancesPerNode=1)
    evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="y")

    depth_values = [3, 15]
    strategy_values = ["sqrt", "5"]
    grid_builder = ParamGridBuilder()
    grid_builder = grid_builder.addGrid(forest.maxDepth, depth_values)
    grid_builder = grid_builder.addGrid(forest.featureSubsetStrategy,
                                        strategy_values)
    paramGrid = grid_builder.build()

    validator = CrossValidatorWithStratificationID(
        estimator=forest,
        estimatorParamMaps=paramGrid,
        evaluator=evaluator,
        stratifyCol=stratifyCol)
    cvModel = validator.fit(assembled)

    collected = cvModel.avgMetrics.drop("paramSetID").collect()
    roundedMetrics = []
    for row in collected:
        roundedMetrics.append(localRoundMetricValue(row, "metric for CV rmse"))

    self.assertEqual(len(roundedMetrics), 4,
                     "Incorrect number of returned metric values.")

    expectedMetricStructure = [
        {'metric for CV rmse': 1.073, 'maxDepth': 3.0,
         'featureSubsetStrategy': 'sqrt'},
        {'metric for CV rmse': 1.07, 'maxDepth': 3.0,
         'featureSubsetStrategy': '5'},
        {'metric for CV rmse': 1.111, 'maxDepth': 15.0,
         'featureSubsetStrategy': 'sqrt'},
        {'metric for CV rmse': 1.108, 'maxDepth': 15.0,
         'featureSubsetStrategy': '5'},
    ]

    for metric in roundedMetrics:
        failure_text = "{0} is not expected. The expected {1}.".format(
            metric, expectedMetricStructure)
        self.assertTrue(metric in expectedMetricStructure, failure_text)
def engineerFeatures():
    """Build the pipeline stages for the article price-change model.

    Returns:
        List of stages in execution order: five count vectorizers, the
        symbol indexer, the vector assembler and the random forest
        regressor predicting ``PriceChange`` into ``pPriceChange``.
    """
    # (input column, output column) for each bag-of-terms feature.
    vectorizer_columns = [
        ('sections', "sectionVector"),
        ('subsections', "subsectionVector"),
        ('newsdesks', "newsdeskVector"),
        ('materials', "materialVector"),
        ('keywords', "keywordVector"),
    ]
    vectorizers = [
        CountVectorizer(inputCol=source, outputCol=target)
        for source, target in vectorizer_columns
    ]

    symbol_indexer = StringIndexer(inputCol="Symbol",
                                   outputCol="indexedSymbol",
                                   handleInvalid='keep')

    assembler = VectorAssembler(inputCols=[
        'sectionVector', 'subsectionVector', 'newsdeskVector',
        'materialVector', 'indexedSymbol', 'keywordVector'
    ], outputCol='features')

    regressor = RandomForestRegressor(featuresCol="features",
                                      labelCol="PriceChange",
                                      predictionCol="pPriceChange",
                                      maxBins=5700)

    return vectorizers + [symbol_indexer, assembler, regressor]
def create_rf_pipeline():
    """Assemble a VectorAssembler + RandomForestRegressor pipeline.

    Args:
        None

    Returns:
        cols_to_keep (list): names of the feature columns fed to the model
        pipeline: pipeline chaining the feature assembler and the forest
        param_grid: parameter combinations (numTrees x maxDepth) for the
            grid search step
    """
    location_cols = [
        'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
        'dropoff_latitude'
    ]
    time_cols = ['day_of_week', 'hour_of_day']
    distance_cols = ['trip_distance', 'haversine_dist', 'lat_dist',
                     'long_dist']
    cols_to_keep = location_cols + time_cols + distance_cols

    assembler = VectorAssembler(inputCols=cols_to_keep, outputCol='features')
    forest = RandomForestRegressor(labelCol='log_min_duration',
                                   featuresCol='features')
    pipeline = Pipeline(stages=[assembler, forest])

    grid_builder = ParamGridBuilder()
    grid_builder = grid_builder.addGrid(forest.numTrees, [20, 50, 100])
    grid_builder = grid_builder.addGrid(forest.maxDepth, [5, 10, 15])
    param_grid = grid_builder.build()

    return cols_to_keep, pipeline, param_grid
def score_rf(split_input_train_df, split_input_validation_df,
             model_evaluator):
    """Cross-validate a random forest and record its validation score.

    Appends the validation metric to the module-level ``model_rmse`` list,
    increments ``model_count`` and stores the fitted cross-validation
    model under ``model_dict[model_count]["RF"]``.

    Args:
        split_input_train_df: DataFrame used for 3-fold cross-validation.
        split_input_validation_df: DataFrame the best model is scored on.
        model_evaluator: Evaluator used both inside cross-validation and
            for the final validation score.
    """
    global model_rmse, model_dict, model_count

    print(
        "###################### Random Forest Regression #########################"
    )
    forest = RandomForestRegressor(featuresCol='features',
                                   labelCol='total_delivery_duration')

    print("CrossValidation...")
    grid_builder = ParamGridBuilder()
    grid_builder = grid_builder.addGrid(forest.maxBins, [5700, 6000])
    grid_builder = grid_builder.addGrid(forest.maxMemoryInMB, [256, 512])
    grid_builder = grid_builder.addGrid(forest.subsamplingRate, [0.1, 1.0])
    rf_paramGrid = grid_builder.build()
    rf_cross_val = CrossValidator(estimator=forest,
                                  estimatorParamMaps=rf_paramGrid,
                                  evaluator=model_evaluator,
                                  numFolds=3)
    print("Done")

    print("Fitting training data...")
    rf_cv_model = rf_cross_val.fit(split_input_train_df)
    print("Done")

    print("Evaluating on validation data...")
    validation_predictions = rf_cv_model.transform(split_input_validation_df)
    rmse = model_evaluator.evaluate(validation_predictions)

    # Register the result in the shared bookkeeping structures.
    model_rmse.append(rmse)
    model_count += 1
    model_dict[model_count] = {"RF": rf_cv_model}

    print("RMSE on validation data: %f" % rmse)
def get_best_weather_model(data):
    """Fit each candidate trainer and return the best-scoring model.

    Splits ``data`` 75/25, fits every trainer produced by
    ``make_weather_trainers`` on the larger part, scores each model on the
    smaller part with the trainers' evaluator and returns the winner.

    Args:
        data: Input DataFrame.

    Returns:
        The fitted model with the best score on the held-out split.
    """
    train, test = data.randomSplit([0.75, 0.25])
    train = train.cache()
    test = test.cache()

    forest_candidate = estimator_gridbuilder(
        RandomForestRegressor(),
        dict(maxDepth=[5], maxBins=[5], numTrees=[20]))
    gbt_candidate = estimator_gridbuilder(GBTRegressor(maxIter=100), dict())
    estimator_gridbuilders = [forest_candidate, gbt_candidate]

    metricName = 'r2'
    tvs_list = make_weather_trainers(.2, estimator_gridbuilders, metricName)

    ev = tvs_list[0].getEvaluator()
    # Flip the sign for "smaller is better" metrics so max() always picks
    # the best model.
    if ev.isLargerBetter():
        scorescale = 1
    else:
        scorescale = -1

    candidates = []
    for trainer in tvs_list:
        fitted = trainer.fit(train)
        scaled_score = ev.evaluate(fitted.transform(test)) * scorescale
        estimator_name = get_estimator_name(trainer.getEstimator())
        candidates.append((fitted, estimator_name, scaled_score))

    best_model, best_name, best_score = max(candidates,
                                            key=lambda entry: entry[2])
    print("Best model is %s with validation data %s score %f" %
          (best_name, ev.getMetricName(), best_score * scorescale))
    return best_model
def UsefulnessPredictionLDAWithoutCV(trainingdata, model):
    """Fit a text -> TF-IDF -> LDA -> regressor pipeline without CV.

    Args:
        trainingdata: DataFrame with a ``review_text`` column (and the
            label column expected by the final stage).
        model: Either the string ``'RandomForest'``, which is replaced by a
            ``RandomForestRegressor`` reading ``topicDistribution``, or any
            other value, which is used as the final pipeline stage as-is.

    Returns:
        The fitted pipeline model.
    """
    # Data Preprocessing
    tokenizer = Tokenizer(inputCol="review_text", outputCol="tokens_word")
    remover = StopWordsRemover(inputCol="tokens_word",
                               outputCol="filtered_tokens_word")
    cv = CountVectorizer(inputCol="filtered_tokens_word",
                         outputCol="raw_features",
                         minDF=2.0,
                         vocabSize=250)
    idf = IDF(inputCol="raw_features", outputCol="features")

    # Extract LDA topic feature
    lda = LDA(k=30, maxIter=10)

    if model == 'RandomForest':
        model = RandomForestRegressor(featuresCol="topicDistribution")

    pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, lda, model])
    cvModel = pipeline.fit(trainingdata)

    # Explain params for the selected model. A single-argument print()
    # call behaves the same on Python 2 and Python 3.
    print(cvModel.explainParams())
    return cvModel
def main(inputs, model_file):
    """Train, evaluate and save the weather random forest pipeline.

    Reads CSV data with ``tmax_schema``, splits it 75/25, fits a pipeline
    (SQL transformer, vector assembler, random forest) on the training
    part, prints the evaluator's score on the validation part and saves
    the fitted pipeline.

    Args:
        inputs: Path of the CSV input.
        model_file: Destination the fitted pipeline is saved to
            (overwriting any existing model).
    """
    data = spark.read.csv(inputs, schema=tmax_schema)
    data.registerTempTable('yesterday')

    # Joins each observation with the same station's previous-day reading.
    wthr_query = """SELECT dayofyear(today.date) as dayofyr,today.latitude, today.longitude, today.elevation, today.tmax, yesterday.tmax as yesterday_tmax FROM __THIS__ as today INNER JOIN __THIS__ as yesterday ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station"""

    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # Pipeline stages: SQL transform, feature assembly, regression.
    sql_stage = SQLTransformer(statement=wthr_query)
    feature_stage = VectorAssembler(
        inputCols=["latitude", "longitude", "elevation", "dayofyr"],
        outputCol="features")
    forest_stage = RandomForestRegressor(maxDepth=10,
                                         minInstancesPerNode=2,
                                         minInfoGain=0.5,
                                         labelCol="tmax")

    wthr_pipeline = Pipeline(stages=[sql_stage, feature_stage, forest_stage])
    wthr_model = wthr_pipeline.fit(train)

    # Score the validation split.
    evaluator = RegressionEvaluator(labelCol="tmax",
                                    predictionCol="prediction")
    err = evaluator.evaluate(wthr_model.transform(validation))

    wthr_model.write().overwrite().save(model_file)
    print('Root Mean Square Error(rmse) : ' + str(err))
def rf_train(self, data, stages):
    """Train a random forest pipeline with grid-search cross-validation.

    The regressor is appended to ``stages`` in place, the resulting
    pipeline is tuned over ``numTrees`` and ``maxDepth`` with 3 folds, the
    cross-validation model is stored on ``self.model`` and its best model
    is saved to ``../data/model``.

    Args:
        data: DataFrame used for cross-validation.
        stages: List of preceding pipeline stages; mutated by this call.
    """
    forest = RandomForestRegressor(featuresCol='features',
                                   labelCol="submission_ratio")
    stages.append(forest)
    pipeline = Pipeline(stages=stages)

    # Three evenly spaced candidates for each tuned hyperparameter.
    tree_counts = [int(value)
                   for value in np.linspace(start=10, stop=50, num=3)]
    depths = [int(value) for value in np.linspace(start=5, stop=25, num=3)]
    grid_builder = ParamGridBuilder()
    grid_builder = grid_builder.addGrid(forest.numTrees, tree_counts)
    grid_builder = grid_builder.addGrid(forest.maxDepth, depths)
    paramGrid = grid_builder.build()

    self.evaluator = RegressionEvaluator(
        predictionCol='prediction',
        labelCol='submission_ratio',
        metricName='rmse',
    )

    cross_val = CrossValidator(estimator=pipeline,
                               estimatorParamMaps=paramGrid,
                               evaluator=self.evaluator,
                               numFolds=3)
    self.model = cross_val.fit(data)

    best_pipeline = self.model.bestModel
    best_pipeline.save("../data/model")
def main():
    """Train a random forest on album features and export it to S3.

    Starts a local Spark context, loads the album feature text files,
    builds a DataFrame, fits a VectorIndexer + RandomForestRegressor
    pipeline on a 70/30 split, prints the test RMSE and saves both the
    fitted forest and the pipeline definition. The Spark context is
    stopped even when a step fails.
    """
    # 1. Configure Spark
    conf = SparkConf().setAppName(APP_NAME)
    conf = conf.setMaster("local[*]")
    sc = SparkContext(conf=conf)
    try:
        # Creating the session is required before RDD.toDF can be used.
        spark = SparkSession(sc)

        text_file = sc.textFile(
            "s3a://spotifybuck/albumfeatures/2017/*/*/*/*/*")

        # 3. Transform data
        af = text_file.map(getVals)

        # 4. Create a DataFrame out of this using toDF and cache it
        afdf = af.toDF([
            'acousticness', 'danceability', 'energy', 'instrumentalness',
            'liveness', 'loudness', 'duration'
        ]).cache()

        # Automatically identify categorical features, and index them.
        # Set maxCategories so features with > 4 distinct values are
        # treated as continuous.
        # NOTE(review): afdf is built with the seven columns above only;
        # no "features" (or "label") column is created in this function,
        # yet the indexer, regressor and evaluator read them -- confirm
        # where they are expected to come from.
        featureIndexer = VectorIndexer(inputCol="features",
                                       outputCol="indexedFeatures",
                                       maxCategories=4).fit(afdf)

        # 5. 70% of the data for training, 30% for testing
        afdf_train, afdf_test = afdf.randomSplit([0.7, 0.3], seed=123)

        # Train a RandomForest model.
        rf = RandomForestRegressor(featuresCol="indexedFeatures")

        # Chain indexer and forest in a Pipeline
        pipeline = Pipeline(stages=[featureIndexer, rf])

        # Train model. This also runs the indexer.
        model = pipeline.fit(afdf_train)

        # Make predictions.
        predictions = model.transform(afdf_test)

        # Select example rows to display.
        predictions.select("prediction", "label", "features").show(5)

        # Select (prediction, true label) and compute test error
        evaluator = RegressionEvaluator(labelCol="label",
                                        predictionCol="prediction",
                                        metricName="rmse")
        rmse = evaluator.evaluate(predictions)
        print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

        rfModel = model.stages[1]
        print(rfModel)  # summary only

        # Compute the suffix once so both exports share the same
        # timestamp even if the first save crosses a minute boundary.
        export_suffix = datetime.now().strftime('%Y%m%d%H%M')
        rfModel.save('s3a://spotifybuck/model-export' + export_suffix)
        pipeline.save('s3a://spotifybuck/pipeline-export' + export_suffix)
    finally:
        # Always release the Spark context, including on failure.
        sc.stop()
def feature_imp_pyspark(self):
    """Select columns with non-zero random forest feature importance.

    Fits a 10-tree random forest (regressor or classifier depending on
    ``self.problem_type``) on the int/double columns of
    ``self.data_frame`` and keeps the features whose importance score is
    positive.

    Returns:
        List of selected column names, with ``self.target`` appended if it
        was not already present. The same list object is stored in
        ``self.data_change_dict['SelectedColsTree']``.
    """
    # Numeric, non-target columns that are not already index-encoded.
    num_var = [
        name for name, dtype in self.data_frame.dtypes
        if dtype in ('int', 'double') and name != self.target
        and not name.endswith('indexed')
    ]

    # Number of distinct values per candidate column; the largest one
    # sizes maxBins for the classifier.
    distinct_counts = []
    for name in num_var:
        aggregated = self.data_frame.agg(F.collect_set(name).alias(name))
        distinct_counts.append(len(aggregated.first().asDict()[name]))
    max_count = sorted(distinct_counts)[-1]

    label_indexes = StringIndexer(inputCol=self.target,
                                  outputCol='label',
                                  handleInvalid='keep')
    assembler = VectorAssembler(inputCols=num_var, outputCol="features")

    shared_settings = dict(labelCol="label",
                           featuresCol="features",
                           seed=8464,
                           numTrees=10,
                           cacheNodeIds=True,
                           subsamplingRate=0.7)
    if self.problem_type == 'REGRESSION':
        model = RandomForestRegressor(**shared_settings)
    else:
        model = RandomForestClassifier(maxBins=max_count + 2,
                                       **shared_settings)

    pipe = Pipeline(stages=[assembler, label_indexes, model])
    mod_fit = pipe.fit(self.data_frame)
    transformed = mod_fit.transform(self.data_frame)

    importance_table = MLUtils.ExtractFeatureImp(
        mod_fit.stages[-1].featureImportances, transformed, "features")
    positive_rows = importance_table.loc[importance_table['score'] > 0]
    cols_considered = list(positive_rows['name'])

    # The dictionary entry and the return value share one list, so the
    # target appended below shows up in both.
    self.data_change_dict['SelectedColsTree'] = cols_considered
    if self.target not in cols_considered:
        cols_considered.append(self.target)
    return cols_considered
def randomForestRun(train, test, featureIndexer, zillow_test, test_cols):
    """Fit a random forest, report test metrics and price Zillow rows.

    Args:
        train: Training DataFrame.
        test: DataFrame used for the r2 and MAE report.
        featureIndexer: Pipeline stage producing ``indexedFeatures``.
        zillow_test: DataFrame of rows to price; a ``features`` vector is
            built from all but the last field of each row.
        test_cols: Column names for the first six fields of the rebuilt
            Zillow DataFrame.

    Returns:
        The Zillow predictions with per-night and per-month price columns
        derived from the exponential of the prediction.
    """
    print("Training Data Table")
    train.show()

    print("Training...")
    forest = RandomForestRegressor(featuresCol="indexedFeatures")
    model = Pipeline(stages=[featureIndexer, forest]).fit(train)
    print("Training... Done")

    predictions = model.transform(test)

    r2_evaluator = RegressionEvaluator(labelCol="label",
                                       predictionCol="prediction",
                                       metricName="r2")
    r2 = r2_evaluator.evaluate(predictions)
    print("Random Forest Prediction")
    print("R-squared on test data = %g" % r2)

    mae_evaluator = RegressionEvaluator(labelCol="label",
                                        predictionCol="prediction",
                                        metricName="mae")
    mae = mae_evaluator.evaluate(predictions)
    print("Mean Absolute Error on test data = %g" % mae)

    print("Features Importances")
    importances = model.stages[-1].featureImportances
    print(importances)

    print("Predicted Price by Zillow")
    zillow_test.show()

    def _with_feature_vector(line):
        # Keep the first six fields and add a dense vector built from
        # every field except the last.
        return (line[0], line[1], line[2], line[3], line[4], line[5],
                Vectors.dense(line[0:-1]))

    zillow_rows = zillow_test.rdd.map(_with_feature_vector)
    zillow_test = spark.createDataFrame(zillow_rows,
                                        test_cols + ["features"])

    pred_zillow = model.transform(zillow_test)
    pred_zillow = pred_zillow.withColumn('prediction price/ night',
                                         exp(pred_zillow.prediction))
    pred_zillow = pred_zillow.withColumn('prediction price/ month',
                                         30 * exp(pred_zillow.prediction))
    pred_zillow.show()
    return pred_zillow
def UsefulnessPredictionLDA(trainingdata, model):
    """Cross-validate a text -> TF-IDF -> LDA -> regressor pipeline.

    The vocabulary size of the count vectorizer is tuned over
    150/200/250 with 4-fold cross-validation, scored by RMSE.

    Args:
        trainingdata: DataFrame with ``review_text`` and ``label`` columns.
        model: Either the string ``'RandomForest'``, which is replaced by a
            ``RandomForestRegressor`` reading ``topicDistribution``, or any
            other value, which is used as the final pipeline stage as-is.

    Returns:
        The fitted cross-validation model.
    """
    # Data Preprocessing
    tokenizer = Tokenizer(inputCol="review_text", outputCol="tokens_word")
    remover = StopWordsRemover(inputCol="tokens_word",
                               outputCol="filtered_tokens_word")
    cv = CountVectorizer(inputCol="filtered_tokens_word",
                         outputCol="raw_features",
                         minDF=2.0)
    idf = IDF(inputCol="raw_features", outputCol="features")

    # Extract LDA topic feature
    lda = LDA(k=30, maxIter=10)

    if model == 'RandomForest':
        model = RandomForestRegressor(featuresCol="topicDistribution")

    pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, lda, model])

    evaluator_rmse = RegressionEvaluator(labelCol="label",
                                         predictionCol="prediction",
                                         metricName="rmse")
    paramGrid = ParamGridBuilder() \
        .addGrid(cv.vocabSize, [150, 200, 250]) \
        .build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator_rmse,
                              numFolds=4)  # use 3+ folds in practice
    cvModel = crossval.fit(trainingdata)

    # Explain params for the selected model. A single-argument print()
    # call behaves the same on Python 2 and Python 3.
    print(cvModel.explainParams())
    return cvModel
def engineerFeatures():
    """Build the pipeline stages for the analyst price-change model.

    Returns:
        List of stages in execution order: five string indexers, the
        vector assembler and the random forest regressor predicting
        ``PriceChange`` into ``pPriceChange``.
    """
    # (input column, output column) for each categorical feature.
    indexer_columns = [
        ("Action", "indexedAction"),
        ("From", "indexedFrom"),
        ("To", "indexedTo"),
        ("Research Firm", "indexedFirm"),
        ("Symbol", "indexedSymbol"),
    ]
    # Unseen labels are kept in an extra bucket instead of failing.
    indexers = [
        StringIndexer(inputCol=source, outputCol=target,
                      handleInvalid='keep')
        for source, target in indexer_columns
    ]

    assembler = VectorAssembler(inputCols=[
        'indexedAction', 'indexedFrom', 'indexedTo', 'indexedFirm',
        'indexedSymbol'
    ], outputCol='features')

    regressor = RandomForestRegressor(featuresCol="features",
                                      labelCol="PriceChange",
                                      predictionCol="pPriceChange",
                                      maxBins=5700)

    return indexers + [assembler, regressor]
def main(model_file):
    """Fit a random forest on IMDB movie data and show its predictions.

    Loads ``imdb_movies_data`` from Cassandra, drops rows where any of
    the score, runtime, meta score or vote columns is zero, trains on a
    75/25 split and displays the validation predictions. The r2 score is
    computed but not reported.

    Args:
        model_file: Accepted but not used by this function.

    Returns:
        Always 0.
    """
    keyspace = 'technoaces'
    raw = spark.read.format("org.apache.spark.sql.cassandra") \
        .options(table='imdb_movies_data', keyspace=keyspace).load()

    # Discard rows with a zero in any of the numeric columns of interest.
    data = raw
    for column in ('imdb_score', 'runtimemins', 'meta_score', 'votes'):
        data = data.where(raw[column] != 0)

    train, validation = data.randomSplit([0.75, 0.25])

    assembler = VectorAssembler(
        inputCols=['year', 'runtimemins', 'meta_score', 'votes'],
        outputCol='features')
    forest = RandomForestRegressor(numTrees=2,
                                   featuresCol='features',
                                   labelCol='imdb_score',
                                   maxDepth=30,
                                   seed=1000)
    model = Pipeline(stages=[assembler, forest]).fit(train)

    predictions = model.transform(validation)
    predictions.select(
        'imdb_id', 'title', 'runtimemins', 'meta_score', 'imdb_score',
        'votes',
        predictions['prediction'].alias('Predicted Votes')).show()

    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol='imdb_score',
                                       metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    return 0
def test_boston_dataset(spark_session: SparkSession):
    """Run feature ranking and selection end to end on the Boston data.

    Builds a Spark DataFrame from the sklearn Boston dataset, ranks its
    features, then runs the feature selector with a default random forest
    and prints the intermediate and final results.
    """
    boston = load_boston()
    feature_names = boston.feature_names.tolist()
    output_name = 'outcome'
    boston_columns = feature_names + [output_name]

    # One row per sample: the feature values followed by the target.
    feature_rows = boston.data.tolist()
    targets = boston.target.tolist()
    rows = [row + [target] for row, target in zip(feature_rows, targets)]
    boston_df = spark_session.createDataFrame(rows, boston_columns)
    print(feature_names)

    must_include_features = []

    # Ranking features
    ranked_features = feature_ranker(
        df=boston_df,
        feature_columns=feature_names,
        output_column=output_name,
        must_include_features=must_include_features)
    print(ranked_features)

    # Feature selection
    scores = feature_selector(df=boston_df,
                              ranked_features=ranked_features,
                              output_column=output_name,
                              estimator_obj=RandomForestRegressor(),
                              feature_inclusion_increments=1,
                              train_test_split_ratio=[0.66, 0.33],
                              cv=-1,
                              evaluation_metric='r2')
    print(scores)
def Distr_RandomForestRegressor(xy_train, xy_test):
    """Tune a random forest in two cross-validated grid-search passes.

    The first pass fixes 100 trees and searches ``featureSubsetStrategy``;
    the second pass reuses the best strategy and searches the number of
    trees (300 or 500). Both passes use 5 folds on ``xy_train``.

    Args:
        xy_train: Training DataFrame used by both cross-validations.
        xy_test: Not used by this function.

    Returns:
        The best model found by the second cross-validation.
    """
    rf = RandomForestRegressor(minInstancesPerNode=20, maxDepth=25)
    evalu = RegressionEvaluator()

    grid_1 = ParamGridBuilder()\
        .addGrid(rf.numTrees, [100])\
        .addGrid(rf.featureSubsetStrategy, ['0.5', '0.8', '1.0'])\
        .build()
    cv_1 = CrossValidator(estimator=rf,
                          estimatorParamMaps=grid_1,
                          evaluator=evalu,
                          numFolds=5)
    # Search for the best parameter combination; the fitted CV model
    # carries the best estimated model.
    cvModel_1 = cv_1.fit(xy_train)

    # Single-argument print() calls behave the same on Python 2 and 3.
    print("Grid scores: ")
    best_params_1 = Get_best_params(cvModel_1, 'reg')['featureSubsetStrategy']

    grid = ParamGridBuilder()\
        .addGrid(rf.numTrees, [300, 500])\
        .addGrid(rf.featureSubsetStrategy, [best_params_1, ])\
        .build()
    cv = CrossValidator(estimator=rf,
                        estimatorParamMaps=grid,
                        evaluator=evalu,
                        numFolds=5)
    # Search for the best parameter combination; the fitted CV model
    # carries the best estimated model.
    cvModel = cv.fit(xy_train)

    best_params = Get_best_params(cvModel, 'reg')
    print("Best parameters set found: %s" % best_params)
    return cvModel.bestModel
def UsefulnessPredictionSentmentWithoutCV(trainingdata, model):
    """Fit a sentiment-feature pipeline without cross-validation.

    Args:
        trainingdata: DataFrame holding the nine input columns assembled
            below (and the label column expected by the final stage).
        model: Either the string ``'RandomForest'``, which is replaced by a
            ``RandomForestRegressor`` reading ``indexedFeatures``, or any
            other value, which is used as the final pipeline stage as-is.

    Returns:
        The fitted pipeline model.
    """
    # Data Preprocessing
    sentiment_columns = [
        'sentiment_neg', 'sentiment_neu', 'sentiment_pos',
        'sentiment_compound'
    ]
    character_columns = [
        'Character_adj', 'Character_noun', 'Character_verb', 'Character_adv'
    ]
    assembler = VectorAssembler(
        inputCols=['num'] + sentiment_columns + character_columns,
        outputCol="features")

    # Features with at most 4 distinct values are treated as categorical.
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexedFeatures",
                                   maxCategories=4)

    if model == 'RandomForest':
        model = RandomForestRegressor(featuresCol="indexedFeatures")

    pipeline = Pipeline(stages=[assembler, featureIndexer, model])

    # Constructed as in the original flow; not consulted afterwards.
    evaluator_rmse = RegressionEvaluator(labelCol="label",
                                         predictionCol="prediction",
                                         metricName="rmse")

    fitted_pipeline = pipeline.fit(trainingdata)
    return fitted_pipeline
def test_random_forrest_regression(self):
    """Convert a fitted Spark random forest pipeline to ONNX and compare.

    Fits a label indexer + vector indexer + random forest pipeline on a
    5-feature slice of the sample libsvm data, converts it with
    ``convert_sparkml`` and checks that the ONNX outputs for one row match
    the Spark outputs to 5 decimals.
    """
    here = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    input_path = os.path.join(here, "data", "sample_libsvm_data.txt")
    original_data = self.spark.read.format("libsvm").load(input_path)

    # Truncate the features: keep only positions 125..129 of each vector.
    feature_count = 5

    def _truncate(vector):
        return SparseVector(feature_count, range(0, feature_count),
                            vector.toArray()[125:130])

    self.spark.udf.register("truncateFeatures", _truncate, VectorUDT())
    data = original_data.selectExpr(
        "cast(label as string) as label",
        "truncateFeatures(features) as features")

    stages = [
        StringIndexer(inputCol="label", outputCol="indexedLabel"),
        VectorIndexer(inputCol="features",
                      outputCol="indexedFeatures",
                      maxCategories=10,
                      handleInvalid='error'),
        RandomForestRegressor(labelCol="indexedLabel",
                              featuresCol="indexedFeatures",
                              numTrees=10),
    ]
    model = Pipeline(stages=stages).fit(data)

    input_types = [('label', StringTensorType([1, 1])),
                   ('features', FloatTensorType([1, feature_count]))]
    model_onnx = convert_sparkml(model,
                                 'Sparkml RandomForest Regressor',
                                 input_types,
                                 spark_session=self.spark)
    self.assertTrue(model_onnx is not None)

    # Run the Spark model on a single row.
    predicted = model.transform(data.limit(1))

    label_values = data.limit(1).toPandas().label.values
    feature_values = data.limit(1).toPandas().features.apply(
        lambda vector: pandas.Series(vector.toArray())).values.astype(
            numpy.float32)
    data_np = {'label': label_values, 'features': feature_values}

    expected = [
        predicted.toPandas().indexedLabel.values.astype(numpy.int64),
        predicted.toPandas().prediction.values.astype(numpy.float32)
    ]

    paths = save_data_models(data_np,
                             expected,
                             model,
                             model_onnx,
                             basename="SparkmlRandomForestRegressor")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['indexedLabel', 'prediction'],
                                           data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def model_define(self):
    """Create the random forest regression estimator.

    The estimator is constructed with no arguments; no hyperparameters
    are applied inside this method.

    Returns:
        (pyspark.ml.regression.RandomForestRegressor)
        Random Forest Regression model
    """
    regressor = RandomForestRegressor()
    return regressor
def main(inputs, out_model):
    """Train, evaluate and save the weather random forest pipeline.

    Reads CSV data with ``tmax_schema``, splits it 75/25, fits a pipeline
    (SQL transformer, vector assembler, random forest), shows the
    validation predictions, prints r2 and RMSE and saves the fitted
    pipeline.

    Args:
        inputs: Path of the CSV input.
        out_model: Destination the fitted pipeline is saved to
            (overwriting any existing model).
    """
    data = spark.read.csv(inputs, schema=tmax_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # Joins each observation with the same station's previous-day reading.
    query = (
        "SELECT dayofyear(today.date) as doy, today.latitude, today.longitude, today.elevation,today.tmax,yesterday.tmax AS yesterday_tmax "
        " FROM __THIS__ as today "
        " INNER JOIN __THIS__ as yesterday "
        " ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station"
    )
    getDOY = SQLTransformer(statement=query)

    column_names = dict(featuresCol="features",
                        labelCol="tmax",
                        predictionCol="prediction")
    feature_cols = ['latitude', 'longitude', 'elevation', 'doy']
    feature_assembler = VectorAssembler(
        inputCols=feature_cols, outputCol=column_names["featuresCol"])

    # Random forest is the estimator currently selected for this pipeline.
    est = RandomForestRegressor(featureSubsetStrategy="log2",
                                minInfoGain=0.5,
                                numTrees=40)
    est = est.setParams(**column_names)

    model = Pipeline(stages=[getDOY, feature_assembler, est]).fit(train)

    predictions = model.transform(validation)
    predictions.show()

    r2 = RegressionEvaluator(predictionCol='prediction',
                             labelCol='tmax',
                             metricName='r2').evaluate(predictions)
    print('\n\nr2=', r2)

    rmse = RegressionEvaluator(predictionCol='prediction',
                               labelCol='tmax',
                               metricName='rmse').evaluate(predictions)
    print('\n\nrmse=', rmse)

    model.write().overwrite().save(out_model)
def _train_model_spark(self, data):
    """Fit the Spark model(s) selected by ``self._train_method``.

    When ``self._train_method`` is a dict, two models are trained, one for
    ``CHANGE_AMOUNT`` and one for ``CHANGE_DIRECTION``, and ``self._model``
    becomes a dict keyed by those two constants. Otherwise a single model
    predicting ``TARGET_PRICE`` is trained and stored in ``self._model``.

    For the Keras based network the ``KerasNeuralNetworkSpark`` object itself
    is stored (after calling ``fit`` on it); for all other methods the
    object returned by ``fit`` is stored.

    Args:
        data: raw training data. Its ``keys()`` must support ``difference``
            (presumably a pandas DataFrame - confirm against the caller).
            It is converted with ``self._prepare_data_spark``.

    Returns:
        ``self._model`` after training.

    Raises:
        ValueError: if a requested training method is not supported.
    """
    df = self._prepare_data_spark(data)

    # Every key that is not one of the four bookkeeping keys is a model input.
    input_num = len(data.keys().difference({self.CHANGE_AMOUNT,
                                            self.CHANGE_DIRECTION,
                                            self.TARGET_PRICE,
                                            self.TODAY_PRICE}))
    if self.ann_hidden_nodes_num is None:
        # Floor division: layer sizes must be integers, and "/" would
        # produce a float on Python 3 (same result as before on Python 2).
        self.ann_hidden_nodes_num = input_num // 2 + 1
    ann_layers = [input_num,
                  # input_num / 3 * 2,
                  # input_num / 3,
                  self.ann_hidden_nodes_num,
                  2]
    self.logger.info('layer settings are {}'.format(ann_layers))
    self.logger.info('training method is {}'.format(self._train_method))
    self.logger.info('trees num is {}'.format(self.random_forest_tree_number))

    def unsupported_method_error():
        # Log the problem and build the exception for the caller to raise.
        message = 'Unsupported training method {}'.format(self._train_method)
        self.logger.warning(message)
        return ValueError(message)

    if isinstance(self._train_method, dict):
        # NOTE(review): the running server is only stopped when the *new*
        # amount method is the neural network, and this assumes the
        # previous self._model was a dict holding such a network - confirm.
        if (self._model is not None and
                self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK):
            self._model[self.CHANGE_AMOUNT].stop_server()
        self._model = {self.CHANGE_AMOUNT: None, self.CHANGE_DIRECTION: None}

        # --- model for the amount of change (regression) ---
        amount_method = self._train_method[self.CHANGE_AMOUNT]
        if amount_method == self.LINEAR_REGRESSION:
            lr = LinearRegression(featuresCol="features",
                                  labelCol=self.CHANGE_AMOUNT,
                                  maxIter=self.linear_regression_training_times,
                                  regParam=self.linear_regression_regularization_parameter,
                                  predictionCol='AmountPrediction')
            self._model[self.CHANGE_AMOUNT] = lr.fit(df)
        elif amount_method == self.RANDOM_FOREST:
            rfr = RandomForestRegressor(featuresCol="features",
                                        labelCol=self.CHANGE_AMOUNT,
                                        numTrees=self.random_forest_tree_number,
                                        maxDepth=self.random_forest_tree_max_depth,
                                        predictionCol='AmountPrediction')
            self._model[self.CHANGE_AMOUNT] = rfr.fit(df)
        elif amount_method == self.ARTIFICIAL_NEURAL_NETWORK:
            ann_layers[-1] = 1  # single output node for regression
            self._model[self.CHANGE_AMOUNT] = KerasNeuralNetworkSpark(
                layers=ann_layers,
                spark=self._spark,
                num_workers=self.spark_worker_numbers,
                epoch=self.ann_epoch_number,
                featuresCol="features",
                labelCol=self.CHANGE_AMOUNT,
                predictionCol='AmountPrediction'
            )
            self._model[self.CHANGE_AMOUNT].fit(df)
        else:
            raise unsupported_method_error()

        # --- model for the direction of change (classification) ---
        direction_method = self._train_method[self.CHANGE_DIRECTION]
        if direction_method == self.LOGISTIC_REGRESSION:
            # NOTE(review): regParam reuses the *linear* regression
            # regularization setting - confirm this is intended.
            lr = LogisticRegression(featuresCol="features",
                                    labelCol=self.CHANGE_DIRECTION,
                                    maxIter=self.logistic_regression_training_times,
                                    regParam=self.linear_regression_regularization_parameter,
                                    predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = lr.fit(df)
        elif direction_method == self.RANDOM_FOREST:
            rfc = RandomForestClassifier(featuresCol="features",
                                         labelCol=self.CHANGE_DIRECTION,
                                         numTrees=self.random_forest_tree_number,
                                         maxDepth=self.random_forest_tree_max_depth,
                                         predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = rfc.fit(df)
        elif direction_method == self.ARTIFICIAL_NEURAL_NETWORK:
            ann_layers[-1] = 2  # two output nodes for the direction classes
            mlpc = MultilayerPerceptronClassifier(featuresCol="features",
                                                  labelCol=self.CHANGE_DIRECTION,
                                                  layers=ann_layers,
                                                  predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = mlpc.fit(df)
        else:
            raise unsupported_method_error()
    else:
        # --- single model predicting the target price ---
        if self._train_method == self.LINEAR_REGRESSION:
            lr = LinearRegression(featuresCol="features",
                                  labelCol=self.TARGET_PRICE,
                                  predictionCol='prediction',
                                  regParam=self.linear_regression_regularization_parameter,
                                  maxIter=self.linear_regression_training_times)
            self._model = lr.fit(df)
        elif self._train_method == self.RANDOM_FOREST:
            rfr = RandomForestRegressor(featuresCol="features",
                                        labelCol=self.TARGET_PRICE,
                                        predictionCol='prediction',
                                        numTrees=self.random_forest_tree_number,
                                        maxDepth=self.random_forest_tree_max_depth)
            self._model = rfr.fit(df)
        elif self._train_method == self.ARTIFICIAL_NEURAL_NETWORK:
            ann_layers[-1] = 1  # single output node for regression
            if self._model is not None:
                self._model.stop_server()
            self.logger.warning('layers are {}'.format(ann_layers))
            # NOTE(review): epoch is fixed to 100 here while the dict branch
            # uses self.ann_epoch_number - confirm whether that is intended.
            self._model = KerasNeuralNetworkSpark(
                layers=ann_layers,
                spark=self._spark,
                num_workers=self.spark_worker_numbers,
                epoch=100,
                featuresCol="features",
                labelCol=self.TARGET_PRICE,
                predictionCol='prediction'
            )
            self._model.fit(df)
        else:
            raise unsupported_method_error()

    return self._model
encoder = OneHotEncoder(inputCol=eCol, outputCol=eCol+"classVec") stages += [encoder] #label_stringIdx = StringIndexer(inputCol = "verified_purchase", outputCol = "label") #stages += [label_stringIdx] numericCols = ["trip_distance", "passenger_count", "fare_amount","tip_amount"] assemblerInputs = map(lambda c: c + "classVec", categoricalColumns) + map(lambda c: c + "classVec", encColumns) + numericCols assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features") stages += [assembler] pipeline = Pipeline(stages=stages) pipelineModel = pipeline.fit(train_X4) dataset = pipelineModel.transform(train_X4) from pyspark.ml.regression import RandomForestRegressor rf = RandomForestRegressor(numTrees=4,featuresCol="features",labelCol='total_amount', maxDepth=2, seed=42) rf_model = rf.fit(dataset) rf_model.write().overwrite().save("./nyc-01020304-6vm-18-RF-model") import sys sys.exit(0) """ from pyspark.ml.feature import VectorAssembler #vectorAssembler = VectorAssembler(inputCols = ['key', 'passenger_count', 'Travel_Distance', 'Peak_Time', 'weekend'], outputCol = 'fare_amount') #newDF_test1=df_test1.withColumn('Travel_Distance',fun_dist_udf(df_test1["pickup_latitude"],df_test1["pickup_longitude"],df_test1["dropoff_latitude"],df_test1["dropoff_longitude"])) #vectorAssembler = VectorAssembler(inputCols = ['pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude','passenger_count', 'Travel_Distance', 'Peak_Time', 'weekend'], outputCol = 'features') vectorAssembler = VectorAssembler(inputCols = ['passenger_count', 'Travel_Distance', 'Peak_Time', 'weekend'], outputCol = 'features') vhouse_df = vectorAssembler.transform(train_X4) vhouse_df = vhouse_df.select(['features', 'fare_amount']) vhouse_df.show(3)
# Fit the generalized linear regression estimator defined in an earlier cell.
glrModel = glr.fit(df)


# COMMAND ----------

# Decision tree regressor: print its parameter documentation, then fit it.
from pyspark.ml.regression import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(dtr.explainParams())
dtrModel = dtr.fit(df)


# COMMAND ----------

# Tree ensembles: random forest and gradient-boosted trees, both created
# without constructor arguments.
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor
rf = RandomForestRegressor()
print(rf.explainParams())
rfModel = rf.fit(df)
gbt = GBTRegressor()
print(gbt.explainParams())
gbtModel = gbt.fit(df)


# COMMAND ----------

# Rebind glr to a Gaussian/identity GLM and wrap it in a one-stage pipeline.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
glr = GeneralizedLinearRegression().setFamily("gaussian").setLink("identity")
pipeline = Pipeline().setStages([glr])