def testClassification(data):
    """Train a random forest on ``data`` and print its held-out predictions and AUC.

    The "label" column is string-indexed into "indexLabel", the data is split
    80/20 (seed 13), a RandomForestClassifier is fitted on the larger part
    and the smaller part is scored.

    :param data: DataFrame with a "label" column; the classifier uses its
        default features column.
    :return: None. Rows, (score, label) pairs and the areaUnderROC metric
        are printed.
    """
    # Index the raw labels; the classifier and the evaluator both read "indexLabel".
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    si_model = stringIndexer.fit(data)
    td = si_model.transform(data)

    rf = RandomForestClassifier(numTrees=5, maxDepth=4, labelCol="indexLabel", seed=13)
    trainData, testData = td.randomSplit([0.8, 0.2], 13)
    predictionDF = rf.fit(trainData).transform(testData)

    selected = predictionDF.select(
        'label', 'indexLabel', 'prediction', 'rawPrediction', 'probability')
    for row in selected.collect():
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(row)

    # DataFrame.map() was removed in Spark 2.0; map over the underlying RDD,
    # which is available on older releases as well.
    scoresAndLabels = predictionDF.rdd.map(
        lambda x: (float(x.probability.toArray()[1]), x.indexLabel))
    for sl in scoresAndLabels.collect():
        print(sl)

    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel',
                                              metricName='areaUnderROC')
    metric = evaluator.evaluate(selected)
    print(metric)
def _to_int_param(value):
    """Convert a raw argument to the ``int`` an integer tree parameter expects.

    Non-integral input is returned as a float, unchanged, so that Spark's own
    parameter validation rejects it exactly as it did before.
    """
    number = float(value)
    return int(number) if number.is_integer() else number


def randomForestClassification(df, arguments):
    """Fit a RandomForestClassifier on ``df`` with optional overrides.

    :param df: training DataFrame; the classifier's default label and
        features columns are used.
    :param arguments: object with ``maxDepth``, ``minInstancesPerNode``,
        ``numTrees`` and ``impurity`` attributes; any that is ``None`` keeps
        the default (5, 1, 20 and "gini" respectively).
    :return: the fitted model.
    """
    from pyspark.ml.classification import RandomForestClassifier

    maxDepth = 5
    minInstancesPerNode = 1
    numTrees = 20
    impurity = "gini"

    # These three are integer parameters; the original passed them as floats.
    if arguments.maxDepth is not None:
        maxDepth = _to_int_param(arguments.maxDepth)
    if arguments.minInstancesPerNode is not None:
        minInstancesPerNode = _to_int_param(arguments.minInstancesPerNode)
    if arguments.numTrees is not None:
        numTrees = _to_int_param(arguments.numTrees)
    if arguments.impurity is not None:
        impurity = arguments.impurity

    rf = RandomForestClassifier(numTrees=numTrees,
                                maxDepth=maxDepth,
                                minInstancesPerNode=minInstancesPerNode,
                                impurity=impurity)
    model = rf.fit(df)
    return model
def rf(ss, data, label_index, feature_indexs, project_url):
    """Train, save, reload and evaluate a random forest classifier.

    :param ss: not used by this function.
    :param data: DataFrame whose rows are read positionally.
    :param label_index: position of the label value within each row.
    :param feature_indexs: positions of the feature values within each row.
    :param project_url: base path; the model is written below
        ``/model/multipleClassification/rf``.
    :return: None. Predictions are shown and accuracy / weighted precision
        on the training set are printed.
    """
    # 1. Build the training set
    def func(x):
        features_data = [x[feature] for feature in feature_indexs]
        # The label is the value stored at label_index. The original used the
        # index itself, which gave every row the same constant label.
        return Row(label=x[label_index], features=Vectors.dense(features_data))

    training_set = data.rdd.map(list).map(func).toDF()

    # 2. Train the model
    rf_param = RandomForestClassifier(numTrees=50)
    rf_model = rf_param.fit(training_set)

    # 3. Save the model
    model_path = project_url + '/model/multipleClassification/rf'
    rf_model.write().overwrite().save(model_path)

    # 4. Load the model back
    rf2 = rf_model.load(model_path)

    # 5. Predict
    rf_pred = rf2.transform(training_set)
    rf_pred.select("prediction", "features").show()

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    # 6. Evaluate (on the same data the model was trained on)
    rf_accuracy = MulticlassClassificationEvaluator(
        metricName='accuracy').evaluate(rf_pred)
    print("RF's accuracy is %f" % rf_accuracy)
    rf_precision = MulticlassClassificationEvaluator(
        metricName='weightedPrecision').evaluate(rf_pred)
    print("RF's precision is %f" % rf_precision)
def random_forest(df, columns, input_col):
    """
    Fit a random forest classifier on selected columns of a DataFrame.

    :param df: Pyspark dataframe to analyze.
    :param columns: List of columns to select; must include input_col.
    :param input_col: Column to predict.
    :return: Tuple (DataFrame with the model's predictions, fitted model).
    """
    assert_spark_df(df)
    assert isinstance(columns, list), "Error, columns must be a list"
    assert isinstance(input_col, str), "Error, input column must be a string"

    selected = df.select(columns)
    # Every selected column except the target is used as a feature.
    feature_names = selected.columns
    feature_names.remove(input_col)

    prep = op.DataFrameTransformer(selected)
    prep.string_to_index(input_cols=input_col)
    prep.vector_assembler(input_cols=feature_names)
    classifier = RandomForestClassifier()
    # The indexed target becomes the "label" column the classifier reads by default.
    prep.rename_col(columns=[(input_col + "_index", "label")])

    fitted = classifier.fit(prep.df)
    scored = fitted.transform(prep.df)
    return scored, fitted
def consulting_project(spark, resources_folder):
    """Fit a random forest on dog_food.csv and print its feature importances.

    :param spark: SparkSession used to read the CSV.
    :param resources_folder: path prefix to which 'dog_food.csv' is appended.
    :return: None. Everything is printed or shown.
    """
    csv_path = resources_folder + 'dog_food.csv'
    data = spark.read.csv(csv_path, header=True, inferSchema=True)
    data.printSchema()
    data.show()
    data.describe().show()
    # data.filter((data['Spoiled']==0)).show()

    feature_columns = ['A', 'B', 'C', 'D']
    assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
    data_prepared = assembler.transform(data)

    rfc = RandomForestClassifier(featuresCol='features', labelCol='Spoiled')
    rfc_model = rfc.fit(data_prepared)
    print(rfc_model)

    # Predictions are made on the same rows the model was trained on.
    rfc_model_pred = rfc_model.transform(data_prepared)
    print("Predicciones del modelo")
    print(rfc_model_pred)
    rfc_model_pred.show()

    print("Evaluación del modelo")
    my_binary_evaluator = BinaryClassificationEvaluator(labelCol='Spoiled')
    score = my_binary_evaluator.evaluate(rfc_model_pred)
    print(score)

    print("featureImportances")
    print(rfc_model.featureImportances)
    print(type(rfc_model.featureImportances))
def fit_nb(train):
    """Fit a random forest classifier on ``train`` and return the model.

    Despite the "nb" in the name, the estimator built here is a
    RandomForestClassifier (20 trees, depth 20, seed 42, label column "label").
    """
    classifier = RandomForestClassifier(labelCol="label",
                                        numTrees=20,
                                        maxDepth=20,
                                        seed=42)
    return classifier.fit(train)
def predict(df_train, df_test):
    """Train a random forest on ``df_train`` and predict for ``df_test``.

    Both frames are assembled into a "features" vector from the columns
    count1..count8; the label column is "id".

    :return: list with the trained model's prediction for every test row.
    """
    count_columns = ["count%d" % i for i in range(1, 9)]
    assembler = VectorAssembler(inputCols=count_columns, outputCol="features")

    train_features = assembler.transform(df_train)
    forest = RandomForestClassifier(labelCol="id", numTrees=5, maxDepth=5, seed=0)
    model = forest.fit(train_features)

    # The test frame needs the same "features" column the model was fitted on.
    test_features = assembler.transform(df_test)
    scored = model.transform(test_features)

    # Each selected Row has exactly one field, so flattening gives a flat list.
    return scored.select("prediction").rdd.flatMap(lambda row: row).collect()
def randomForestClassifier(train, test):
    """Fit a random forest on ``train``, preview and return predictions for ``test``.

    :param train: DataFrame with "features" and "label" columns.
    :param test: DataFrame to score; the preview also selects "age" and "job".
    :return: the full predictions DataFrame.
    """
    fitted = RandomForestClassifier(labelCol='label', featuresCol='features').fit(train)
    predictions = fitted.transform(test)

    preview_columns = ['age', 'job', 'label', 'rawPrediction', 'prediction', 'probability']
    predictions.select(*preview_columns).show(10)
    return predictions
def test_multiclass_randomforest_classification_summary(self):
    """Check the training summary of a weighted three-class random forest."""
    rows = [
        (1.0, 2.0, Vectors.dense(1.0)),
        (0.0, 2.0, Vectors.sparse(1, [], [])),
        (2.0, 2.0, Vectors.dense(2.0)),
        (2.0, 2.0, Vectors.dense(1.9)),
    ]
    df = self.spark.createDataFrame(rows, ["label", "weight", "features"])
    model = RandomForestClassifier(weightCol="weight").fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary

    # test that api is callable and returns expected types
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertEqual(s.totalIterations, 0)
    for list_attr in ("labels",
                      "truePositiveRateByLabel",
                      "falsePositiveRateByLabel",
                      "precisionByLabel",
                      "recallByLabel"):
        self.assertTrue(isinstance(getattr(s, list_attr), list))
    self.assertTrue(isinstance(s.fMeasureByLabel(), list))
    self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))

    expected_metrics = (
        ("accuracy", 1.0),
        ("weightedTruePositiveRate", 1.0),
        ("weightedFalsePositiveRate", 0.0),
        ("weightedRecall", 1.0),
        ("weightedPrecision", 1.0),
    )
    for metric_name, expected in expected_metrics:
        self.assertAlmostEqual(getattr(s, metric_name), expected, 2)
    self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2)
    self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2)

    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertTrue(isinstance(sameSummary, RandomForestClassificationSummary))
    self.assertFalse(isinstance(sameSummary, BinaryRandomForestClassificationSummary))
    self.assertAlmostEqual(sameSummary.accuracy, s.accuracy)
def random_forest(df, columns, input_col, **kargs):
    """
    Fit a random forest classifier on selected columns of a DataFrame.

    :param df: Pyspark dataframe to analyze.
    :param columns: Columns to select for prediction (resolved with parse_columns).
    :param input_col: Column to predict.
    :param kargs: Keyword arguments forwarded to RandomForestClassifier.
    :return: Tuple (DataFrame with the model's predictions, fitted model).
    """
    if not is_dataframe(df):
        raise TypeError("Spark dataframe expected")

    columns = parse_columns(df, columns)
    assert isinstance(input_col, str), "Error, input column must be a string"

    # Every selected column except the target is used as a feature.
    feature_names = df.select(columns).columns
    feature_names.remove(input_col)

    indexed = string_to_index(df, input_cols=input_col)
    assembled = vector_assembler(indexed, input_cols=feature_names)
    classifier = RandomForestClassifier(**kargs)
    # The indexed target becomes the "label" column.
    labelled = assembled.cols.rename([(input_col + "_index", "label")])

    fitted = classifier.fit(labelled)
    return fitted.transform(labelled), fitted
def train_random_forest(df):
    """Index the "label" column and fit a small random forest on ``df``.

    :param df: DataFrame with a "label" column; the classifier uses its
        default features column.
    :return: tuple (unfitted RandomForestClassifier, fitted model).
    """
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    # int(random.random()) truncates a value in [0, 1) and is therefore always
    # 0; draw a real random integer so each call gets its own seed.
    seed = random.randrange(2 ** 31)
    rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed", seed=seed)
    return rf, rf.fit(td)
def random_forest(df, columns, input_col, **kwargs):
    """
    Fit a random forest classifier on selected columns of a DataFrame.

    :param df: Pyspark dataframe to analyze.
    :param columns: Columns to select for prediction (resolved with parse_columns).
    :param input_col: Column to predict.
    :param kwargs: Keyword arguments forwarded to RandomForestClassifier.
    :return: Tuple (DataFrame with the model's predictions, fitted model).
    """
    columns = parse_columns(df, columns)

    # Every selected column except the target is used as a feature.
    feature_names = df.select(columns).columns
    feature_names.remove(input_col)

    df = string_to_index(df, input_cols=input_col)
    df = vector_assembler(df, input_cols=feature_names, output_col="features")
    classifier = RandomForestClassifier(**kwargs)

    df.table()

    # The indexed target becomes the "label" column.
    indexed_name = name_col(input_col, STRING_TO_INDEX)
    df = df.cols.rename(indexed_name, "label")

    fitted = classifier.fit(df)
    return fitted.transform(df), fitted
def testClassification(data):
    """Train a random forest on ``data`` and print its held-out predictions and AUC.

    The "label" column is string-indexed into "indexLabel", the data is split
    80/20 (seed 13), a RandomForestClassifier is fitted on the larger part
    and the smaller part is scored.

    :param data: DataFrame with a "label" column; the classifier uses its
        default features column.
    :return: None. Rows, (score, label) pairs and the areaUnderROC metric
        are printed.
    """
    # Index the raw labels; the classifier and the evaluator both read "indexLabel".
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    si_model = stringIndexer.fit(data)
    td = si_model.transform(data)

    rf = RandomForestClassifier(numTrees=5, maxDepth=4, labelCol="indexLabel", seed=13)
    trainData, testData = td.randomSplit([0.8, 0.2], 13)
    predictionDF = rf.fit(trainData).transform(testData)

    selected = predictionDF.select(
        'label', 'indexLabel', 'prediction', 'rawPrediction', 'probability')
    for row in selected.collect():
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(row)

    # DataFrame.map() was removed in Spark 2.0; map over the underlying RDD,
    # which is available on older releases as well.
    scoresAndLabels = predictionDF.rdd.map(
        lambda x: (float(x.probability.toArray()[1]), x.indexLabel))
    for sl in scoresAndLabels.collect():
        print(sl)

    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel',
                                              metricName='areaUnderROC')
    metric = evaluator.evaluate(selected)
    print(metric)
def basic_example(spark, resources_folder):
    """Compare decision tree, random forest and GBT classifiers on sample data.

    :param spark: SparkSession used to read the libsvm file.
    :param resources_folder: path prefix to which 'sample_libsvm_data.txt' is appended.
    :return: None. Predictions are shown and accuracies printed.
    """
    libsvm_path = resources_folder + 'sample_libsvm_data.txt'
    data = spark.read.format('libsvm').load(libsvm_path)
    data.printSchema()
    data.show()

    train_data, test_data = data.randomSplit([0.6, 0.4])

    # Order matters below: index 0 = decision tree, 1 = random forest, 2 = GBT.
    estimators = [DecisionTreeClassifier(), RandomForestClassifier(), GBTClassifier()]
    models = [estimator.fit(train_data) for estimator in estimators]
    predictions = [model.transform(test_data) for model in models]

    # GBT has no rawPrediction column; a binary or multiclass evaluator may
    # ask for rawPrediction as an input.
    for scored in predictions:
        scored.show()

    acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
    titles = ("DTC Accuracy", "RFC Accuracy", "GBTC Accuracy")
    for title, scored in zip(titles, predictions):
        print(title)
        print(acc_eval.evaluate(scored))

    rfc_model = models[1]
    print(rfc_model.featureImportances)
def test_sklearn_interaction():
    """Check that SHAP interaction values of an sklearn forest are symmetric."""
    import sklearn
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier

    # train a simple sklearn RF model on the iris dataset
    X, _ = shap.datasets.iris()
    X_train, _, Y_train, _ = train_test_split(
        *shap.datasets.iris(), test_size=0.2, random_state=0)
    forest = RandomForestClassifier(
        n_estimators=100, max_depth=None, min_samples_split=2, random_state=0)
    model = forest.fit(X_train, Y_train)

    # verify symmetry of the interaction values (this typically breaks if anything is wrong)
    interaction_vals = shap.TreeExplainer(model).shap_interaction_values(X)
    for i in range(len(interaction_vals)):
        per_output = interaction_vals[i]
        for j in range(len(per_output)):
            matrix = per_output[j]
            for k in range(len(matrix)):
                for l in range(len(matrix[k])):
                    difference = matrix[k][l] - matrix[l][k]
                    assert abs(difference) < 1e-4

    # ensure the interaction plot works
    shap.summary_plot(interaction_vals[0], X, show=False)
def build_randomForest(path):
    """Tune and fit a random forest on the data at ``path``.

    The data is loaded and prepared with the project helpers, the "Cabin",
    "Ticket" and "Name" columns are dropped, and "Survived" is string-indexed
    into "indexed", which is the label the classifier is trained on.

    :param path: location handed to ``load_data``.
    :return: tuple (best RandomForestClassificationModel found by
        cross-validation, average age from ``find_avg_age``).
    """
    df = load_data(path)
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)
    df = df.drop('Cabin').drop('Ticket').drop('Name')

    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show()

    rdf = RandomForestClassifier(labelCol='indexed')
    grid = (ParamGridBuilder()
            .addGrid(rdf.maxDepth, [1, 2, 3, 5, 6, 8, 10])
            .addGrid(rdf.numTrees, [1, 5, 10, 30, 50, 100, 200])
            .build())
    # Score against the same label column the classifier is trained on.
    evaluator = BinaryClassificationEvaluator(labelCol='indexed')
    cv = CrossValidator(estimator=rdf, estimatorParamMaps=grid, evaluator=evaluator)

    # The original fitted `rdf` directly, so the grid and the CrossValidator
    # were never used. Returning bestModel keeps the return type a plain
    # random forest model.
    cvModel = cv.fit(df).bestModel

    prediction = cvModel.transform(df)
    prediction.show()
    # %-formatting prints the same text on Python 2 and 3.
    print("classification evaluation : %s" % evaluator.evaluate(prediction))
    return cvModel, avg_age
def fit(self, df, maxDepth=5, maxBins=32, numTrees=20, regParam=0.0,
        featuresCol="features", ignoreCols=("id",)):
    """Fit one random forest per label column of ``df``.

    Every column other than ``featuresCol`` and those in ``ignoreCols`` is
    treated as a label. The fitted models are stored in ``self.models`` in
    the order of ``self.labelCols``.

    :param df: DataFrame holding the features column and the label columns.
    :param maxDepth: forwarded to RandomForestClassifier.
    :param maxBins: forwarded to RandomForestClassifier.
    :param numTrees: forwarded to RandomForestClassifier.
    :param regParam: accepted for interface compatibility; not used here.
    :param featuresCol: name of the features column.
    :param ignoreCols: column names that are neither features nor labels.
    :return: None.
    """
    self.featuresCol = featuresCol

    # Exclude the features column that was actually requested (the original
    # hard-coded "features") and tolerate ignored columns that are absent.
    excluded = set(ignoreCols)
    excluded.add(featuresCol)
    self.labelCols = [c for c in df.columns if c not in excluded]

    self.models = []
    for c in self.labelCols:
        classifier = RandomForestClassifier(featuresCol=featuresCol,
                                            labelCol=c,
                                            predictionCol=c + "_pred",
                                            probabilityCol=c + "_prob",
                                            rawPredictionCol=c + "_rpred",
                                            maxDepth=maxDepth,
                                            maxBins=maxBins,
                                            impurity="gini",
                                            numTrees=numTrees,
                                            seed=None)
        self.models.append(classifier.fit(df))
def LearningCurve(df, target):
    """Compute train/test error of a random forest for growing training samples.

    String columns other than ``target`` are string-indexed, all features are
    assembled into "features", and ``target`` is indexed into "label". The
    data is split 70/30 (seed 0) and a 200-tree forest is fitted on 10%,
    20%, ..., 100% of the training part.

    :param df: input DataFrame.
    :param target: name of the column to predict.
    :return: list of ``[percent, train_error, test_error]`` entries.
    """
    string_cols = [name for (name, dtype) in df.dtypes
                   if dtype == 'string' and name != target]
    num_cols = [x for x in df.columns if x not in string_cols and x != target]
    encoded_cols = [x + "_index" for x in string_cols]

    indexers = [
        StringIndexer(inputCol=column, outputCol=column + "_index").fit(df)
        for column in string_cols
    ]
    pipeline = Pipeline(stages=indexers)
    df_t = pipeline.fit(df).transform(df)

    cols_now = num_cols + encoded_cols
    assembler_features = VectorAssembler(inputCols=cols_now, outputCol='features')
    labelIndexer = StringIndexer(inputCol=target, outputCol="label")
    pipeline = Pipeline(stages=[assembler_features, labelIndexer])
    df_t = pipeline.fit(df_t).transform(df_t)
    df_t.cache()

    trainingData, testData = df_t.randomSplit([0.7, 0.3], seed=0)
    rf = RF(labelCol='label', featuresCol='features', numTrees=200)
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")

    # The training size does not change between iterations; count it once.
    train_count = trainingData.count()

    plot_points = []
    # Variable to be adjusted for increment in data%
    step_var = 10
    for i in range(step_var, 101, step_var):
        # Floor division keeps the sample size an int on Python 3 as well.
        sample_size = (i * train_count) // 100
        part_Data = trainingData.rdd.takeSample(False, sample_size, seed=i)
        part_Data = sqlContext.createDataFrame(part_Data)
        model = rf.fit(part_Data)

        # Calculating train error
        train_accuracy = evaluator.evaluate(model.transform(part_Data))
        train_error = 1 - train_accuracy

        # Calculating test error
        test_accuracy = evaluator.evaluate(model.transform(testData))
        test_error = 1 - test_accuracy

        plot_points.append([i, train_error, test_error])
    return plot_points
def random_forest(train, test, numTrees, impurity):
    """Fit a random forest on ``train`` and score it on ``test``.

    :param train: DataFrame with "features" and "label" columns.
    :param test: DataFrame to evaluate on.
    :param numTrees: number of trees for the forest.
    :param impurity: impurity criterion passed to the classifier.
    :return: the value produced by a default BinaryClassificationEvaluator.
    """
    # Train the model.
    classifier = RandomForestClassifier(labelCol="label",
                                        featuresCol="features",
                                        numTrees=numTrees,
                                        impurity=impurity,
                                        seed=13)
    fitted = classifier.fit(train)

    scored = fitted.transform(test)
    return BinaryClassificationEvaluator().evaluate(scored)
def rf(df):
    """Fit a 100-tree random forest on 70% of ``df``, save it and score the rest.

    The fitted model is saved to "s3a://ffinsight/model_rf".

    :param df: DataFrame with "features" and "label" columns.
    :return: predictions for the 30% hold-out part.
    """
    splits = df.randomSplit([0.7, 0.3], seed=0)
    trainingData, testData = splits

    forest = RF(featuresCol='features', labelCol='label', numTrees=100)
    model = forest.fit(trainingData)
    model.save("s3a://ffinsight/model_rf")

    return model.transform(testData)
def prediction(dataset):
    """Fit a random forest on 80% of ``dataset`` and predict the remaining 20%.

    :param dataset: DataFrame with "features" and "Survived" columns.
    :return: predictions for the hold-out part. The split is unseeded.
    """
    training, test = dataset.randomSplit([0.8, 0.2])
    forest = RandomForestClassifier(featuresCol='features',
                                    labelCol='Survived',
                                    maxDepth=5)
    return forest.fit(training).transform(test)
def predicted():
    """Fit a random forest on the module-level ``df`` and return its predictions.

    Features are 'Fare', 'Pclass' and 'Age', the label is 'Survived', and
    rows with missing values in those columns are dropped first.
    """
    assembler = VectorAssembler(inputCols=['Fare', 'Pclass', 'Age'],
                                outputCol='features')
    complete_rows = df.select('Fare', 'Pclass', 'Age', 'Survived').dropna()
    feat_df = assembler.transform(complete_rows)

    forest = RandomForestClassifier(labelCol='Survived', featuresCol='features')
    fitted = forest.fit(feat_df)
    # Predictions are made on the same rows the model was trained on.
    return fitted.transform(feat_df)
def run_random_forest_algorithm(tn_data, ts_data):
    """Fit a random forest on ``tn_data`` and report its performance on ``ts_data``.

    :param tn_data: training DataFrame with "scaled_features" and "output" columns.
    :param ts_data: DataFrame to score.
    :return: None. The predictions are handed to ``print_perf_eval``.
    """
    classifier = RandomForestClassifier(labelCol="output",
                                        featuresCol="scaled_features",
                                        predictionCol="prediction")
    fitted = classifier.fit(tn_data)
    print_perf_eval(fitted.transform(ts_data))
def random_forests(self):
    """Fit a random forest on the selected features and return its importances.

    The frame returned by ``self.select_feature()`` is reduced to the
    'features' and 'temperature' columns; 'temperature' is the label.

    :return: the fitted model's ``featureImportances`` (also printed).
    """
    selected = self.select_feature()
    classifier = RandomForestClassifier(featuresCol='features',
                                        labelCol='temperature')
    training_frame = selected.select('features', 'temperature')
    fitted = classifier.fit(training_frame)

    print(fitted.featureImportances)
    return fitted.featureImportances
def test_sklearn_random_forest_multiclass():
    """Check SHAP values of an sklearn forest on iris with classes 1 and 2 merged."""
    import shap
    from sklearn.ensemble import RandomForestClassifier

    X, y = shap.datasets.iris()
    # Fold class 2 into class 1, leaving two classes.
    y[y == 2] = 1

    model = RandomForestClassifier(random_state=0,
                                   n_estimators=100,
                                   max_depth=None,
                                   min_samples_split=2)
    model.fit(X, y)

    shap_values = shap.TreeExplainer(model).shap_values(X)

    # First sample, first feature: the two outputs are expected to be
    # +0.05 and -0.05 within tolerance.
    first_output = shap_values[0][0, 0]
    second_output = shap_values[1][0, 0]
    assert np.abs(first_output - 0.05) < 1e-3
    assert np.abs(second_output + 0.05) < 1e-3
def doRandomForestClassification(filename):
    """Run the random forest job for ``filename``, reporting progress as it goes.

    Progress is written to the 'spark-jobs' index through the module-level
    ``es`` client under the module-level ``task_id``.

    :param filename: input handed to ``openfile``.
    :return: tuple (accuracy, predictions DataFrame, fitted model).
    """
    def report(current, status):
        # One progress update for this job's document.
        es.update(index='spark-jobs',
                  doc_type='job',
                  id=task_id,
                  body={'doc': {'current': current, 'status': status}})

    stages = []
    report(30, 'Reading file..')
    df = openfile(filename)

    report(40, 'Mapping..')
    categoricalColumns = checkCategoricalColumns(df)
    numericColumns = checkNumericColumns(df)
    cols = allColumns(df)
    stages, df = indexInputColumns(categoricalColumns, stages, df)
    stages, df = indexOutputColumn(stages, 'deposit', df)
    stages, df = vectorAsFeatures(categoricalColumns, numericColumns, stages, df)
    selectedCols, df = pipelane(df, stages, cols)

    report(50, 'Splitting data to train and test..')
    train, test = splitDataToTrainAndTest(df)

    report(60, 'Training model..')
    forest = RandomForestClassifier(labelCol='label', featuresCol='features')
    rfModel = forest.fit(train)
    predictions = rfModel.transform(test)
    # The result of this select is discarded, exactly as in the original code.
    predictions.select('age', 'job', 'label', 'rawPrediction', 'prediction',
                       'probability')

    report(80, 'Calculating accuracy..')
    evaluator = binaryClassificationEvaluator(predictions)
    accuracy = evaluator.evaluate(predictions)
    return accuracy, predictions, rfModel
def trainModal(training_data):
    """Fit a random forest classifier on ``training_data`` and return the model.

    The label column is 'quality' and the features column is 'features'.
    """
    settings = dict(labelCol='quality',
                    featuresCol='features',
                    maxDepth=15,
                    maxBins=25,
                    numTrees=40)
    classifier = RandomForestClassifier(**settings)
    # Fit the configured estimator on the training data.
    return classifier.fit(training_data)
def random_Forest(train_data, test_data):
    """Fit a default random forest on ``train_data`` and print its score on ``test_data``.

    The printed value comes from a BinaryClassificationEvaluator with its
    default metric, although the output calls it "accuracy".

    :param train_data: training DataFrame (default label/features columns).
    :param test_data: DataFrame to evaluate on.
    :return: None.
    """
    # Create initial Random Forest model
    print("Accuracy of Random Forest Classifier :")
    rf = RandomForestClassifier()
    model = rf.fit(train_data)
    predictions = model.transform(test_data)
    evaluator = BinaryClassificationEvaluator(labelCol="label")
    accuracy = evaluator.evaluate(predictions)
    # The function form is used throughout: the original mixed it with a
    # Python 2 print statement, which is a SyntaxError on Python 3.
    print("The accuracy = %g" % accuracy)
def random_forest(data, test_data):
    """Fit a random forest on ``data`` and return predictions for ``test_data``.

    The "label" column of ``data`` is string-indexed into "indexed", which is
    the label the forest is trained on. ``test_data`` is scored as given.
    """
    label_indexer = StringIndexer(inputCol="label", outputCol="indexed").fit(data)
    indexed_train = label_indexer.transform(data)

    forest = RandomForestClassifier(labelCol="indexed",
                                    featuresCol='features',
                                    numTrees=200,
                                    maxDepth=20,
                                    seed=42)
    return forest.fit(indexed_train).transform(test_data)
def Train_Model(Training_Dataset, Model_Type):
    """Split the data 70/30 and fit the requested kind of model on the 70%.

    :param Training_Dataset: DataFrame with "features" and "label" columns.
    :param Model_Type: "LR" for logistic regression, "LRCV" for 5-fold
        cross-validated logistic regression; anything else trains a random
        forest.
    :return: tuple (fitted model, hold-out test DataFrame).
    """
    # set seed for reproducibility
    trainingData, testData = Training_Dataset.randomSplit([0.7, 0.3], seed=100)
    print("Training Dataset Count: " + str(trainingData.count()))
    print("Test Dataset Count: " + str(testData.count()))

    if Model_Type == "LR":
        from pyspark.ml.classification import LogisticRegression

        estimator = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
    elif Model_Type == "LRCV":
        from pyspark.ml.classification import LogisticRegression
        from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator

        # Evaluator used to score each fold
        evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")

        # Base estimator being tuned
        lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

        # Grid over the regularization strength and the elastic-net mixing
        # parameter (0.0 is pure ridge).
        builder = ParamGridBuilder()
        builder = builder.addGrid(lr.regParam, [0.1, 0.3, 0.5])
        builder = builder.addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2])
        paramGrid = builder.build()

        # 5-fold cross validation
        estimator = CrossValidator(estimator=lr,
                                   estimatorParamMaps=paramGrid,
                                   evaluator=evaluator,
                                   numFolds=5)
    else:
        from pyspark.ml.classification import RandomForestClassifier

        estimator = RandomForestClassifier(labelCol="label",
                                           featuresCol="features",
                                           numTrees=100,
                                           maxDepth=4,
                                           maxBins=32)

    # Train whichever estimator was selected on the training split
    model = estimator.fit(trainingData)
    return model, testData
def test_single_row_random_forest():
    """Check that SHAP values for a single row sum to the model output."""
    import shap
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    import sklearn

    X_train, X_test, Y_train, _ = train_test_split(
        *shap.datasets.adult(), test_size=0.2, random_state=0)
    clf = RandomForestClassifier(n_estimators=10, max_depth=10, random_state=202)
    clf.fit(X_train, Y_train)

    predicted = clf.predict_proba(X_test)
    ex = shap.TreeExplainer(clf)
    # Explain only the first test row.
    first_row = X_test.iloc[0, :]
    shap_values = ex.shap_values(first_row)

    # Attributions plus the expected value should reproduce the probability
    # predicted for output 0 on that row.
    reconstructed = shap_values[0].sum() + ex.expected_value[0]
    gap = np.abs(reconstructed - predicted[0, 0])
    assert gap < 1e-4, \
        "SHAP values don't sum to model output!"
def RF_model(td, n, m, s=50):
    """Fit a random forest on ``td`` after casting its label to double.

    :param td: training DataFrame with a "label" column.
    :param n: number of trees.
    :param m: maximum tree depth.
    :param s: random seed (default 50).
    :return: the fitted model.
    """
    label_as_double = td["label"].cast(DoubleType())
    td_new = td.withColumn("label", label_as_double)
    forest = RandomForestClassifier(labelCol="label",
                                    numTrees=n,
                                    maxDepth=m,
                                    maxBins=32,
                                    seed=s)
    return forest.fit(td_new)
def transform_predictions(dataframe, spark):
    """Train four classifiers on the blood-count features and collect their accuracies.

    The three admission columns are dropped, missing values are handled by
    ``dismiss_missing_values``, and the data is split 80/20 (seed 2020).

    :param dataframe: input DataFrame.
    :param spark: SparkSession used to build the result.
    :return: single-column DataFrame with the accuracies of, in order, the
        random forest, decision tree, logistic regression and
        gradient-boosted tree models.
    """
    label_col = 'SARS-Cov-2 exam result'

    df_transformed = dataframe.drop(
        "Patient addmited to regular ward (1=yes, 0=no)",
        "Patient addmited to semi-intensive unit (1=yes, 0=no)",
        "Patient addmited to intensive care unit (1=yes, 0=no)")
    df_transformed_no_missing = dismiss_missing_values(df_transformed)

    # build the dataset to be used as a rf_model base
    outcome_features = ["SARS-Cov-2 exam result"]
    required_features = ['Hemoglobin', 'Hematocrit', 'Platelets', 'Eosinophils',
                         'Red blood Cells', 'Lymphocytes', 'Leukocytes',
                         'Basophils', 'Monocytes']
    assembler = VectorAssembler(inputCols=required_features, outputCol='features')
    model_data = assembler.transform(df_transformed_no_missing)

    # split the dataset into train/test subgroups
    training_data, test_data = model_data.randomSplit([0.8, 0.2], seed=2020)

    def predict_with(estimator):
        # Fit on the training split and score the test split.
        return estimator.fit(training_data).transform(test_data)

    # Every model is scored with the same accuracy evaluator settings.
    accuracy_evaluator = MulticlassClassificationEvaluator(labelCol=label_col,
                                                           metricName='accuracy')

    # Random Forest classifier
    rf_predictions = predict_with(
        RandomForestClassifier(labelCol=label_col, featuresCol='features', maxDepth=5))
    rf_accuracy = accuracy_evaluator.evaluate(rf_predictions)

    # Decision Tree Classifier
    dt_predictions = predict_with(
        DecisionTreeClassifier(featuresCol='features', labelCol=label_col, maxDepth=3))
    dt_predictions.select(outcome_features + required_features).show(10)
    dt_accuracy = accuracy_evaluator.evaluate(dt_predictions)

    # Logistic Regression Model
    lr_predictions = predict_with(
        LogisticRegression(featuresCol='features', labelCol=label_col, maxIter=10))
    lr_accuracy = accuracy_evaluator.evaluate(lr_predictions)

    # Gradient-boosted Tree classifier Model
    gb_predictions = predict_with(
        GBTClassifier(labelCol=label_col, featuresCol='features'))
    gb_accuracy = accuracy_evaluator.evaluate(gb_predictions)

    accuracies = [rf_accuracy, dt_accuracy, lr_accuracy, gb_accuracy]
    rdd = spark.sparkContext.parallelize(accuracies)
    return spark.createDataFrame(rdd, FloatType())
def testClassification(train, test):
    """Fit a small random forest on ``train`` and print metrics for ``test``.

    :param train: DataFrame with an "indexedLabel" column; the classifier
        uses its default features column.
    :param test: DataFrame to score.
    :return: None. Weighted F-measure, precision and recall are printed.
    """
    # Train a RandomForest model.
    # Note: Use larger numTrees in practice.
    rf = RandomForestClassifier(labelCol="indexedLabel", numTrees=3, maxDepth=4)
    model = rf.fit(train)

    # DataFrame.map() was removed in Spark 2.0; map over the underlying RDD,
    # which is available on older releases as well.
    predictionAndLabels = (model.transform(test)
                           .select("prediction", "indexedLabel")
                           .rdd
                           .map(lambda x: (x.prediction, x.indexedLabel)))

    metrics = MulticlassMetrics(predictionAndLabels)
    print("weighted f-measure %.3f" % metrics.weightedFMeasure())
    # NOTE(review): calling precision()/recall() without a label may not be
    # supported on newer Spark releases -- confirm against the target version.
    print("precision %s" % metrics.precision())
    print("recall %s" % metrics.recall())
# In[509]: pca = PCA(inputCol="features", outputCol="pca", k=15).fit(train_df) train_df = pca.transform(train_df) test_df = pca.transform(test_df) # ## Classification algorithms # In[ ]: rf = RandomForestClassifier(labelCol="indexedResult", featuresCol="pca", numTrees=5000) #rf = RandomForestClassifier(labelCol="indexedResult", featuresCol="features", numTrees=5000) model = rf.fit(train_df) # ## Evaluation & results # In[ ]: label_to_str_map = {'2': 'HOME', '1': 'DRAW', '0': 'AWAY'} str_to_labelmap = {'HOME': '2', 'DRAW': '1', 'AWAY': '0'} predictions = model.transform(test_df).select("home_name", "away_name", "B365A", "B365D", "B365H", "probability", "indexedResult") length = test_df.count() correct = 0 total_profit = 0 for prediction in predictions.collect():
# MAGIC %md # MAGIC ####Random Forest # MAGIC # MAGIC Random Forests uses an ensemble of trees to improve model accuracy. # MAGIC # MAGIC You can read more about Random Forest from the programming guide [here](http://spark.apache.org/docs/latest/mllib-ensembles.html#random-forests). # COMMAND ---------- from pyspark.ml.classification import RandomForestClassifier # Create an initial RandomForest model. rf = RandomForestClassifier(labelCol="label", featuresCol="features") # Train model with Training Data rfModel = rf.fit(trainingData) # COMMAND ---------- # Make predictions on test data using the Transformer.transform() method. predictions = rfModel.transform(testData) # COMMAND ---------- predictions.printSchema() # COMMAND ---------- # View model's predictions and probabilities of each prediction class selected = predictions.select("label", "prediction", "probability", "age", "occupation") display(selected)
#                 1.],
#              [   1.,    0.,    1.,    0.,    2.,  183.,    0.,    1.,    0.,
#                 1.],
#              [   1.,    0.,    0.,    0.,    0.,    0.,  192.,    1.,    1.,
#                 0.],
#              [   0.,    0.,    0.,    0.,    0.,    0.,    1.,  187.,    5.,
#                 0.],
#              [   0.,    1.,    2.,    0.,    0.,    0.,    1.,    5.,  172.,
#                 4.],
#              [   0.,    0.,    0.,    0.,    3.,    0.,    0.,    2.,    2.,
#               176.]])

#section 8.3.2
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(maxDepth=20)
rfmodel = rf.fit(pendttrain)
# RandomForestModel doesn't expose trees field in Python
rfpredicts = rfmodel.transform(pendtvalid)
# DataFrame.map() was removed in Spark 2.0; mapping over .rdd works on older
# releases as well and yields the (prediction, label) pairs MulticlassMetrics
# is given below.
rfresrdd = rfpredicts.select("prediction", "label").rdd.map(lambda row: (row.prediction, row.label))
rfmm = MulticlassMetrics(rfresrdd)
rfmm.precision()
#0.9894640403114979
print(rfmm.confusionMatrix())
#DenseMatrix([[ 211.,    0.,    1.,    0.,    0.,    0.,    0.,    0.,    0.,
#                 0.],
#              [   0.,  220.,    0.,    1.,    0.,    0.,    0.,    0.,    0.,
#                 0.],
#              [   0.,    1.,  211.,    0.,    0.,    0.,    0.,    0.,    0.,
#                 0.],
#              [   0.,    0.,    0.,  175.,    1.,    0.,    0.,    0.,    0.,
#                 0.],
def _train_model_spark(self, data):
    """Fit the Spark model(s) selected by ``self._train_method`` and return them.

    ``data`` is prepared through ``self._prepare_data_spark``. When
    ``self._train_method`` is a dict, one model is trained for
    ``CHANGE_AMOUNT`` and one for ``CHANGE_DIRECTION`` and ``self._model``
    becomes a dict keyed by those two names. Otherwise a single model with
    ``TARGET_PRICE`` as label is trained.

    :param data: input table; its ``keys()`` must support ``difference``
        (presumably a pandas DataFrame -- TODO confirm against callers).
    :return: ``self._model`` after training.
    :raises ValueError: if a requested training method is not supported.
    """
    df = self._prepare_data_spark(data)

    # Number of input features: every key that is not one of the target columns.
    input_num = len(data.keys().difference({self.CHANGE_AMOUNT,
                                            self.CHANGE_DIRECTION,
                                            self.TARGET_PRICE,
                                            self.TODAY_PRICE}))

    if self.ann_hidden_nodes_num is None:
        # Floor division keeps the layer size an int on Python 3 as well;
        # plain "/" would produce a float there.
        self.ann_hidden_nodes_num = input_num // 2 + 1
    ann_layers = [input_num,
                  # input_num / 3 * 2,
                  # input_num / 3,
                  self.ann_hidden_nodes_num,
                  2]

    self.logger.info('layer settings are {}'.format(ann_layers))
    self.logger.info('training method is {}'.format(self._train_method))
    self.logger.info('trees num is {}'.format(self.random_forest_tree_number))

    if isinstance(self._train_method, dict):
        # Stop the server of a previous neural-network amount model before
        # the model dict is replaced.
        if self._model is not None and \
                self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
            self._model[self.CHANGE_AMOUNT].stop_server()

        self._model = {self.CHANGE_AMOUNT: None,
                       self.CHANGE_DIRECTION: None}

        # --- model for the change amount ---
        if self._train_method[self.CHANGE_AMOUNT] == self.LINEAR_REGRESSION:
            lr = LinearRegression(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                  maxIter=self.linear_regression_training_times,
                                  regParam=self.linear_regression_regularization_parameter,
                                  predictionCol='AmountPrediction')
            self._model[self.CHANGE_AMOUNT] = lr.fit(df)
        elif self._train_method[self.CHANGE_AMOUNT] == self.RANDOM_FOREST:
            rfr = RandomForestRegressor(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                        numTrees=self.random_forest_tree_number,
                                        maxDepth=self.random_forest_tree_max_depth,
                                        predictionCol='AmountPrediction')
            self._model[self.CHANGE_AMOUNT] = rfr.fit(df)
        elif self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
            # A single output node for this model.
            ann_layers[-1] = 1
            self._model[self.CHANGE_AMOUNT] = KerasNeuralNetworkSpark(
                layers=ann_layers,
                spark=self._spark,
                num_workers=self.spark_worker_numbers,
                epoch=self.ann_epoch_number,
                featuresCol="features",
                labelCol=self.CHANGE_AMOUNT,
                predictionCol='AmountPrediction')
            self._model[self.CHANGE_AMOUNT].fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))

        # --- model for the change direction ---
        if self._train_method[self.CHANGE_DIRECTION] == self.LOGISTIC_REGRESSION:
            # NOTE(review): regParam reuses the *linear* regression
            # regularization setting, as in the original -- confirm intended.
            lr = LogisticRegression(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                    maxIter=self.logistic_regression_training_times,
                                    regParam=self.linear_regression_regularization_parameter,
                                    predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = lr.fit(df)
        elif self._train_method[self.CHANGE_DIRECTION] == self.RANDOM_FOREST:
            rfc = RandomForestClassifier(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                         numTrees=self.random_forest_tree_number,
                                         maxDepth=self.random_forest_tree_max_depth,
                                         predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = rfc.fit(df)
        elif self._train_method[self.CHANGE_DIRECTION] == self.ARTIFICIAL_NEURAL_NETWORK:
            # Two output nodes for the direction classifier.
            ann_layers[-1] = 2
            mlpc = MultilayerPerceptronClassifier(featuresCol="features",
                                                  labelCol=self.CHANGE_DIRECTION,
                                                  layers=ann_layers,
                                                  predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = mlpc.fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))
    else:
        # Single-model mode with TARGET_PRICE as the label.
        if self._train_method == self.LINEAR_REGRESSION:
            lr = LinearRegression(featuresCol="features", labelCol=self.TARGET_PRICE,
                                  predictionCol='prediction',
                                  regParam=self.linear_regression_regularization_parameter,
                                  maxIter=self.linear_regression_training_times)
            self._model = lr.fit(df)
        elif self._train_method == self.RANDOM_FOREST:
            rfr = RandomForestRegressor(featuresCol="features", labelCol=self.TARGET_PRICE,
                                        predictionCol='prediction',
                                        numTrees=self.random_forest_tree_number,
                                        maxDepth=self.random_forest_tree_max_depth)
            self._model = rfr.fit(df)
        elif self._train_method == self.ARTIFICIAL_NEURAL_NETWORK:
            ann_layers[-1] = 1
            if self._model is not None:
                self._model.stop_server()
            self.logger.warn('layers are {}'.format(ann_layers))
            # NOTE(review): epoch is fixed at 100 here, while the dict branch
            # uses self.ann_epoch_number -- confirm this is intended.
            self._model = KerasNeuralNetworkSpark(layers=ann_layers,
                                                  spark=self._spark,
                                                  num_workers=self.spark_worker_numbers,
                                                  epoch=100,
                                                  featuresCol="features",
                                                  labelCol=self.TARGET_PRICE,
                                                  predictionCol='prediction')
            self._model.fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))

    return self._model
def _load_pickled_log(path):
    """Return the list pickled at *path*, or an empty list if it is unusable.

    A missing/unreadable file, an empty or truncated file, or a pickle that
    does not contain a list all yield a fresh empty log.
    """
    import pickle

    # NOTE: pickle must only be used on trusted files; this log is written by
    # this script itself (see _save_pickled_log).
    try:
        with open(path, "rb") as log_file:
            log = pickle.load(log_file)
    except (IOError, EOFError):
        return []
    return log if isinstance(log, list) else []


def _save_pickled_log(log, path):
    """Pickle *log* to *path*, closing the file even if pickling fails."""
    import pickle

    with open(path, "wb") as log_file:
        pickle.dump(log, log_file)


def main(base_path):
    """Train and evaluate a random forest flight-delay classifier with Spark ML.

    Reads ``{base_path}/data/simple_flight_delay_features_airplanes.json``,
    derives hour-of-day columns, buckets ``ArrDelay``, indexes the string
    columns, assembles a feature vector and then runs ``split_count`` random
    train/test splits. The fitted transformers and the latest model are saved
    under ``{base_path}/models``. Metric averages and feature importances are
    printed and appended to pickle logs so each run can be compared with the
    previous one.

    Args:
        base_path: Project root directory containing ``data/`` and ``models/``.
    """
    APP_NAME = "train_spark_mllib_model.py"

    # Reuse a running SparkContext if there is one, otherwise create it.
    # (Probing for `sc`/`spark` with try/except NameError cannot work inside a
    # function: both names are locals here, so the probe always failed and a
    # second SparkContext was always constructed.)
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql

    sc = pyspark.SparkContext.getOrCreate()
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # Example input record:
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Route", StringType(), True),
        StructField("TailNum", StringType(), True),
        StructField("EngineManufacturer", StringType(), True),
        StructField("EngineModel", StringType(), True),
        StructField("Manufacturer", StringType(), True),
        StructField("ManufacturerYear", StringType(), True),
        StructField("OwnerState", StringType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
        base_path
    )
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Add the hour of day of scheduled arrival/departure
    #
    from pyspark.sql.functions import hour

    features_with_hour = features.withColumn(
        "CRSDepHourOfDay",
        hour(features.CRSDepTime)
    )
    features_with_hour = features_with_hour.withColumn(
        "CRSArrHourOfDay",
        hour(features.CRSArrTime)
    )
    features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show()

    #
    # Check for nulls in features before using Spark ML
    #
    null_counts = [
        (column, features_with_hour.where(features_with_hour[column].isNull()).count())
        for column in features_with_hour.columns
    ]
    cols_with_nulls = [entry for entry in null_counts if entry[1] > 0]
    print("\nNull Value Report")
    print("-----------------")
    print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay. The five split
    # points below define four buckets (0-3): below -15, -15 to 0, 0 to 30,
    # and 30 or more.
    #
    from pyspark.ml.feature import Bucketizer

    # Setup the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(
        splits=splits,
        inputCol="ArrDelay",
        outputCol="ArrDelayBucket"
    )

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Feature extraction tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    string_columns = ["Carrier", "Origin", "Dest", "Route", "TailNum"]
    for column in string_columns:
        string_indexer = StringIndexer(
            inputCol=column,
            outputCol=column + "_index"
        )

        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)

        # Save the fitted indexer for this column
        string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
            base_path,
            column
        )
        string_indexer_model.write().overwrite().save(string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = [
        "DepDelay", "Distance",
        "DayOfYear",
        "CRSDepHourOfDay",
        "CRSArrHourOfDay"]
    index_columns = [column + "_index" for column in string_columns]

    vector_assembler = VectorAssembler(
        inputCols=numeric_columns + index_columns,
        outputCol="Features_vec"
    )
    final_vectorized_features = vector_assembler.transform(ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    #
    # Cross validate, train and evaluate classifier: loop `split_count` times,
    # scoring every metric in `metric_names` on each split
    #
    from collections import defaultdict
    from pyspark.ml.classification import RandomForestClassifier
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    scores = defaultdict(list)
    feature_importances = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3

    # The estimator, evaluators, output path and feature names do not depend
    # on the split, so they are built once rather than on every iteration.
    rfc = RandomForestClassifier(
        featuresCol="Features_vec",
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        maxBins=4896,
    )
    evaluators = [
        (
            metric_name,
            MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name
            ),
        )
        for metric_name in metric_names
    ]
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
        base_path
    )
    feature_names = vector_assembler.getInputCols()

    for i in range(1, split_count + 1):
        print("\nRun {} out of {} of test/train splits in cross validation...".format(
            i,
            split_count,
        )
        )

        # Test/train split
        training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])

        # Fit the random forest classifier on the training split
        model = rfc.fit(training_data)

        # Save the new model over the old one
        model.write().overwrite().save(model_output_path)

        # Evaluate model using test data
        predictions = model.transform(test_data)

        # Evaluate this split's results for each metric
        for metric_name, evaluator in evaluators:
            score = evaluator.evaluate(predictions)

            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

        #
        # Collect feature importances
        #
        feature_importance_list = model.featureImportances
        for feature_name, feature_importance in zip(feature_names, feature_importance_list):
            feature_importances[feature_name].append(feature_importance)

    #
    # Evaluate average and STD of each metric and print a table
    #
    import numpy as np

    score_averages = defaultdict(float)

    # Compute the table data
    average_stds = []
    for metric_name in metric_names:
        metric_scores = scores[metric_name]

        average_accuracy = sum(metric_scores) / len(metric_scores)
        score_averages[metric_name] = average_accuracy

        std_accuracy = np.std(metric_scores)

        average_stds.append((metric_name, average_accuracy, std_accuracy))

    # Print the table
    print("\nExperiment Log")
    print("--------------")
    print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))

    #
    # Persist the scores to a score log that exists between runs
    #

    # Load the score log or initialize an empty one
    score_log_filename = "{}/models/score_log.pickle".format(base_path)
    score_log = _load_pickled_log(score_log_filename)

    # Compute this run's score log entry
    score_log_entry = {
        metric_name: score_averages[metric_name] for metric_name in metric_names
    }

    # Compute and display the change in score for each metric. On the first
    # run there is no previous entry, so every delta is reported as zero.
    last_log = score_log[-1] if score_log else score_log_entry

    experiment_report = []
    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        experiment_report.append((metric_name, run_delta))

    print("\nExperiment Report")
    print("-----------------")
    print(tabulate(experiment_report, headers=["Metric", "Score"]))

    # Append this run's average scores to the log and persist it for next run
    score_log.append(score_log_entry)
    _save_pickled_log(score_log, score_log_filename)

    #
    # Analyze and report feature importance changes
    #
    import operator

    # Compute averages for each feature
    feature_importance_entry = defaultdict(float)
    for feature_name, value_list in feature_importances.items():
        average_importance = sum(value_list) / len(value_list)
        feature_importance_entry[feature_name] = average_importance

    # Sort the feature importances in descending order and print
    sorted_feature_importances = sorted(
        feature_importance_entry.items(),
        key=operator.itemgetter(1),
        reverse=True
    )

    print("\nFeature Importances")
    print("-------------------")
    print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))

    #
    # Compare this run's feature importances with the previous run's
    #

    # Load the feature importance log or initialize an empty one
    feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
    feature_log = _load_pickled_log(feature_log_filename)

    # Pick the previous run's entry; on the first run compare against a copy
    # of the current entry so that every delta is zero.
    if feature_log:
        last_feature_log = feature_log[-1]
    else:
        last_feature_log = defaultdict(float, feature_importance_entry)

    # Compute the deltas
    feature_deltas = {}
    for feature_name in feature_importances:
        run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name]
        feature_deltas[feature_name] = run_delta

    # Sort feature deltas, biggest change first
    sorted_feature_deltas = sorted(
        feature_deltas.items(),
        key=operator.itemgetter(1),
        reverse=True
    )

    # Display sorted feature deltas
    print("\nFeature Importance Delta Report")
    print("-------------------------------")
    print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))

    # Append this run's average importances to the log and persist it
    feature_log.append(feature_importance_entry)
    _save_pickled_log(feature_log, feature_log_filename)
# Load the training CSV, drop the header line and split each record on commas.
# NOTE(review): `titile` (sic) is defined outside this snippet; presumably it
# holds the CSV header line - confirm.
rdd = sc.textFile("/user/demo/train.csv").filter(lambda x: x != titile).\
    map(lambda x: x.split(","))

# Hash space used to turn each raw field into a number.
D = 2 ** 24
# A well-formed record has an id, the feature fields and a trailing target.
NUM_COLUMNS = 1934
NUM_FEATURES = NUM_COLUMNS - 2


def helper1(r):
    """Convert one split CSV record into a ``(label, feature vector)`` pair.

    Every field between the id (first) and the target (last) is hashed
    together with its column position into ``[0, D)``. A record whose id or
    target is not numeric is replaced by a zero label with an all-zero vector
    of the same width, so every row has the same column types.

    NOTE(review): on Python 3 the built-in ``hash`` of a string varies between
    processes unless PYTHONHASHSEED is fixed - confirm the cluster sets it.
    """
    try:
        fe = r[1:-1]
        features = [
            float(abs(hash("VAR_" + '{0:04}'.format(i) + value))) % D
            for i, value in enumerate(fe)
        ]
        target = float(r[-1])
        # The id is not used, but a non-numeric id still marks the record as
        # malformed, as it did before.
        float(r[0])
        return target, Vectors.dense(features)
    except (ValueError, TypeError):
        # Return a vector (not a plain list) so the fallback row matches the
        # type of the "features" column built from the well-formed rows.
        return (0.0, Vectors.dense([0.0] * NUM_FEATURES))


# Keep only records with the expected number of fields.
new_rdd = rdd.filter(lambda i: len(i) == NUM_COLUMNS)
rdd_after_trans = new_rdd.map(helper1)
rdd_after_trans.cache()

df = sqlContext.createDataFrame(rdd_after_trans, ["label", "features"])
(trainingData, testData) = df.randomSplit([0.7, 0.3])

# Index the labels, then train the forest on the indexed label column.
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(trainingData)
td = si_model.transform(trainingData)
rf = RandomForestClassifier(numTrees=50, maxDepth=25, labelCol="indexed", seed=42)
model = rf.fit(td)

# Write "<label>,<probability of class index 0>" for every test row.
result = model.transform(testData).rdd.map(lambda r: str(r.label) + ',' + str(r.probability[0]))
result.saveAsTextFile("/user/demo/rf_50_25")
# Show the assembled feature rows before training.
final_vectorized_features.show()

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#
# Hold out 30% of the rows, fit a random forest on the remaining 70% and
# score it on the hold-out set.
#
training_data, test_data = final_vectorized_features.randomSplit([0.7, 0.3])

rfc = RandomForestClassifier(
    maxMemoryInMB=1024,
    maxBins=4657,
    labelCol="ArrDelayBucket",
    featuresCol="Features_vec",
)
model = rfc.fit(training_data)

# Predict the hold-out rows and measure accuracy against the bucket label.
predictions = model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="ArrDelayBucket",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy = {}".format(accuracy))

# Show six sampled predictions ordered by scheduled departure time.
predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
'title_subjectivity', 'title_sentiment_polarity', 'abs_title_subjectivity', 'abs_title_sentiment_polarity'],outputCol='features' )
# NOTE(review): the statement above starts before this snippet; presumably it
# builds `assembler`, whose output column is 'features' - confirm.

# Apply the assembler and keep only the 'features' and 'shares' columns.
new_data = assembler.transform(data)
final_data = new_data.select('features','shares')

# Discretize 'shares' into two quantile buckets; the bucket is written to the
# 'result' column, which is used as the classification label below.
from pyspark.ml.feature import QuantileDiscretizer
discretizer = QuantileDiscretizer(numBuckets=2, inputCol="shares", outputCol="result")
result = discretizer.fit(final_data).transform(final_data)
finalData = result.select('result','features')

# Fit a 250-tree random forest on a random 70/30 train/test split.
from pyspark.ml.classification import RandomForestClassifier
rfc = RandomForestClassifier(numTrees=250,labelCol='result',featuresCol='features')
train_data,test_data = finalData.randomSplit([0.7,0.3])
rfc_model = rfc.fit(train_data)
# `result` is rebound here: from the discretized frame to the test predictions.
result = rfc_model.transform(test_data);

from pyspark.ml.evaluation import BinaryClassificationEvaluator
# NOTE(review): despite the `acc_eval` name, no metricName is passed, so the
# printed value is the evaluator's default metric (presumably areaUnderROC,
# not accuracy) - confirm against the PySpark version in use.
acc_eval = BinaryClassificationEvaluator(labelCol='result')
print(acc_eval.evaluate(result))
# The returned row is discarded; this only has a visible effect in a notebook/REPL.
test_data.head(1)

# Disabled plotting setup, kept as found:
# import os, sys
# import pandas
# import plotly.plotly as py
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import cufflinks as cf
# import plotly.graph_objs as go
# init_notebook_mode(connected=True)
# sys.path.append("".join([os.environ["HOME"]]))
# Keep only the label, the feature vector and the row id.
selectData = transformDF.select("label", "features", "id")

# RDD of (id, LabeledPoint) pairs. Map over `.rdd` explicitly: DataFrame has
# no `map` method on Spark 2+, while `.rdd.map` works on both 1.x and 2+.
lpSelectData = selectData.rdd.map(lambda x: (x.id, LabeledPoint(x.label, x.features)))

# Index the labels for the random forest: fit the indexer on the data and add
# the "indexed" column.
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(selectData)
transformData = si_model.transform(selectData)

# Split the data 60/40 into training and test sets.
(trainingData, testData) = transformData.randomSplit([0.6, 0.4])

# Train the random forest on the indexed label column.
randomForest = RandomForestClassifier(numTrees=2, maxDepth=2, labelCol="indexed", seed=42)
randomForestModel = randomForest.fit(trainingData)

# Predict the test rows.
result = randomForestModel.transform(testData)

# Accuracy = correctly predicted rows / all test rows. The model was trained
# on "indexed", so its predictions are label indices and must be compared with
# "indexed", not with the raw "label" values.
accuracy = result.filter(result.indexed == result.prediction).count() / float(testData.count())
print("Accuracy = " + str(accuracy))