def train_random_forest(df):
    """Index the `label` column of `df` and fit a small random forest on it.

    :param df: DataFrame with `label` and `features` columns.
    :return: tuple of (unfitted RandomForestClassifier estimator,
             fitted RandomForestClassificationModel)
    """
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    # BUG FIX: the original used seed=int(random.random()).  random.random()
    # returns a float in [0, 1), so int() of it is ALWAYS 0 — the "random"
    # seed was constant.  Draw a genuine random seed instead.
    rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed",
                                seed=random.randrange(2 ** 31))
    return rf, rf.fit(td)
def build_randomForest(path):
    """Load the Titanic data at `path`, prepare it, and grid-search a random
    forest over maxDepth and numTrees via cross validation.

    :param path: input data location passed to load_data().
    :return: tuple of (fitted CrossValidatorModel, average age used for
             missing-age imputation — callers need it to prepare new data)
    """
    df = load_data(path)
    avg_age = find_avg_age(df)
    df = data_preparation(df, avg_age)
    # Drop free-text / high-cardinality columns that are not usable features.
    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')
    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show()
    rdf = RandomForestClassifier(labelCol='indexed')
    grid = (ParamGridBuilder()
            .addGrid(rdf.maxDepth, [1, 2, 3, 5, 6, 8, 10])
            .addGrid(rdf.numTrees, [1, 5, 10, 30, 50, 100, 200])
            .build())
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=rdf, estimatorParamMaps=grid,
                        evaluator=evaluator)
    # BUG FIX: the original called rdf.fit(df), which trained one default
    # forest and silently ignored the whole parameter grid.  Fit the
    # CrossValidator so the grid search actually runs.
    cvModel = cv.fit(df)
    prediction = cvModel.transform(df)
    prediction.show()
    # py2 print statement converted to the py2/py3-compatible call form.
    print("classification evaluation : " + str(evaluator.evaluate(prediction)))
    return cvModel, avg_age
def testClassification(data):
    """Train a RandomForest on 80% of `data` and print per-row predictions,
    (score, label) pairs, and the area under the ROC curve on the 20% held out.

    :param data: DataFrame with `label` and `features` columns.
    """
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    si_model = stringIndexer.fit(data)
    td = si_model.transform(data)
    rf = RandomForestClassifier(numTrees=5, maxDepth=4,
                                labelCol="indexLabel", seed=13)
    trainData, testData = td.randomSplit([0.8, 0.2], 13)
    predictionDF = rf.fit(trainData).transform(testData)
    selected = predictionDF \
        .select('label', 'indexLabel', 'prediction', 'rawPrediction', 'probability')
    for row in selected.collect():
        print(row)
    # BUG FIX: DataFrame has no .map() in Spark 2+; convert through .rdd.
    # probability.toArray()[1] is the score of the positive class.
    scoresAndLabels = predictionDF.rdd \
        .map(lambda x: (float(x.probability.toArray()[1]), x.indexLabel))
    for sl in scoresAndLabels.collect():
        print(sl)
    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel',
                                              metricName='areaUnderROC')
    metric = evaluator.evaluate(selected)
    print(metric)
def testClassification(train, test):
    """Train a RandomForest on `train` and print weighted F-measure,
    precision and recall computed on `test`.

    :param train: DataFrame with `indexedLabel` and default `features` columns.
    :param test: DataFrame with the same schema.
    """
    # Setting featureSubsetStrategy="auto" lets the algorithm choose.
    # Note: use a larger numTrees in practice.
    rf = RandomForestClassifier(labelCol="indexedLabel", numTrees=3, maxDepth=4)
    model = rf.fit(train)
    # BUG FIX: DataFrame lost .map() in Spark 2.x; MulticlassMetrics wants an
    # RDD of (prediction, label) pairs, so go through .rdd explicitly.
    predictionAndLabels = model.transform(test) \
        .select("prediction", "indexedLabel").rdd \
        .map(lambda x: (x.prediction, x.indexedLabel))
    metrics = MulticlassMetrics(predictionAndLabels)
    print("weighted f-measure %.3f" % metrics.weightedFMeasure())
    print("precision %s" % metrics.precision())
    print("recall %s" % metrics.recall())
# Train an under-sampled random forest on the accident dataset and (attempt
# to) dump its hyper-parameters.  `workdir`, `i` and the get_* helpers come
# from earlier, unseen code.
# NOTE(review): this chunk appears truncated — write_params extracts each
# stage's param map but never writes anything to `file`; confirm the rest of
# the function exists upstream.
mkdir(workdir + f'data/urf_{i}')
spark = init_spark()
# sample(1.0) keeps all rows; presumably here for API symmetry — TODO confirm.
neg_samples = get_negative_samples(spark).sample(1.0)
pos_samples = get_positive_samples(spark)
imbalance_ratio = (neg_samples.count() / pos_samples.count())
train_set, test_set = get_dataset_df(spark, pos_samples, neg_samples)
train_set, test_set = train_set.persist(), test_set.persist()
rf = RandomForestClassifier(labelCol="label",
                            featuresCol="features",
                            cacheNodeIds=True,
                            maxDepth=17,
                            impurity='entropy',
                            featureSubsetStrategy='sqrt',
                            minInstancesPerNode=10,
                            numTrees=100,
                            subsamplingRate=1.0,
                            maxMemoryInMB=768)
# Under-sample the majority class down to a 1:1 ratio before the forest.
ru = (RandomUnderSampler().setIndexCol('sample_id').setTargetImbalanceRatio(1.0))
pipeline = Pipeline().setStages([ru, rf])
model = pipeline.fit(train_set)

# Write model hyper-parameters
def write_params(model, path):
    with open(path, 'w') as file:
        for stage in model.stages:
            params = stage.extractParamMap()
# Notebook cells: fit a logistic-regression baseline and a random forest on
# the same train/test split, then prepare LR predictions for mllib metrics.
# train_ML / test_ML are built in earlier (unseen) cells.
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol='features', labelCol='label', maxIter=100)
lrModel = lr.fit(train_ML)

# In[36]:
predictions = lrModel.transform(test_ML)

# - Random Forest
# In[37]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol='label', featuresCol='features',
                            numTrees=20, maxDepth=20)
rfModel = rf.fit(train_ML)

# In[38]:
predictionsrf = rfModel.transform(test_ML)

# - Evaluation Metrics
# In[40]:
from pyspark.mllib.evaluation import MulticlassMetrics

# MulticlassMetrics needs an RDD, hence the .rdd conversion.
results = predictions.select(['prediction', 'label'])
predictionAndLabels = results.rdd
# Index labels, adding metadata to the label column. # Fit on whole dataset to include all labels in index. labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data) # Automatically identify categorical features, and index them. # Set maxCategories so features with > 4 distinct values are treated as continuous. featureIndexer =\ VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data) # Split the data into training and test sets (30% held out for testing) (trainingData, testData) = data.randomSplit([0.7, 0.3]) # Train a RandomForest model. rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", numTrees=10) # Convert indexed labels back to original labels. labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=labelIndexer.labels) # Chain indexers and forest in a Pipeline pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf, labelConverter]) # Train model. This also runs the indexers. model = pipeline.fit(trainingData) # Make predictions. predictions = model.transform(testData)
# NOTE(review): this chunk starts mid-statement — the head of the
# VectorAssembler(inputCols=[...]) call lives in an earlier, unseen chunk.
'imu_46', 'imu_47', 'imu_48', 'imu_49', 'imu_50'], outputCol='VectorFeatures')

# Random Forest classifier: assemble IMU columns, fit, and log wall-clock
# time and accuracy to per-user files.
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import time
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

classifier = RandomForestClassifier(labelCol='Activity_id',
                                    featuresCol='VectorFeatures',
                                    numTrees=200)
int_pipe = Pipeline(stages=[vec_assembler, classifier])

# Timed region covers both fit and transform.
starttime = time.time()
model = int_pipe.fit(train)
prediction = model.transform(test)
t = time.time() - starttime
print("Time Taken = ", t)

acc_eval = MulticlassClassificationEvaluator(metricName='accuracy',
                                             labelCol='Activity_id')
a = acc_eval.evaluate(prediction)
print("Accuracy = ", a)

# Append results to log files (a+ keeps history across runs).
f = open("/N/u/risnaga/time.txt", 'a+')
f.write("Time Taken = " + str(t) + '\n')
f.close()
f = open("/N/u/risnaga/accu.txt", 'a+')
f.write("Accuracy = " + str(a) + '\n')
f.close()
# Run the feature pipeline on df_yog, split, and time RandomForest training
# and scoring separately.  stages_feat / cloum_set / time come from earlier,
# unseen code.
pipeline = Pipeline(stages=stages_feat)
pipelineModel = pipeline.fit(df_yog)
df_yog = pipelineModel.transform(df_yog)
selected_Cols = ['Pred_Label', 'features_all'] + cloum_set
df_yog = df_yog.select(selected_Cols)
# df_yog.printSchema()
# splits = df_yog.randomSplit([0.6,0.4], 1234)
training_data, testing_data = df_yog.randomSplit([0.6805, 0.3195], seed=99999999)
print("Training Dataset Count: " + str(training_data.count()))
print("Test Dataset Count: " + str(testing_data.count()))

# Model
RandomForest = RandomForestClassifier(labelCol="Pred_Label",
                                      featuresCol="features_all", numTrees=10)
start = time.time()
RandomForestModel = RandomForest.fit(training_data)
end = time.time()
start1 = time.time()  # start1/end1 time the transform step
f_predictions = RandomForestModel.transform(testing_data)
end1 = time.time()
# #PRINT CONFUSION MATRIX
# Cm=f_predictions.select("PoutLabel","label").distinct().toPandas()
# f_predictions.groupBy("PoutLabel","prediction").count().show()
print("Time to train:")
print(end - start)
# In[53]: data.show() (trainingData, testData) = data.randomSplit([0.7, 0.3]) # In[54]: from pyspark.ml import Pipeline from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier, DecisionTreeClassifier) from pyspark.ml.evaluation import MulticlassClassificationEvaluator (trainingData, testData) = data.randomSplit([0.7, 0.3]) rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=20) model_rf = rf.fit(trainingData) # In[55]: prediction_rf = model_rf.transform(testData) # In[56]: prediction_rf.show() evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy") accuracy = evaluator.evaluate(prediction_rf) accuracy
# Assemble columns A-D into a feature vector and fit a random forest on the
# FULL dataset (no train/test split here — presumably the model is used for
# feature-importance inspection; confirm downstream usage).
df.describe().show()
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['A', 'B', 'C', 'D'],
                            outputCol='features')
output = assembler.transform(df)
output.printSchema()
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, DecisionTreeClassifier
rfc = RandomForestClassifier(labelCol='Spoiled', featuresCol='features')
final_data = output.select('features', 'Spoiled')
final_data.show()
rfcModel = rfc.fit(final_data)
def main(base_path):
    """Train and persist a flight-delay RandomForest model plus every
    feature-engineering artifact it needs at serving time (bucketizers,
    per-column string indexers, vector assembler).

    NOTE(review): the model is evaluated on the same data it was trained on,
    so the printed accuracy is optimistic — confirm this is intentional.
    """
    # Default to "." when base_path is unset or empty.
    try:
        base_path
    except NameError:
        base_path = "."
    if not base_path:
        base_path = "."

    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment.
    try:
        sc and spark
    except (NameError, UnboundLocalError) as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql
        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    # Example input record:
    # {"ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00",
    #  "CRSDepTime":"2015-12-31T03:05:00.000-08:00","Carrier":"WN",
    #  "DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,
    #  "Dest":"SAN","Distance":368.0,"FlightDate":"2015-12-30...",
    #  "FlightNum":"6109","Origin":"TUS"}
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features.jsonl.bz2".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    # Check for nulls in features before using Spark ML.
    null_counts = [(column, features.where(features[column].isNull()).count())
                   for column in features.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print(list(cols_with_nulls))

    # Add a Route variable to replace FlightNum.
    from pyspark.sql.functions import lit, concat
    features_with_route = features.withColumn(
        'Route',
        concat(features.Origin, lit('-'), features.Dest))
    features_with_route.show(6)

    # Bucketize ArrDelay into on-time / slightly late / very late (0, 1, 2).
    from pyspark.ml.feature import Bucketizer

    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")
    # Save the bucketizer.
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(
        base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Setup the Departure Bucketizer for other examples (saved, not applied).
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    departure_bucketizer = Bucketizer(splits=splits,
                                      inputCol="DepDelay",
                                      outputCol="DepDelayBucket")
    departure_bucketizer_path = "{}/models/departure_bucketizer.bin".format(
        base_path)
    departure_bucketizer.write().overwrite().save(departure_bucketizer_path)

    # Apply the arrival bucketizer.
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    # Turn category fields into indexes.
    from pyspark.ml.feature import StringIndexer, VectorAssembler
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)
        # Drop the original column.
        ml_bucketized_features = ml_bucketized_features.drop(column)
        # Save the fitted indexer, one file per column.
        string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # into one feature vector.
    numeric_columns = [
        "DepDelay", "Distance", "DayOfMonth", "DayOfWeek", "DayOfYear"
    ]
    index_columns = [
        "Carrier_index", "Origin_index", "Dest_index", "Route_index"
    ]
    vector_assembler = VectorAssembler(
        inputCols=numeric_columns + index_columns,
        outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler.
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(
        base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns (now folded into Features_vec).
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features.
    final_vectorized_features.show()

    # Instantiate and fit random forest classifier on all the data.
    # maxBins must exceed the largest categorical cardinality, hence 4657.
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(featuresCol="Features_vec",
                                 labelCol="ArrDelayBucket",
                                 predictionCol="Prediction",
                                 maxBins=4657,
                                 maxMemoryInMB=1024)
    model = rfc.fit(final_vectorized_features)

    # Save the new model over the old one.
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path)
    model.write().overwrite().save(model_output_path)

    # Evaluate the model — on the training data itself (see docstring note).
    predictions = model.transform(final_vectorized_features)
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    evaluator = MulticlassClassificationEvaluator(predictionCol="Prediction",
                                                  labelCol="ArrDelayBucket",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy = {}".format(accuracy))

    # Check the distribution of predictions.
    predictions.groupBy("Prediction").count().show()
    # Check a sample.
    predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
def main(base_path):
    """Cross-validate (3 random 80/20 splits) a flight-delay RandomForest
    that includes airplane features, then log metric averages/STDs and
    feature-importance deltas across runs via pickle files under models/.
    """
    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment.
    try:
        sc and spark
    except NameError as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql
        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    # Input records are flight-delay features joined with airplane metadata.
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),
        StructField("CRSArrTime", TimestampType(), True),
        StructField("CRSDepTime", TimestampType(), True),
        StructField("Carrier", StringType(), True),
        StructField("DayOfMonth", IntegerType(), True),
        StructField("DayOfWeek", IntegerType(), True),
        StructField("DayOfYear", IntegerType(), True),
        StructField("DepDelay", DoubleType(), True),
        StructField("Dest", StringType(), True),
        StructField("Distance", DoubleType(), True),
        StructField("FlightDate", DateType(), True),
        StructField("FlightNum", StringType(), True),
        StructField("Origin", StringType(), True),
        StructField("Route", StringType(), True),
        StructField("TailNum", StringType(), True),
        StructField("EngineManufacturer", StringType(), True),
        StructField("EngineModel", StringType(), True),
        StructField("Manufacturer", StringType(), True),
        StructField("ManufacturerYear", StringType(), True),
        StructField("OwnerState", StringType(), True),
    ])

    input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
        base_path
    )
    features = spark.read.json(input_path, schema=schema)
    features.first()

    # Add the hour of day of scheduled arrival/departure.
    from pyspark.sql.functions import hour
    features_with_hour = features.withColumn(
        "CRSDepHourOfDay",
        hour(features.CRSDepTime)
    )
    features_with_hour = features_with_hour.withColumn(
        "CRSArrHourOfDay",
        hour(features.CRSArrTime)
    )
    features_with_hour.select("CRSDepTime", "CRSDepHourOfDay",
                              "CRSArrTime", "CRSArrHourOfDay").show()

    # Check for nulls in features before using Spark ML.
    null_counts = [(column,
                    features_with_hour.where(
                        features_with_hour[column].isNull()).count())
                   for column in features_with_hour.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print("\nNull Value Report")
    print("-----------------")
    print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))

    # Bucketize ArrDelay into on-time / slightly late / very late (0, 1, 2).
    from pyspark.ml.feature import Bucketizer

    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(
        splits=splits,
        inputCol="ArrDelay",
        outputCol="ArrDelayBucket"
    )

    # Save the model.
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model.
    ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    # Turn category fields into indexes.
    from pyspark.ml.feature import StringIndexer, VectorAssembler
    string_columns = ["Carrier", "Origin", "Dest", "Route", "TailNum"]
    for column in string_columns:
        string_indexer = StringIndexer(
            inputCol=column,
            outputCol=column + "_index"
        )
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)

        # Save the fitted indexer, one file per column.
        string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
            base_path, column
        )
        string_indexer_model.write().overwrite().save(string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # into one feature vector.
    numeric_columns = [
        "DepDelay", "Distance",
        "DayOfYear",
        "CRSDepHourOfDay", "CRSArrHourOfDay"]
    index_columns = [column + "_index" for column in string_columns]

    vector_assembler = VectorAssembler(
        inputCols=numeric_columns + index_columns,
        outputCol="Features_vec"
    )
    final_vectorized_features = vector_assembler.transform(ml_bucketized_features)

    # Save the numeric vector assembler.
    vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns (now folded into Features_vec).
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features.
    final_vectorized_features.show()

    # Cross validate, train and evaluate classifier: loop 3 times, 4 metrics.
    from collections import defaultdict
    scores = defaultdict(list)
    feature_importances = defaultdict(list)
    metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
    split_count = 3

    for i in range(1, split_count + 1):
        print("\nRun {} out of {} of test/train splits in cross validation...".format(
            i,
            split_count,
        )
        )

        # Test/train split (unseeded, so it differs each iteration).
        training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])

        # Instantiate and fit random forest classifier.
        from pyspark.ml.classification import RandomForestClassifier
        rfc = RandomForestClassifier(
            featuresCol="Features_vec",
            labelCol="ArrDelayBucket",
            predictionCol="Prediction",
            maxBins=4896,
        )
        model = rfc.fit(training_data)

        # Save the new model over the old one (each loop overwrites).
        model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
            base_path
        )
        model.write().overwrite().save(model_output_path)

        # Evaluate model using test data.
        predictions = model.transform(test_data)

        # Evaluate this split's results for each metric.
        from pyspark.ml.evaluation import MulticlassClassificationEvaluator
        for metric_name in metric_names:
            evaluator = MulticlassClassificationEvaluator(
                labelCol="ArrDelayBucket",
                predictionCol="Prediction",
                metricName=metric_name
            )
            score = evaluator.evaluate(predictions)
            scores[metric_name].append(score)
            print("{} = {}".format(metric_name, score))

        # Collect feature importances for this split.
        feature_names = vector_assembler.getInputCols()
        feature_importance_list = model.featureImportances
        for feature_name, feature_importance in zip(feature_names,
                                                    feature_importance_list):
            feature_importances[feature_name].append(feature_importance)

    # Evaluate average and STD of each metric and print a table.
    import numpy as np
    score_averages = defaultdict(float)

    # Compute the table data.
    average_stds = []  # ha
    for metric_name in metric_names:
        metric_scores = scores[metric_name]
        average_accuracy = sum(metric_scores) / len(metric_scores)
        score_averages[metric_name] = average_accuracy
        std_accuracy = np.std(metric_scores)
        average_stds.append((metric_name, average_accuracy, std_accuracy))

    # Print the table.
    print("\nExperiment Log")
    print("--------------")
    print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))

    # Persist the scores to a score log that exists between runs.
    import pickle

    # Load the score log or initialize an empty one.
    try:
        score_log_filename = "{}/models/score_log.pickle".format(base_path)
        score_log = pickle.load(open(score_log_filename, "rb"))
        if not isinstance(score_log, list):
            score_log = []
    except IOError:
        score_log = []

    # Compute the existing score log entry.
    score_log_entry = {
        metric_name: score_averages[metric_name] for metric_name in metric_names
    }

    # Compute and display the change in score for each metric; first run
    # compares the entry to itself (delta 0).
    try:
        last_log = score_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_log = score_log_entry

    experiment_report = []
    for metric_name in metric_names:
        run_delta = score_log_entry[metric_name] - last_log[metric_name]
        experiment_report.append((metric_name, run_delta))

    print("\nExperiment Report")
    print("-----------------")
    print(tabulate(experiment_report, headers=["Metric", "Score"]))

    # Append the existing average scores to the log and persist it.
    score_log.append(score_log_entry)
    pickle.dump(score_log, open(score_log_filename, "wb"))

    # Analyze and report feature importance changes.
    # Compute averages for each feature across the splits.
    feature_importance_entry = defaultdict(float)
    for feature_name, value_list in feature_importances.items():
        average_importance = sum(value_list) / len(value_list)
        feature_importance_entry[feature_name] = average_importance

    # Sort the feature importances in descending order and print.
    import operator
    sorted_feature_importances = sorted(
        feature_importance_entry.items(),
        key=operator.itemgetter(1),
        reverse=True
    )

    print("\nFeature Importances")
    print("-------------------")
    print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))

    # Compare this run's feature importances with the previous run's.
    # Load the feature importance log or initialize an empty one.
    try:
        feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
        feature_log = pickle.load(open(feature_log_filename, "rb"))
        if not isinstance(feature_log, list):
            feature_log = []
    except IOError:
        feature_log = []

    # Compute and display the change in importance for each feature; on the
    # first run seed last_feature_log with the current values (delta 0).
    try:
        last_feature_log = feature_log[-1]
    except (IndexError, TypeError, AttributeError):
        last_feature_log = defaultdict(float)
        for feature_name, importance in feature_importance_entry.items():
            last_feature_log[feature_name] = importance

    # Compute the deltas.
    feature_deltas = {}
    for feature_name in feature_importances.keys():
        run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name]
        feature_deltas[feature_name] = run_delta

    # Sort feature deltas, biggest change first.
    import operator
    sorted_feature_deltas = sorted(
        feature_deltas.items(),
        key=operator.itemgetter(1),
        reverse=True
    )

    # Display sorted feature deltas.
    print("\nFeature Importance Delta Report")
    print("-------------------------------")
    print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))

    # Append the existing average deltas to the log and persist it.
    feature_log.append(feature_importance_entry)
    pickle.dump(feature_log, open(feature_log_filename, "wb"))
# Hash-trick feature encoding + RandomForest on train.csv.
# NOTE(review): `titile` is defined upstream (presumably the CSV header
# line) — the name looks like a typo of `title`; confirm before renaming.
rdd = sc.textFile("/user/demo/train.csv").filter(lambda x: x != titile).\
    map(lambda x: x.split(","))
D = 2 ** 24  # hashed feature space size

def helper1(r):
    # Map a CSV row to (label, DenseVector of hashed features); falls back
    # to an all-zero row of the expected width on any parse error.
    features = []
    try:
        fe = r[1:-1]  # r[0] is the ID, r[-1] the target
        for i in range(len(fe)):
            features.append(float(abs(hash("VAR_" + '{0:04}'.format(i) + fe[i]))) % D)
        target = float(r[-1])
        ID = float(r[0])
        return target, Vectors.dense(features)
    except:
        return (0.0, [0.0] * 1932)

# Keep only rows with the full expected column count.
new_rdd = rdd.filter(lambda i: len(i) == 1934)
rdd_after_trans = new_rdd.map(helper1)
rdd_after_trans.cache()
df = sqlContext.createDataFrame(rdd_after_trans, ["label", "features"])
(trainingData, testData) = df.randomSplit([0.7, 0.3])

stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(trainingData)
td = si_model.transform(trainingData)
rf = RandomForestClassifier(numTrees=50, maxDepth=25, labelCol="indexed", seed=42)
model = rf.fit(td)
# Emit "label,P(class 0)" lines for offline scoring.
result = model.transform(testData).rdd.map(
    lambda r: str(r.label) + ',' + str(r.probability[0]))
result.saveAsTextFile("/user/demo/rf_50_25")
def main(args):
    """Read HTML pages, build a tokenizer + hashingTF + LogisticRegression
    pipeline from a training CSV, and score a test CSV.

    NOTE(review): `rf` is assigned twice (the GBTClassifier is immediately
    overwritten by a RandomForestClassifier whose labelCol is set to
    "features" — likely a mistake), and neither `rf` nor `rfc` ever enters
    the pipeline: only `lr` is trained.
    """
    textFiles = sc.wholeTextFiles(maindir + '4').map(readContents)
    #print "READ second {} check ".format(textFiles.take(10))
    '''
    filter the rows based on all the index available in training file else drop
    http://stackoverflow.com/questions/24718697/pyspark-drop-rows
    '''
    htmldf = sqlContext.createDataFrame(textFiles)
    htmldf.cache()
    traindf = getCleanedRDD(maindir + 'train_v2.csv',
                            ["id", "images", "links", "text", "label"], htmldf)
    traindf.write.save(maindir + "output/train_4.parquet", format="parquet")

    # Configure an ML pipeline: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
    lr = LogisticRegression(maxIter=20, regParam=0.01)
    rf = GBTClassifier(maxIter=30, maxDepth=4, labelCol="label")
    rf = RandomForestClassifier(labelCol="features", numTrees=3, maxDepth=4)
    #https://databricks.com/blog/2015/07/29/new-features-in-machine-learning-pipelines-in-spark-1-4.html
    #http://spark.apache.org/docs/latest/api/python/pyspark.ml.html
    #w2v = Word2Vec(inputCol="text", outputCol="w2v")
    rfc = RandomForestClassifier(labelCol="label", numTrees=3, maxDepth=4)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training documents.
    model = pipeline.fit(traindf)
    print '-----------------------------------------------------------------------------'
    testdf = getCleanedRDD(maindir + 'test.csv',
                           ["id", "images", "links", "text", "label"], htmldf)
    #print testdf.count()
    # Make predictions on test documents and print columns of interest.
    prediction = model.transform(testdf)
    #print('prediction', prediction)
    '''
    pand = prediction.toPandas()
    pand.to_csv('testpanda.csv', sep='\t', encoding='utf-8')
    print "Done!!! CSV"
    '''
    #prediction.select('id','probability','prediction').write.format('com.databricks.spark.csv').option("header", "true").save(maindir + 'output/result_lr0.csv')
    # ('prediction', DataFrame[id: string, images: bigint, links: bigint, text: string, label: double,
    # words: array<string>, features: vector, rawPrediction: vector, probability: vector, prediction: double])
    '''
    #write in scala
    selected = prediction.select("id", "probability", "prediction")
    for row in selected.collect():
        print row
    '''
    sc.stop()
# NOTE(review): this chunk is truncated at both ends — the VectorAssembler
# call is opened in an earlier chunk, and the CrossValidator call at the
# bottom is cut off mid-argument-list.
outputCol='features')

# Consolidate predictor columns
flites = assembler.transform(flites)
print("Sample model input")
print(flites.toPandas().sample(12))

# Split the data into training and testing sets
flights_train, flights_test = flites.randomSplit([0.8, 0.2], seed=23)

# Create model objects and train on training data
#tree = DecisionTreeClassifier().fit(flights_train)
#gbt = GBTClassifier().fit(flights_train)
forest = RandomForestClassifier()

# Create parameter grid
params = ParamGridBuilder()

# Add grids for two parameters
params = params.addGrid(forest.featureSubsetStrategy, ['all', 'onethird', 'sqrt', 'log2']) \
    .addGrid(forest.maxDepth, [2, 5, 10])

# Build the parameter grid
params = params.build()

# Compare AUC on testing data
evaluator = BinaryClassificationEvaluator()

# create cross-validation object (statement truncated in the original)
cv = CrossValidator(estimator=forest,
# # Feature selection is not really supported yet in mllib, therefore, we just applied dim reduction using PCA # In[509]: pca = PCA(inputCol="features", outputCol="pca", k=15).fit(train_df) train_df = pca.transform(train_df) test_df = pca.transform(test_df) # ## Classification algorithms # In[ ]: rf = RandomForestClassifier(labelCol="indexedResult", featuresCol="pca", numTrees=5000) #rf = RandomForestClassifier(labelCol="indexedResult", featuresCol="features", numTrees=5000) model = rf.fit(train_df) # ## Evaluation & results # In[ ]: label_to_str_map = {'2': 'HOME', '1': 'DRAW', '0': 'AWAY'} str_to_labelmap = {'HOME': '2', 'DRAW': '1', 'AWAY': '0'} predictions = model.transform(test_df).select("home_name", "away_name", "B365A", "B365D", "B365H", "probability", "indexedResult") length = test_df.count() correct = 0
#transforming the words to vectors using the trained model transformDF = wvModel.transform(reviewDF) #segregating the labels and features selectData = transformDF.select("label","features","id") #Creating RDD of LabeledPoints lpSelectData = selectData.map(lambda x : (x.id, LabeledPoint(x.label,x.features))) #Instantiating string indexer for random forest stringIndexer = StringIndexer(inputCol="label", outputCol="indexed") #fitting the data in stringindexer si_model = stringIndexer.fit(selectData) #transforming the data transformData = si_model.transform(selectData) #Spliting the data for training and test (trainingData, testData) = transformData.randomSplit([0.6, 0.4]) #instantiating Random forest model randomForest = RandomForestClassifier(numTrees=2, maxDepth=2, labelCol="indexed", seed=42) #training the model randomForestModel = randomForest.fit(trainingData) #trsnforming test data result = randomForestModel.transform(testData) #calculating the accuracy and printing it. accuracy = result.filter(result.label == result.prediction).count() / float(testData.count()) print("Accuracy = " + str(accuracy))
# NOTE(review): chunk starts mid-statement — the VectorAssembler(inputCols=[
# ...]) call is opened in an earlier, unseen chunk.
'global_subjectivity', 'global_sentiment_polarity', 'title_subjectivity',
'title_sentiment_polarity', 'abs_title_subjectivity',
'abs_title_sentiment_polarity'], outputCol='features')

new_data = assembler.transform(data)
final_data = new_data.select('features', 'shares')

# Binarize share counts into two quantile buckets (low/high).
from pyspark.ml.feature import QuantileDiscretizer
discretizer = QuantileDiscretizer(numBuckets=2, inputCol="shares", outputCol="result")
result = discretizer.fit(final_data).transform(final_data)
finalData = result.select('result', 'features')

from pyspark.ml.classification import RandomForestClassifier
rfc = RandomForestClassifier(numTrees=250, labelCol='result', featuresCol='features')
train_data, test_data = finalData.randomSplit([0.7, 0.3])
rfc_model = rfc.fit(train_data)
result = rfc_model.transform(test_data);
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Default metric is areaUnderROC.
acc_eval = BinaryClassificationEvaluator(labelCol='result')
print(acc_eval.evaluate(result))
test_data.head(1)

# import os, sys
# import pandas
# import plotly.plotly as py
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# import cufflinks as cf
# import plotly.graph_objs as go
# COMMAND ---------- # MAGIC %md # MAGIC ####Random Forest # MAGIC # MAGIC Random Forests uses an ensemble of trees to improve model accuracy. # MAGIC # MAGIC You can read more about Random Forest from the programming guide [here](http://spark.apache.org/docs/latest/mllib-ensembles.html#random-forests). # COMMAND ---------- from pyspark.ml.classification import RandomForestClassifier # Create an initial RandomForest model. rf = RandomForestClassifier(labelCol="label", featuresCol="features") # Train model with Training Data rfModel = rf.fit(trainingData) # COMMAND ---------- # Make predictions on test data using the Transformer.transform() method. predictions = rfModel.transform(testData) # COMMAND ---------- predictions.printSchema() # COMMAND ----------
def main(base_path):
    """Train and persist a flight-delay prediction pipeline.

    Loads flight records from
    {base_path}/data/simple_flight_delay_features.jsonl.bz2, bucketizes
    ArrDelay into ArrDelayBucket, string-indexes the categorical columns,
    assembles a numeric feature vector, trains a random forest classifier and
    saves every fitted stage under {base_path}/models.

    Comments translated from Korean.
    """
    # Default base path is "."
    try:
        base_path
    except NameError:
        base_path = "."
    if not base_path:
        base_path = "."

    APP_NAME = "train_spark_mllib_model.py"

    # Create the Spark environment if no SparkSession exists yet
    try:
        sc and spark
    except (NameError, UnboundLocalError) as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql
        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()

    #
    # Example input record:
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    # Explicit schema matching the example record above.
    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),       # "ArrDelay":5.0
        StructField("CRSArrTime", TimestampType(), True),  # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
        StructField("CRSDepTime", TimestampType(), True),  # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
        StructField("Carrier", StringType(), True),        # "Carrier":"WN"
        StructField("DayOfMonth", IntegerType(), True),    # "DayOfMonth":31
        StructField("DayOfWeek", IntegerType(), True),     # "DayOfWeek":4
        StructField("DayOfYear", IntegerType(), True),     # "DayOfYear":365
        StructField("DepDelay", DoubleType(), True),       # "DepDelay":14.0
        StructField("Dest", StringType(), True),           # "Dest":"SAN"
        StructField("Distance", DoubleType(), True),       # "Distance":368.0
        StructField("FlightDate", DateType(), True),       # "FlightDate":"2015-12-30T16:00:00.000-08:00"
        StructField("FlightNum", StringType(), True),      # "FlightNum":"6109"
        StructField("Origin", StringType(), True),         # "Origin":"TUS"
    ])

    input_path = "{}/data/simple_flight_delay_features.jsonl.bz2".format(
        base_path
    )
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Check whether any feature column contains nulls before using Spark ML
    #
    null_counts = [(column, features.where(features[column].isNull()).count()) for column in features.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print(list(cols_with_nulls))

    #
    # Add a Route variable ("Origin-Dest") to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    features_with_route = features.withColumn(
        'Route',
        concat(
            features.Origin,
            lit('-'),
            features.Dest
        )
    )
    features_with_route.show(6)

    #
    # Bucketize ArrDelay with pyspark.ml.feature.Bucketizer
    # NOTE(review): the original comment describes 3 buckets (0, 1, 2) but the
    # splits below define 4 buckets — confirm which is intended.
    #
    from pyspark.ml.feature import Bucketizer

    # Configure the bucketizing model
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(
        splits=splits,
        inputCol="ArrDelay",
        outputCol="ArrDelayBucket"
    )

    # Save the model
    arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the model
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Import the feature tools from pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn each categorical field into an index column
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer = StringIndexer(
            inputCol=column,
            outputCol=column + "_index"
        )
        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)

        # Drop the original column
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the fitted indexer for this column
        string_indexer_output_path = "{}/models/string_indexer_model_{}.bin".format(
            base_path,
            column
        )
        string_indexer_model.write().overwrite().save(string_indexer_output_path)

    # Combine the continuous numeric fields with the categorical indexes into
    # a single feature vector
    numeric_columns = [
        "DepDelay", "Distance",
        "DayOfMonth", "DayOfWeek",
        "DayOfYear"]
    index_columns = ["Carrier_index", "Origin_index",
                     "Dest_index", "Route_index"]
    vector_assembler = VectorAssembler(
        inputCols=numeric_columns + index_columns,
        outputCol="Features_vec"
    )
    final_vectorized_features = vector_assembler.transform(ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/models/numeric_vector_assembler.bin".format(base_path)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Instantiate and fit a random forest classifier on the full dataset
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(
        featuresCol="Features_vec",
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        maxBins=4657,
        maxMemoryInMB=1024
    )
    model = rfc.fit(final_vectorized_features)

    # Save the new model, replacing the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        base_path
    )
    model.write().overwrite().save(model_output_path)

    # Evaluate the model
    # NOTE(review): the original comment said "using test data", but this
    # scores the same data used for training — there is no held-out set.
    predictions = model.transform(final_vectorized_features)

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    evaluator = MulticlassClassificationEvaluator(
        predictionCol="Prediction",
        labelCol="ArrDelayBucket",
        metricName="accuracy"
    )
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy = {}".format(accuracy))

    # Check the distribution of predictions
    predictions.groupBy("Prediction").count().show()

    # Check a sample
    predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
def build_model(df_ml):
    '''
    Function builds a classification model based on the user features

    INPUT:
    df_ml - Spark DataFrame with a "churn" column and the feature columns
            listed below

    OUTPUT:
    model - final trained model (fitted Pipeline)
    '''
    # split into train, test and validation sets
    # NOTE(review): the original comment said 60% - 20% - 20%, but the splits
    # below produce 70% / 15% / 15%.
    df_ml = df_ml.withColumnRenamed("churn", "label")
    train, test_valid = df_ml.randomSplit([0.7, 0.3], seed=2048)
    test, validation = test_valid.randomSplit([0.5, 0.5], seed=2048)

    # index and encode categorical features gender, level and state
    stringIndexerGender = StringIndexer(inputCol="gender", outputCol="genderIndex", handleInvalid='skip')
    stringIndexerLevel = StringIndexer(inputCol="last_level", outputCol="levelIndex", handleInvalid='skip')
    stringIndexerState = StringIndexer(inputCol="last_state", outputCol="stateIndex", handleInvalid='skip')
    encoder = OneHotEncoderEstimator(
        inputCols=["genderIndex", "levelIndex", "stateIndex"],
        outputCols=["genderVec", "levelVec", "stateVec"],
        handleInvalid='keep')

    # create vector for features
    features = [
        'genderVec', 'levelVec', 'stateVec', 'days_active', 'avg_songs',
        'avg_events', 'thumbs_up', 'thumbs_down', 'addfriend'
    ]
    assembler = VectorAssembler(inputCols=features, outputCol="rawFeatures")

    # normalize features (L1 norm)
    normalizer = Normalizer(inputCol="rawFeatures", outputCol="features", p=1.0)

    # initialize random forest classifier with tuned hyperparameters
    rf = RandomForestClassifier(labelCol="label", featuresCol="features",
                                numTrees=120, impurity='gini', maxDepth=5,
                                featureSubsetStrategy='sqrt')

    # assemble pipeline
    pipeline = Pipeline(stages=[
        stringIndexerGender, stringIndexerLevel, stringIndexerState, encoder,
        assembler, normalizer, rf
    ])

    # fit model
    model = pipeline.fit(train)

    # predict churn
    pred_train = model.transform(train)
    pred_test = model.transform(test)
    pred_valid = model.transform(validation)

    # evaluate results on the training set
    predictionAndLabels = pred_train.rdd.map(
        lambda lp: (float(lp.prediction), float(lp.label)))
    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)
    # print F1-score
    # NOTE(review): fMeasure() with no arguments — newer Spark versions require
    # a label argument; confirm against the Spark version in use.
    print("Train F1: %s" % metrics.fMeasure())

    # evaluate results on the test set
    predictionAndLabels = pred_test.rdd.map(
        lambda lp: (float(lp.prediction), float(lp.label)))
    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)
    # F1 score
    print("Test F1: %s" % metrics.fMeasure())

    # evaluate results on the validation set
    predictionAndLabels = pred_valid.rdd.map(
        lambda lp: (float(lp.prediction), float(lp.label)))
    # Instantiate metrics object
    metrics = MulticlassMetrics(predictionAndLabels)
    # F1 score
    print("Validation F1: %s" % metrics.fMeasure())

    return model
# COMMAND ---------- #DEFINING THE MODELS AND PARAMETERS #NOTE: Due to the time Databricks was taking to run and that after 2 hours the clusters detaches, we are keeping only these combinations of hyperparameters in here. The models were run with more combinations of Hyperparameters in a Jupyter Notebook that is attached. #NAIVE BAYES MODEL nb = NaiveBayes() nbParams = ParamGridBuilder().addGrid(nb.smoothing, [0.01,1]).build() #LOGISTIC REGRESSION MODEL lr = LogisticRegression() lrParams = ParamGridBuilder().addGrid(lr.maxIter, [10, 150]).build() #RANDOM FOREST MODEL rfc = RandomForestClassifier() rfParams = ParamGridBuilder().addGrid(rfc.numTrees, [150, 300]).build() #DECISION TREE dt = DecisionTreeClassifier() dtParams = ParamGridBuilder().addGrid(dt.maxDepth, [4, 10]).build() #GRADIENT BOOSTING MODEL gb = GBTClassifier() gbParams = ParamGridBuilder().addGrid(gb.maxDepth,[2,4]).build() ### Hyperparameters used for the analysis (ran in Jypyter) ### # #Gradient Boosting # gb = GradientBoostingClassifier()
# Index the target column "income" into a numeric "label".
label_stringIdx = StringIndexer(inputCol="income", outputCol="label")
# Chain the per-column indexers with the label indexer, encoder and assembler.
pipeline = Pipeline(stages=indexers + [label_stringIdx, encoder, assembler])
encoded_df = pipeline.fit(df).transform(df)
# Keep the label, the assembled feature vector, and the original columns.
selectedCols = ['label', 'features'] + cols
dataset = encoded_df.select(selectedCols)

# Randomly split data into training and test sets. set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print(trainingData.count())
print(testData.count())

# Fit model and train: compare a 10-tree forest against a 100-tree forest.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
rf2 = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100)
model = rf.fit(trainingData)
model2 = rf2.fit(trainingData)
predictions = model.transform(testData)
predictions2 = model2.transform(testData)
# Accuracy of both models on the held-out test set.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
accuracy2 = evaluator.evaluate(predictions2)
data_df = event_df.select(*exprs) # In[ ]: (train_df, test_df) = data_df.randomSplit([0.9, 0.1]) # In[ ]: labelIndexer = StringIndexer(inputCol="target", outputCol="label").fit(train_df) featureAssembler = VectorAssembler( inputCols=[x for x in field_names if x.startswith('attr')], outputCol="features") rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10) labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=labelIndexer.labels) pipeline = Pipeline( stages=[featureAssembler, labelIndexer, rf, labelConverter]) # In[ ]: model = pipeline.fit(train_df) # In[ ]: predict_df = model.transform(test_df)
# Select (prediction, true label) and compute test error evaluator = MulticlassClassificationEvaluator(labelCol="lowhigh", predictionCol="prediction", metricName="accuracy") accuracy = evaluator.evaluate(predictions) print("Accuracy= %g" % (accuracy)) print("Test Error = %g" % (1.0 - accuracy)) # COMMAND ---------- from pyspark.ml.classification import RandomForestClassifier from pyspark.ml.evaluation import MulticlassClassificationEvaluator rf = RandomForestClassifier(labelCol="lowhigh", featuresCol="features", numTrees=10, maxDepth=3) model = rf.fit(train_df) predictions = model.transform(test_df) predictions.select("prediction", "lowhigh", "features").show(5) # Select (prediction, true label) and compute test error evaluator = MulticlassClassificationEvaluator(labelCol="lowhigh", predictionCol="prediction", metricName="accuracy") accuracy = evaluator.evaluate(predictions) print("Accuracy= %g" % (accuracy)) print("Test Error = %g" % (1.0 - accuracy)) # COMMAND ----------
#计算TF-IDF hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=3000) featurizedData = hashingTF.transform(wordsData) idf = IDF(inputCol="rawFeatures", outputCol="features") idfModel = idf.fit(featurizedData) rescaledData = idfModel.transform(featurizedData) forData = StringIndexer().setInputCol("label").setOutputCol("indexed").fit( rescaledData).transform(rescaledData) (trainingData, testData) = forData.randomSplit([0.8, 0.2], seed=0) print(trainingData.take(1)) rfClassifier = RandomForestClassifier(numTrees=10, maxDepth=10, seed=0, labelCol="indexed") start_time = time.time() modelClassifier = rfClassifier.fit(trainingData) end_time = time.time() cost_time = end_time - start_time print("spark rf time :", cost_time) predictionsClassifier = modelClassifier.transform(testData) evaluator = MulticlassClassificationEvaluator().setLabelCol( "indexed").setPredictionCol("prediction") print( "accuracy = ",
# data = data.select("*", F.when(data.X == ' <=50K', 1).when(data.X == ' >50K', 2).otherwise(0).alias('label')) data = data.withColumnRenamed("age", "label").select("label", "education-num", "hours-per-week") data = data.select(data.label.cast("double"), "education-num", "hours-per-week") # Create vector assembler for feature columns assembler = VectorAssembler(inputCols=data.columns[1:], outputCol="features") data = assembler.transform(data) data.show() # Split data into training and test data set training, test = data.select("label", "features").randomSplit([0.6, 0.4]) # Create Random Forest model and fit the model with training dataset rf = RandomForestClassifier() model = rf.fit(training) # Generate prediction from test dataset predictions = model.transform(test) # Evuluate the accuracy of the model evaluator = MulticlassClassificationEvaluator() accuracy = evaluator.evaluate(predictions) # Show model accuracy print("Accuracy:", accuracy) # Report predictionAndLabels = predictions.select("label", "prediction").rdd metrics = MulticlassMetrics(predictionAndLabels)
# Evaluate model using the AUC metric auc_dt_default_dev = evaluator.evaluate(dt_predictions_default_dev, {evaluator.metricName: 'areaUnderROC'}) # Print result to standard output print('Decision Tree, Default Parameters, Development Set, AUC: ' + str(auc_dt_default_dev)) # TODO: Check for signs of overfitting (by evaluating the model on the training set) # [FIX ME!] Write code below # TODO: Tune the decision tree model by changing one of its hyperparameters # Build and evalute decision trees with the following maxDepth values: 3 and 4. # [FIX ME!] Write code below # Train a random forest with default parameters (including numTrees=20) rf_classifier_default = RandomForestClassifier(labelCol = 'label', featuresCol = 'TFIDF', numTrees=20) # Create an ML pipeline for the random forest model rf_pipeline_default = Pipeline(stages=[label_indexer, rf_classifier_default]) # Apply pipeline and train model rf_model_default = rf_pipeline_default.fit(train_tfidf) # Apply model on development data rf_predictions_default_dev = rf_model_default.transform(dev_tfidf) # Evaluate model using the AUC metric auc_rf_default_dev = evaluator.evaluate(rf_predictions_default_dev, {evaluator.metricName: 'areaUnderROC'}) # Print result to standard output print('Random Forest, Default Parameters, Development Set, AUC:' + str(auc_rf_default_dev))
dt_train.show(5) # COMMAND ---------- from pyspark.ml.classification import RandomForestClassifier #assembler = VectorAssembler(inputCols =["Day","Temp","Lat","Long","Admin_index","Province_index"],outputCol="normfeatures") assembler = VectorAssembler(inputCols=["Date", "Day", "Temp"], outputCol="normfeatures") #assembler = VectorAssembler(inputCols =["Date","Year","Day","Temp"],outputCol="features") minMax = MinMaxScaler(inputCol=assembler.getOutputCol(), outputCol="nfeatures") featVect = VectorAssembler(inputCols=["nfeatures"], outputCol="features") dt = RandomForestClassifier(labelCol="label", featuresCol="features", impurity="gini", featureSubsetStrategy="auto", numTrees=10, maxDepth=30, maxBins=128, seed=1234) pipeline = Pipeline(stages=[assembler, minMax, featVect, dt]) piplineModel = pipeline.fit(dt_train) print("Pipeline complete!") prediction = piplineModel.transform(dt_test) predicted = prediction.select("features", "prediction", "trueLabel") predicted.show(100, truncate=False) from pyspark.ml.evaluation import MulticlassClassificationEvaluator evaluation = MulticlassClassificationEvaluator(labelCol="trueLabel", predictionCol="prediction", metricName="accuracy")
# Scale the "atributos" feature column to the [0, 1] range.
scaler = MinMaxScaler(inputCol="atributos", outputCol="scaledFeatures", min=0.0, max=1.0)
scalerModel = scaler.fit(fluxoDF)
scaledData = scalerModel.transform(fluxoDF)

# Label indexing is a prerequisite for decision trees (translated from Portuguese)
stringIndexer = StringIndexer(inputCol="rotulo", outputCol="indexed")
si_model = stringIndexer.fit(scaledData)
obj_final = si_model.transform(scaledData)

# Create the models (translated from Portuguese): a 20-tree random forest and a GBT.
# NOTE(review): the forest trains on the indexed label while the GBT uses the
# raw "rotulo" column — confirm both encodings are valid labels.
rfClassifer = RandomForestClassifier(labelCol="indexed", featuresCol="scaledFeatures",
                                     probabilityCol="probability", numTrees=20)
gbtClassifer = GBTClassifier(labelCol="rotulo", featuresCol="scaledFeatures")
(dados_treino, dados_teste) = obj_final.randomSplit([0.7, 0.3])
modelorf = rfClassifer.fit(dados_treino)
modelogbt = gbtClassifer.fit(dados_treino)
pred_rf = modelorf.transform(dados_teste)
pred_gbt = modelogbt.transform(dados_teste)


def mont_feat(pred1, pred2):
    # Combine the class probabilities of two predictions into one feature list.
    # (fragment: the rest of this function is cut off at the chunk boundary)
    predict = [
        pred1['probability'][0], pred1['probability'][1],
        pred2['probability'][0], pred2['probability'][1]
def create_pipeline(columns):
    """Build the classification pipeline.

    Assembles *columns* into a "features" vector, indexes the "stars" column
    as the numeric "label", and finishes with a 30-tree random forest of
    depth 10.
    """
    stages = [
        VectorAssembler(inputCols=columns, outputCol="features"),
        StringIndexer(inputCol="stars", outputCol="label"),
        RandomForestClassifier(labelCol="label", featuresCol="features",
                               numTrees=30, maxDepth=10),
    ]
    return Pipeline(stages=stages)
assembler = VectorAssembler(inputCols=['A', 'B', 'C', 'D'], outputCol='features') # COMMAND ---------- fit_data = assembler.transform(data) # COMMAND ---------- fit_data.show(5) # COMMAND ---------- rfc = RandomForestClassifier(featuresCol='features', labelCol='Spoiled', numTrees=100) # COMMAND ---------- final_data = fit_data.select('features', 'Spoiled') # COMMAND ---------- final_data.show(5) # COMMAND ---------- rfc = RandomForestClassifier(labelCol='Spoiled') # COMMAND ----------
# (fragment: these keyword arguments close an evaluator constructed before
# this chunk begins)
labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
# Python 2 print statement — this snippet targets Python 2.
print "DecisionTree Test set accuracy = " + str(accuracy) + "\n"
#treeModel = model.stages[2]  # summary only
#print(treeModel)

"""
Random Forest
"""
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Train a RandomForest model.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)

# Train model. This also runs the indexers.
model = rf.fit(train)

# Make predictions.
predictions = model.transform(test)

# Select example rows to display.
#predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
#rfModel = model.stages[2]
# Naive Bayes pipeline: shared data-prep stages + NB classifier.
nb = NaiveBayes(labelCol="label", featuresCol="features")
nbStages = dataPrepStages + [nb]
nbPipeline = Pipeline(stages=nbStages)

# COMMAND ----------
# MAGIC %md
# MAGIC We do the same for a Random Forest model.

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier

# Random forest pipeline built from the same data-prep stages.
rf = RandomForestClassifier(labelCol="label", featuresCol="features")
rfStages = dataPrepStages + [rf]
rfPipeline = Pipeline(stages=rfStages)

# COMMAND ----------
# MAGIC %md
# MAGIC ## Define Experiment Parameters

# COMMAND ----------
# MAGIC %md
# MAGIC Some parameters are going to be used across all algorithms.
# MAGIC We set the evaluator (default for `BinaryClassificationEvaluator` is AUC) and the number of folds for our cross-validation (k=10).
# MAGIC Again, these parameters are going to be applied to all the algorithms we train.
# Assemble the Higgs feature columns into a single vector (rows with any
# nulls are dropped first).
assembler = VectorAssembler(inputCols=namesH, outputCol="features")
higgs = assembler.transform(higgsRaw.dropna())
higgsData = higgs.select('features', 'label')
print("vectorised: ")
higgsData.show(10)
higgsData.printSchema()

# 70/30 split with a fixed seed of 42.
(trainingData, testData) = higgsData.randomSplit([0.7, 0.3], 42)

rfc = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=3,
                             maxBins=20, maxDepth=7, featureSubsetStrategy="all")
# AUC (binary) and accuracy (multiclass) evaluators.
clasEv = BinaryClassificationEvaluator()
mClasEv = MulticlassClassificationEvaluator(metricName="accuracy")
rfcModel = rfc.fit(trainingData)
pred = rfcModel.transform(testData)
AUC = clasEv.evaluate(pred)
accuracy = mClasEv.evaluate(pred)
print("AUC: ", AUC)
# Python 2 print statement — this snippet targets Python 2.
print df_proper.printSchema()
# Index the Churn column as the numeric label.
labelIndexer = StringIndexer(inputCol='Churn', outputCol='label')
assembler = VectorAssembler(inputCols=[
    "SeniorCitizen", "tenure", "MonthlyCharges", "TotalCharges"
], outputCol="features")
# NOTE(review): featureIndexer is defined but never added to the pipeline below.
featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4)
(train, test) = df_proper.randomSplit([0.7, 0.3])
classifier = RandomForestClassifier(labelCol='label', featuresCol='features')
pipeline = Pipeline(stages=[labelIndexer, assembler, classifier])
model = pipeline.fit(train)
predictions = model.transform(test)
evaluator = BinaryClassificationEvaluator()
auroc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})
# NOTE(review): this rebinds the name `test` (previously the test DataFrame).
test = int(auroc)
print auroc
# Open the output file given as the second CLI argument for writing.
f = open(sys.argv[2], 'w')
# Check out the features final_vectorized_features.show() # # Cross validate, train and evaluate classifier # # Test/train split training_data, test_data = final_vectorized_features.randomSplit([0.7, 0.3]) # Instantiate and fit random forest classifier from pyspark.ml.classification import RandomForestClassifier rfc = RandomForestClassifier( featuresCol="Features_vec", labelCol="ArrDelayBucket", maxBins=4657, maxMemoryInMB=1024 ) model = rfc.fit(training_data) # Evaluate model using test data predictions = model.transform(test_data) from pyspark.ml.evaluation import MulticlassClassificationEvaluator evaluator = MulticlassClassificationEvaluator(labelCol="ArrDelayBucket", metricName="accuracy") accuracy = evaluator.evaluate(predictions) print("Accuracy = {}".format(accuracy)) # Check a sample predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)
# Explode raster tiles into one row per pixel.
exploder = TileExploder()
exploded_tiles = exploder.transform(df_labeled)
exploded_train, exploded_test = exploded_tiles.randomSplit([0.8, 0.2], seed=42)
# Drop pixels carrying no-data values in the label or any band column.
noDataFilter = NoDataFilter().setInputCols(['label', 'blue', 'green', 'red', 'NIR', 'SWIR1', 'SWIR2', 'brightness'])
exploded_tiles_filtered_train = noDataFilter.transform(exploded_train)
exploded_tiles_filtered_test = noDataFilter.transform(exploded_test)
# Assemble the band columns into a feature vector.
assembler = VectorAssembler().setInputCols(bands) \
    .setOutputCol("features")
assembled_df_train = assembler.transform(exploded_tiles_filtered_train)
assembled_df_test = assembler.transform(exploded_tiles_filtered_test)
classifier = RandomForestClassifier().setLabelCol('label') \
    .setFeaturesCol(assembler.getOutputCol())
# Cache the training frame: the forest makes multiple passes over it.
model = classifier.fit(assembled_df_train.cache())
prediction_df = model.transform(assembled_df_test).drop(assembler.getOutputCol()).cache()
evaluator = MulticlassClassificationEvaluator(
    predictionCol=classifier.getPredictionCol(),
    labelCol=classifier.getLabelCol(),
    metricName='accuracy'
)
accuracy = evaluator.evaluate(prediction_df)
print("\nAccuracy:", accuracy)
# (fragment: confusion-matrix computation continues past the chunk boundary)
cnf_mtrx = prediction_df.groupBy(classifier.getPredictionCol()) \
def _train_model_spark(self, data):
    """Train the configured Spark ML model(s) on *data*.

    When self._train_method is a dict, two models are trained: a regressor
    for CHANGE_AMOUNT and a classifier for CHANGE_DIRECTION, stored in a
    dict under those keys. Otherwise a single model is trained against
    TARGET_PRICE.

    Returns self._model (a single fitted model or the dict of two models).
    Raises ValueError for an unsupported training method.
    """
    df = self._prepare_data_spark(data)
    # Feature count = all columns except the target/bookkeeping columns.
    input_num = len(data.keys().difference({self.CHANGE_AMOUNT, self.CHANGE_DIRECTION, self.TARGET_PRICE, self.TODAY_PRICE}))

    if self.ann_hidden_nodes_num is None:
        # NOTE(review): `/` here gives an int only on Python 2; on Python 3 it
        # would produce a float layer size — confirm the target interpreter.
        self.ann_hidden_nodes_num = input_num / 2 + 1
    ann_layers = [input_num,
                  # input_num / 3 * 2,
                  # input_num / 3,
                  self.ann_hidden_nodes_num,
                  2]
    self.logger.info('layer settings are {}'.format(ann_layers))
    self.logger.info('training method is {}'.format(self._train_method))
    self.logger.info('trees num is {}'.format(self.random_forest_tree_number))
    if isinstance(self._train_method, dict):
        # Two-model mode: regression for the change amount, classification
        # for the change direction.
        if self._model is not None and self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
            # Shut down the previous ANN's server before replacing it.
            self._model[self.CHANGE_AMOUNT].stop_server()
        self._model = {self.CHANGE_AMOUNT: None,
                       self.CHANGE_DIRECTION: None}

        if self._train_method[self.CHANGE_AMOUNT] == self.LINEAR_REGRESSION:
            lr = LinearRegression(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                  maxIter=self.linear_regression_training_times,
                                  regParam=self.linear_regression_regularization_parameter,
                                  predictionCol='AmountPrediction')
            self._model[self.CHANGE_AMOUNT] = lr.fit(df)
        elif self._train_method[self.CHANGE_AMOUNT] == self.RANDOM_FOREST:
            rfr = RandomForestRegressor(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                        numTrees=self.random_forest_tree_number,
                                        maxDepth=self.random_forest_tree_max_depth,
                                        predictionCol='AmountPrediction')
            self._model[self.CHANGE_AMOUNT] = rfr.fit(df)
        elif self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
            # Regression ANN: single output node.
            ann_layers[-1] = 1
            self._model[self.CHANGE_AMOUNT] = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                                      num_workers=self.spark_worker_numbers,
                                                                      epoch=self.ann_epoch_number,
                                                                      featuresCol="features",
                                                                      labelCol=self.CHANGE_AMOUNT,
                                                                      predictionCol='AmountPrediction'
                                                                      )
            self._model[self.CHANGE_AMOUNT].fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))

        if self._train_method[self.CHANGE_DIRECTION] == self.LOGISTIC_REGRESSION:
            lr = LogisticRegression(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                    maxIter=self.logistic_regression_training_times,
                                    regParam=self.linear_regression_regularization_parameter,
                                    predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = lr.fit(df)
        elif self._train_method[self.CHANGE_DIRECTION] == self.RANDOM_FOREST:
            rfc = RandomForestClassifier(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                         numTrees=self.random_forest_tree_number,
                                         maxDepth=self.random_forest_tree_max_depth,
                                         predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = rfc.fit(df)
        elif self._train_method[self.CHANGE_DIRECTION] == self.ARTIFICIAL_NEURAL_NETWORK:
            # Classification ANN: two output nodes.
            ann_layers[-1] = 2
            mlpc = MultilayerPerceptronClassifier(featuresCol="features",
                                                  labelCol=self.CHANGE_DIRECTION,
                                                  layers=ann_layers,
                                                  predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = mlpc.fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))
    else:
        # Single-model mode: predict the target price directly.
        if self._train_method == self.LINEAR_REGRESSION:
            lr = LinearRegression(featuresCol="features", labelCol=self.TARGET_PRICE,
                                  predictionCol='prediction',
                                  regParam=self.linear_regression_regularization_parameter,
                                  maxIter=self.linear_regression_training_times)
            self._model = lr.fit(df)
        elif self._train_method == self.RANDOM_FOREST:
            rfr = RandomForestRegressor(featuresCol="features", labelCol=self.TARGET_PRICE,
                                        predictionCol='prediction',
                                        numTrees=self.random_forest_tree_number,
                                        maxDepth=self.random_forest_tree_max_depth)
            self._model = rfr.fit(df)
        elif self._train_method == self.ARTIFICIAL_NEURAL_NETWORK:
            ann_layers[-1] = 1
            if self._model is not None:
                # Shut down the previous ANN's server before replacing it.
                self._model.stop_server()
            self.logger.warn('layers are {}'.format(ann_layers))
            self._model = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                  num_workers=self.spark_worker_numbers, epoch=100,
                                                  featuresCol="features", labelCol=self.TARGET_PRICE,
                                                  predictionCol='prediction'
                                                  )
            self._model.fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))

    return self._model
    # (fragment: body of a function — presumably `indexing`, called below —
    # whose `def` line is cut off at the chunk boundary)
    inputcols = ['Source', 'Side', 'Wind_Direction', 'Weather_Condition', 'Sunrise_Sunset', 'State', 'Timezone']
    # One StringIndexer per categorical column, producing "<col>_index".
    indexers = [StringIndexer(inputCol=column, outputCol=column + "_index") for column in inputcols]
    pipeline = Pipeline(stages=indexers)
    df = pipeline.fit(df).transform(df)
    # Drop the original string columns once indexed.
    df = df.drop(*inputcols)
    return df


def transform(df):
    """Assemble every column except the target 'Severity' into a feature vector."""
    cols = df.columns
    cols.remove('Severity')
    vecAssembler = VectorAssembler(inputCols=cols, outputCol="features")
    df_transformed = vecAssembler.transform(df)
    return df_transformed


def evaluate_model(df):
    """Score *df* with MulticlassClassificationEvaluator on 'Severity'.

    NOTE(review): no metricName is set, so the evaluator's default metric
    (f1) is returned, despite the variable being named accuracy_rf.
    """
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol='Severity')
    accuracy_rf = evaluator.evaluate(df)
    return accuracy_rf


preprocessed_df = preprocessing(accident_df)
indexed_df = indexing(preprocessed_df)
transformed_df = transform(indexed_df)

# Split data into Training and Testing
train, test = transformed_df.randomSplit([0.7, 0.3], seed=2000)

# Using Random Forest Algorithm
rf = RF(featuresCol='features', numTrees=12, maxDepth=16, labelCol="Severity", maxBins=150)
model_rf = rf.fit(train)

# Predicting on test data
prediction_rf = model_rf.transform(test)
accuracy = evaluate_model(prediction_rf)
print("Accuracy is ", accuracy)
def test_pyspark_classifier_decision_tree():
    """Check SHAP additivity for pyspark tree classifiers (GBT, RF, DT).

    Trains each classifier on the first 100 iris rows (a binary problem),
    then asserts that expected_value + sum(shap_values) reproduces the
    model's (normalized) raw predictions within tolerance. Skips silently
    when pyspark/sklearn/pandas are not installed.

    Fix: the failure messages previously concatenated a model object onto a
    str (`"..." + model`), which raises TypeError instead of showing the
    message; the model is now formatted with str().
    """
    try:
        import pyspark
        import sklearn.datasets
        from pyspark.sql import SparkSession
        from pyspark import SparkContext, SparkConf
        from pyspark.ml.feature import VectorAssembler, StringIndexer
        from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, GBTClassifier
        import pandas as pd
    except:
        print("Skipping test_pyspark_classifier_decision_tree!")
        return
    import shap

    # First 100 iris rows contain only two classes -> binary classification.
    iris_sk = sklearn.datasets.load_iris()
    iris = pd.DataFrame(data=np.c_[iris_sk['data'], iris_sk['target']],
                        columns=iris_sk['feature_names'] + ['target'])[:100]
    spark = SparkSession.builder.config(
        conf=SparkConf().set("spark.master", "local[*]")).getOrCreate()
    col = [
        "sepal_length",
        "sepal_width",
        "petal_length",
        "petal_width",
        "type"
    ]
    iris = spark.createDataFrame(iris, col)
    iris = VectorAssembler(inputCols=col[:-1], outputCol="features").transform(iris)
    iris = StringIndexer(inputCol="type", outputCol="label").fit(iris).transform(iris)

    classifiers = [
        GBTClassifier(labelCol="label", featuresCol="features"),
        RandomForestClassifier(labelCol="label", featuresCol="features"),
        DecisionTreeClassifier(labelCol="label", featuresCol="features")
    ]
    for classifier in classifiers:
        model = classifier.fit(iris)
        explainer = shap.TreeExplainer(model)
        X = pd.DataFrame(data=iris_sk.data, columns=iris_sk.feature_names)[:100]  # pylint: disable=E1101

        shap_values = explainer.shap_values(X)
        expected_values = explainer.expected_value

        # Recover the raw model outputs to check SHAP's additivity property.
        predictions = model.transform(iris).select("rawPrediction")\
            .rdd.map(lambda x: [float(y) for y in x['rawPrediction']]).toDF(['class0', 'class1']).toPandas()

        if str(type(model)).endswith("GBTClassificationModel'>"):
            # GBT: single-output margin; SHAP values are one array.
            diffs = expected_values + shap_values.sum(1) - predictions.class1
            assert np.max(
                np.abs(diffs)
            ) < 1e-4, "SHAP values don't sum to model output for class0!"
        else:
            # RF/DT: rawPrediction holds per-class vote counts; normalize to
            # probabilities before comparing.
            normalizedPredictions = (predictions.T / predictions.sum(1)).T
            diffs = expected_values[0] + shap_values[0].sum(
                1) - normalizedPredictions.class0
            assert np.max(
                np.abs(diffs)
            ) < 1e-4, "SHAP values don't sum to model output for class0!" + str(model)
            diffs = expected_values[1] + shap_values[1].sum(
                1) - normalizedPredictions.class1
            assert np.max(
                np.abs(diffs)
            ) < 1e-4, "SHAP values don't sum to model output for class1!" + str(model)
            assert (np.abs(expected_values - normalizedPredictions.mean()) < 1e-1).all(), "Bad expected_value!" + str(model)
    spark.stop()
# [ 2., 2., 1., 8., 197., 0., 0., 2., 3., # 1.], # [ 1., 0., 1., 0., 2., 183., 0., 1., 0., # 1.], # [ 1., 0., 0., 0., 0., 0., 192., 1., 1., # 0.], # [ 0., 0., 0., 0., 0., 0., 1., 187., 5., # 0.], # [ 0., 1., 2., 0., 0., 0., 1., 5., 172., # 4.], # [ 0., 0., 0., 0., 3., 0., 0., 2., 2., # 176.]]) #section 8.3.2 from pyspark.ml.classification import RandomForestClassifier rf = RandomForestClassifier(maxDepth=20) rfmodel = rf.fit(pendttrain) # RandomForestModel doesn't expose trees field in Python rfpredicts = rfmodel.transform(pendtvalid) rfresrdd = rfpredicts.select("prediction", "label").map(lambda row: (row.prediction, row.label)) rfmm = MulticlassMetrics(rfresrdd) rfmm.precision() #0.9894640403114979 print(rfmm.confusionMatrix()) #DenseMatrix([[ 211., 0., 1., 0., 0., 0., 0., 0., 0., # 0.], # [ 0., 220., 0., 1., 0., 0., 0., 0., 0., # 0.], # [ 0., 1., 211., 0., 0., 0., 0., 0., 0., # 0.], # [ 0., 0., 0., 175., 1., 0., 0., 0., 0.,