def test_raw_and_probability_prediction(self):
    """Check MLP prediction, probability, and rawPrediction on a fixed input.

    Trains a small multilayer perceptron on the sample multiclass data and
    verifies that transform() yields the expected class, probability vector,
    and raw prediction vector for one known feature vector.
    """
    data_path = "data/mllib/sample_multiclass_classification_data.txt"
    df = self.spark.read.format("libsvm").load(data_path)
    mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[4, 5, 4, 3],
                                         blockSize=128, seed=123)
    model = mlp.fit(df)
    test = self.sc.parallelize(
        [Row(features=Vectors.dense(0.1, 0.1, 0.25, 0.25))]).toDF()
    result = model.transform(test).head()
    expected_prediction = 2.0
    expected_probability = [0.0, 0.0, 1.0]
    expected_rawPrediction = [57.3955, -124.5462, 67.9943]
    # BUG FIX: assertTrue(a, b) treats b as the failure *message* and only
    # checks truthiness of a, so the expected value was never compared.
    # assertEqual performs the real comparison.
    self.assertEqual(result.prediction, expected_prediction)
    self.assertTrue(np.allclose(result.probability, expected_probability, atol=1E-4))
    self.assertTrue(np.allclose(result.rawPrediction, expected_rawPrediction, atol=1E-4))
def naiveBayeseian():
    """Train and evaluate a classifier on the two-class feature CSV.

    NOTE: despite the historical name, this trains a
    MultilayerPerceptronClassifier, not a naive Bayes model.
    """
    from pyspark.ml.linalg import Vectors

    def parseLine(line):
        # Parse one CSV row: first column is the label, the rest are features.
        keys = [float(x) for x in line.split(",")]
        return keys

    scdata1 = sc.textFile("/home/ubantu/TwoClassfeatureSet.csv")
    # BUG FIX: pyspark.ml estimators require a DataFrame with a vector
    # "features" column; fitting on a raw RDD of float lists fails.  Build
    # a (label, features) DataFrame explicitly.
    data = scdata1.map(parseLine).map(
        lambda keys: (keys[0], Vectors.dense(keys[1:]))).toDF(["label", "features"])
    splits = data.randomSplit([0.8, 0.2], 1234)
    train = splits[0]
    test = splits[1]
    # 30 input features, two hidden layers of 20 nodes, 2 output classes.
    layers = [30, 20, 20, 2]
    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers,
                                             blockSize=128, seed=1234)
    # train the model
    model = trainer.fit(train)
    # evaluate on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    # BUG FIX: "precision" was removed from MulticlassClassificationEvaluator
    # in Spark 2.x; "accuracy" is the supported equivalent metric.
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Accuracy:" + str(evaluator.evaluate(predictionAndLabels)))
def price_predict(path, windows=5, spark_contest=None, sql_context=None):
    """Train MLP classifiers on open/close price history and print their scores.

    path: input data file consumed by DataParser.
    windows: history window size in days.
    spark_contest / sql_context: optional pre-built Spark contexts; created
    via load_spark_context() when not supplied.
    """
    if spark_contest is None:
        spark_contest, sql_context = load_spark_context()
    input_data = DataParser(path=path, window_size=windows)
    close_train_df, close_test_df, open_train_df, open_test_df = input_data.get_n_days_history_data(
        data_type=DATA_FRAME, spark_context=spark_contest, sql_context=sql_context)
    evaluator = MulticlassClassificationEvaluator(metricName=PREDICTION)
    # handle open data
    # NOTE(review): maxIter=1 here vs. maxIter=100 for the close model below
    # looks like a leftover debug setting -- confirm it is intended.
    open_trainer = MultilayerPerceptronClassifier(maxIter=1, layers=[4, 5, 4, 3], blockSize=128,
                                                  featuresCol=FEATURES, labelCol=LABEL, seed=1234)
    open_model = open_trainer.fit(open_train_df)
    open_result = open_model.transform(open_test_df)
    open_prediction_labels = open_result.select(PREDICTION, LABEL)
    print("Precision:" + str(evaluator.evaluate(open_prediction_labels)))
    # handle close data
    close_trainer = MultilayerPerceptronClassifier(maxIter=100, layers=[4, 5, 4, 3], blockSize=128,
                                                   featuresCol=FEATURES, labelCol=LABEL, seed=1234)
    close_model = close_trainer.fit(close_train_df)
    close_result = close_model.transform(close_test_df)
    close_prediction_labels = close_result.select(PREDICTION, LABEL)
    print("Precision:" + str(evaluator.evaluate(close_prediction_labels)))
# Split data set training_df, testing_df = no_emptys_df.randomSplit([.75, .25]) # Make Spark ML pipeline using a NaiveBayes classifier (for now) hashingTF = HashingTF(inputCol='words', outputCol='word_hash', numFeatures=500) idf = IDF(minDocFreq=1, inputCol=hashingTF.getOutputCol(), outputCol='tf-idf') va = VectorAssembler(inputCols=[ 'has_link', 'verb_count', 'tf-idf', 'word_count', 'has_q', 'has_tag' ]) mp = MultilayerPerceptronClassifier( featuresCol=va.getOutputCol(), layers=[505, 250, 100, 50, 25, 10, 5, 2]) # Create param grid grid = ParamGridBuilder().addGrid(mp.maxIter, [50, 100, 200]).addGrid( mp.tol, [.0000001, .000001, .0001, .01]).addGrid(mp.stepSize, [.001, .01, .1]).build() evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction') pipeline = Pipeline(stages=[hashingTF, idf, va, mp]) cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluator,
# Preliminary analysis ################################################################# print(clean_riskdata.describe().show()) print(riskdata.stat.crosstab("bad","job").show()) print(riskdata.stat.crosstab("bad","reason").show()) ################################################################# # Multilayer Perceptron Classifier ################################################################# # specify layers for the neural network: # input layer of size 10 (features), two intermediate of size 3 and 2 # and output of size 2 (classes) layers = [10, 3, 2, 2] # create the trainer and set its parameters MLPtrainer = MultilayerPerceptronClassifier(maxIter = 100, layers = layers, labelCol = "bad", featuresCol = "predictors", predictionCol = "prediction", blockSize = 1000, seed = 1234) # train the model MLP_model = MLPtrainer.fit(train) # compute precision on the test set MLP_result = MLP_model.transform(test) MLP_predictionAndLabels = MLP_result.select("prediction", "bad") MLP_evaluator = MulticlassClassificationEvaluator(metricName="precision") #print(MLP_model) #print(str(MLP_result.show())) # Print first 20 rows result to output file (plain text) """" ################################################################# # Decision Tree Classification #################################################################
        'oh_s_gender', 'oh_s_geography', 'CreditScore', 'Age', 'Tenure',
        'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember',
        'EstimatedSalary'
    ],
    outputCol='features'))
# stage for scaling the features using MinMax scaler
stages.append(MinMaxScaler(inputCol='features', outputCol='scaledfeatures'))
# stage for MultilayerPerceptronClassifier (ANN implementation in Spark):
# 13 input features, three hidden layers of 6 neurons each, and an output
# layer of 2 neurons (one per label).  (The original comment claimed "two
# hidden layers with 5 neurons" -- corrected to match the list below.)
layers = [
    13, 6, 6, 6, 2
]
stages.append(
    MultilayerPerceptronClassifier(labelCol="s_exited",
                                   featuresCol="scaledfeatures",
                                   maxIter=200,
                                   layers=layers))
# stage for reverse indexing the prediction label back to its string form,
# reusing the labels learned at stages[0] (presumably a StringIndexer --
# confirm against the code before this chunk)
stages.append(
    IndexToString(inputCol='prediction',
                  outputCol='lab_prediction',
                  labels=stages[0].labels))
# making the pipeline model
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=stages)  # Making Pipeline
# making/Training the model using trainingData
model = pipeline.fit(trainingData)
# In[14]: #set parameters for a KNN model from pyspark.ml.classification import MultilayerPerceptronClassifier from pyspark.ml.evaluation import BinaryClassificationEvaluator from pyspark.ml.tuning import ParamGridBuilder from pyspark.ml.tuning import CrossValidator layers = [[5, 3, 2], [5, 4, 2], [5, 5, 2]] maxAccuracy = 0 bestLayer = [] for layer in layers: trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layer, blockSize=128) param = trainer.setParams(featuresCol="features", labelCol="target") #use K-Fold validation to tune the model #pyspark library grid = ParamGridBuilder().build() # .addGrid(trainer.maxIter, [0, 1]) random forest evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="target") cv = CrossValidator(estimator=trainer, estimatorParamMaps=grid, evaluator=evaluator, numFolds=5) cv.extractParamMap() cvModel = cv.fit(df_train) print(layer)
# Project train/test vectors onto the 200-dim PCA space, renaming columns
# to the (label, feature) pair the classifier is configured to read.
pca_train_result = model_200.transform(train_vectors_withlabel).selectExpr(
    'label_train as label', 'pca_vector as feature')
pca_test_result = model_200.transform(test_vectors_withlabel).selectExpr(
    'label_test as label', 'pca_vector as feature')
# define parameters
input_layer = 200  # number of features
output_layer = 10  # output 0~9
hidden_1 = 150
hidden_2 = 150
layers = [input_layer, hidden_1, hidden_2, output_layer]
MPC = MultilayerPerceptronClassifier(featuresCol='feature',
                                     labelCol='label',
                                     predictionCol='prediction',
                                     maxIter=400,
                                     layers=layers,
                                     blockSize=128,
                                     seed=123)
model = MPC.fit(pca_train_result)
result = model.transform(pca_test_result).select("label", "prediction")
# cast the double prediction to int for the downstream comparison
result_lp = result.selectExpr("label", "cast (prediction as int) prediction")
final_result = result_lp.rdd
count = final_result.count()
# calculate the accuracy (continues past this chunk)
neutral_zero_value = 0
# Split the dataset into train and test partitions (70/30, fixed seed).
splits = dataset.randomSplit([0.7, 0.3], 1234)
train = splits[0]
test = splits[1]

# Network layout: 9 input features, two hidden layers of 9, 10 output classes.
# (The original assigned this list twice on consecutive lines; the duplicate
# assignment was removed.)
layers = [9, 9, 9, 10]

now = datetime.datetime.now()
print(now.year, now.month, now.day, now.hour, now.minute, now.second)

# Build the multilayer-perceptron model, train it, and run the prediction.
mpc = MultilayerPerceptronClassifier(layers=layers,
                                     labelCol='attack_cat_index',
                                     featuresCol='features',
                                     seed=1234,
                                     predictionCol='prediction')
mpc = mpc.fit(train)

now = datetime.datetime.now()
print(now.year, now.month, now.day, now.hour, now.minute, now.second)

result = mpc.transform(test)
dataset.show(25)
result.show(25)

# Evaluate prediction accuracy against the indexed attack category.
evaluator = MulticlassClassificationEvaluator(labelCol="attack_cat_index",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(result)
print("Accuracy = {}".format(accuracy))
print("Finished randomSplit")
processing = datetime.now()
processing_time = (processing - start).seconds
print("Processing time = {}".format(processing_time))
featured_data.rdd.saveAsTextFile(sys.argv[8])

# Candidate classifiers, all predicting the EXPIRE_FLAG label.
classifiers = [
    LogisticRegression(labelCol='EXPIRE_FLAG'),
    LinearSVC(labelCol='EXPIRE_FLAG'),
    DecisionTreeClassifier(labelCol='EXPIRE_FLAG'),
    RandomForestClassifier(labelCol='EXPIRE_FLAG'),
    GBTClassifier(labelCol='EXPIRE_FLAG'),
    MultilayerPerceptronClassifier(labelCol='EXPIRE_FLAG', layers=[34, 20, 20, 2]),
    NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol='EXPIRE_FLAG')
]

# The evaluator is identical for every model, so build it once outside the
# loop instead of per iteration.
evaluator = MulticlassClassificationEvaluator(labelCol="EXPIRE_FLAG",
                                              predictionCol="prediction",
                                              metricName="accuracy")
for classifier in classifiers:
    # Fit, predict, and report accuracy for each candidate model.
    model = classifier.fit(train)
    predictions = model.transform(test)
    accuracy = evaluator.evaluate(predictions)
    # (a stray no-op expression statement `accuracy` was removed here)
    print("Model is: ", model)
    print("Accuracy is: ", accuracy)
# Build a label -> ordinal-index mapping and persist it so the same
# encoding can be reused at prediction time.
# (idiom fix: enumerate instead of range(len(...)) indexing)
for i, lab in enumerate(label):
    labelDict[lab] = i
labelValIndex = list(labelDict.items())
labelRdd = sc.parallelize(labelValIndex)
labelDF = spark.createDataFrame(labelRdd, ['secID', 'index'])
labelDF.write.save('hdfs://master:9000//test/labelIndexer_{}'.format(index),
                   format='parquet', mode='append')
# df = spark.read.format('parquet').load('hdfs://master:9000//sparkExperiment/labelIndexer/labelIndexer_60438')

# Network layout: one input per feature column, two small hidden layers,
# one output per distinct label.
inputNode = len(columnName) - 1
outputNode = len(label)
layers = [inputNode, 5, 4, outputNode]
trainer = MultilayerPerceptronClassifier(featuresCol="features", labelCol="label",
                                         maxIter=100, layers=layers,
                                         blockSize=128, seed=1234)
# Rename indexedLabel -> label so the trainer's default columns line up.
trainData = trainData.select("features", "indexedLabel").selectExpr(
    "features as features", "indexedLabel as label")
model = trainer.fit(trainData)

test = sc.textFile(
    'hdfs://master:9000//fcd/split/test/397-290_testDataSplit/testData_{}.csv'.
    format(index))
test = test.map(lambda line: line.split(','))
columnName = test.take(1)[0]
# Drop the header row, then map each row to (dense feature vector, label).
test = test.filter(lambda row: row != columnName).toDF(columnName)
test = test.rdd.map(lambda x: (Vectors.dense(x[0:-1]), x[-1])).toDF(
    ["features", "label"])
model.save('hdfs://master:9000//test/model_{}'.format(index))
pred = model.transform(test)
df_train = spark_functions.prepare_features(df_train)

# Assemble the four numeric columns into a single feature vector.
assembler = VectorAssembler(
    inputCols=['latitude', 'longitude', 'gps_height', 'construction_year'],
    outputCol="features")
scaler = StandardScaler(inputCol='features',
                        outputCol='features_scaled',
                        withStd=True,
                        withMean=False)
labelIndexer = StringIndexer(inputCol="status_group",
                             outputCol="label").fit(df_train)
mlp = MultilayerPerceptronClassifier(seed=42)
evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction",
                               outputCol="status_group_prediction",
                               labels=labelIndexer.labels)
# Grid with single-valued entries pinning the MLP config; layers 4-10-3
# matches the 4 assembled features and (presumably) 3 status_group classes
# -- confirm the class count.
param_grid = ParamGridBuilder()\
    .addGrid(assembler.outputCol, ['features'])\
    .addGrid(mlp.maxIter, [100])\
    .addGrid(mlp.layers, [[4, 10, 3]])\
    .addGrid(mlp.blockSize, [1])\
    .build()
# (statement truncated at the end of this chunk)
pipeline = Pipeline(
# make a new column with a vector of features v_assembler = VectorAssembler(inputCols=features_list, outputCol='features') return v_assembler.transform(data) if __name__ == "__main__": # create SparkSession - the entry to the cluster spark = SparkSession.builder.master("spark://192.168.50.10:7077").appName("MLP - MNIST").getOrCreate() train = prepare_mnist_data("mnist_train.csv") test = prepare_mnist_data("mnist_test.csv") mlp = MultilayerPerceptronClassifier(layers=[28*28, 50, 10]) model = mlp.fit(train) evaluator = MulticlassClassificationEvaluator(metricName="accuracy") prediction_and_labels = model.transform(train).select("prediction", "label") print("Precision train: " + str(evaluator.evaluate(prediction_and_labels))) prediction_and_labels = model.transform(test).select("prediction", "label") print("Precision test: " + str(evaluator.evaluate(prediction_and_labels)))
# Naive Bayes baseline
nb = NaiveBayes(modelType="multinomial")
nb_model = nb.fit(train_df)
nb_predictions_df = nb_model.transform(test_df)
nb_predictions_df.take(1)
nb_evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                                 predictionCol="prediction",
                                                 metricName="accuracy")
nb_accuracy = nb_evaluator.evaluate(nb_predictions_df)
print(nb_accuracy)

# Multi layer perceptron
from pyspark.ml.classification import MultilayerPerceptronClassifier
layers = [4, 5, 5, 3]  # 4 layer MLP -> 2 excluding input and output
mlp = MultilayerPerceptronClassifier(layers=layers, seed=1)
mlp_model = mlp.fit(train_df)
mlp_predictions = mlp_model.transform(test_df)
mlp_evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
mlp_accuracy = mlp_evaluator.evaluate(mlp_predictions)
print(mlp_accuracy)

# Decision trees
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = MulticlassClassificationEvaluator(metricName="accuracy",
                                                 labelCol="label",
                                                 predictionCol="prediction")
## naive bayes classifier ## logistic regression classifier from pyspark.ml.classification import NaiveBayes from pyspark.ml.classification import LogisticRegression from pyspark.ml.classification import DecisionTreeClassifier from pyspark.ml.classification import RandomForestClassifier from pyspark.ml.classification import GBTClassifier from pyspark.ml.classification import MultilayerPerceptronClassifier nb = NaiveBayes(featuresCol="featuresCol", labelCol="label") mlor = LogisticRegression(featuresCol='indexed_features', labelCol='label') dt = DecisionTreeClassifier(featuresCol="indexed_features", labelCol="label") rf = RandomForestClassifier(featuresCol="indexed_features", labelCol="label") gbt = GBTClassifier(featuresCol="indexed_features", labelCol="label") mpnn = MultilayerPerceptronClassifier(featuresCol="indexed_features", labelCol="label") nb.fit(training).transform(training).select(['prediction']).distinct().show() dt.fit(training).transform(training).select(['prediction']).distinct().show() evaluator.evaluate(nb.fit(training).transform(training)) # build parameter grid from pyspark.ml.tuning import ParamGridBuilder # param grid for naive bayes nb_param_grid = ParamGridBuilder().\ addGrid(nb.smoothing, [0, 0.5, 1, 2, 5, 10]).\ build() # param grid for logistic regression mlor_param_grid = ParamGridBuilder().\
# load the data test_df = spark.read.csv(input_path + "Test-label-28x28.csv", \ header=False, inferSchema="true").withColumnRenamed("_c0", "label") train_df = spark.read.csv(input_path + "Train-label-28x28.csv", \ header=False, inferSchema="true").withColumnRenamed("_c0", "label") ##################### Preprocessing ##################### # assembler feature_list = test_df.columns[1:] assembler = VectorAssembler(inputCols=feature_list, outputCol="features") ##################### Multilayer Perceptron ##################### # Train a MultilayerPerceptron model. layers = [784, size, 10] perceptron = MultilayerPerceptronClassifier(maxIter=100, layers=layers, \ blockSize=30, seed=1234) ##################### Pipelined Model ##################### pipeline_per = Pipeline(stages=[assembler, perceptron]) # train the model model_per = pipeline_per.fit(train_df) ##################### Prediction ##################### # make predictions result_per = model_per.transform(test_df) ##################### Evaluation ##################### # compute accuracy evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
def prediction(titanic):
    """Train and compare several Spark ML classifiers on the Titanic data.

    Builds a scaled feature vector, splits the data 70/30, runs each
    classifier (plain and cross-validated) through the run_ML* helpers,
    accumulates AUC/accuracy/best-params into a pandas DataFrame, draws a
    boxplot comparing the probability distributions, and returns the
    performance table as a Spark DataFrame.
    """
    performance = pd.DataFrame({
        'Name': ['Logistic Regression', "Logistic Regression - Cross Validation",
                 "Random forest", "Random forest - Cross Validation",
                 "Gradient-Boosted Tree Classifier",
                 "Gradient-Boosted Tree Classifier - Cross Validation",
                 "Decision Tree Classifier",
                 "Decision Tree Classifier - Cross Validation",
                 "Multilayer perceptron classifier",
                 "Multilayer perceptron classifier - Cross Validation",
                 "Naive Bayes"],
        'Test_SET (Area Under ROC)': [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        'Accuracy': [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        'Best_Param': ["", "", "", "", "", "", "", "", "", "", ""]})

    # ================================DATA PREPROCESSING==================================================
    features = ['female', 'male', 'Q', 'C', 'S', 'low', 'mid', 'Very_low',
                'very_high', 'high', 'Pclass', 'Age']
    titanic = titanic.select(F.col("Survived").alias("label"), *features)
    # Standardize features
    vectorAssembler = VectorAssembler(inputCols=features, outputCol="unscaled_features")
    standardScaler = StandardScaler(inputCol="unscaled_features", outputCol="features")
    stages = [vectorAssembler, standardScaler]
    pipeline = Pipeline(stages=stages)
    model = pipeline.fit(titanic)
    titanic = model.transform(titanic)

    # Randomly split data into training and test sets.
    # Set seed for reproducibility
    (X_train, X_test) = titanic.randomSplit([0.7, 0.3], seed=1)

    # ================================MACHINE LEARNING ALGORITHMS=========================================
    results = []
    names = []

    # Logistic Regression
    name = "Logistic Regression"
    lr = LogisticRegression(labelCol="label")
    predictions_lr, model_lr, performance = run_ML(lr, X_train, X_test, performance, name)
    performance = binaryClassificationEvaluator(predictions_lr, name, performance)
    results.append(pre_plot(predictions_lr.select("probability").toPandas()['probability']))
    names.append(name)
    # With Cross Validation
    name = "Logistic Regression - Cross Validation"
    predictions_lr_cv, model_lr_cv, performance = run_ML_regression_crossValidation(
        lr, X_train, X_test, performance, name)
    performance = binaryClassificationEvaluator(predictions_lr_cv, name, performance)
    results.append(pre_plot(predictions_lr_cv.select("probability").toPandas()['probability']))
    names.append(name)
    # ROC_Curve(model_lr)

    # Random forest
    name = "Random forest"
    rf = RandomForestClassifier(labelCol="label", featuresCol="features")
    predictions_rf, model_rf, performance = run_ML(rf, X_train, X_test, performance, name)
    performance = binaryClassificationEvaluator(predictions_rf, name, performance)
    performance = multiClassClassificationEvaluator(predictions_rf, name, performance)
    results.append(pre_plot(predictions_rf.select("probability").toPandas()['probability']))
    names.append(name)
    # With Cross Validation
    name = "Random forest - Cross Validation"
    predictions_rf_cv, model_rf_cv, performance = run_ML_random_crossValidation(
        rf, X_train, X_test, performance, name)
    performance = binaryClassificationEvaluator(predictions_rf_cv, name, performance)
    performance = multiClassClassificationEvaluator(predictions_rf_cv, name, performance)
    results.append(pre_plot(predictions_rf_cv.select("probability").toPandas()['probability']))
    names.append(name)

    # Gradient-Boosted Tree Classifier
    name = "Gradient-Boosted Tree Classifier"
    gbt = GBTClassifier(labelCol="label", featuresCol="features")
    predictions_gbt, model_gbt, performance = run_ML(gbt, X_train, X_test, performance, name)
    performance = binaryClassificationEvaluator(predictions_gbt, name, performance)
    performance = multiClassClassificationEvaluator(predictions_gbt, name, performance)
    results.append(pre_plot(predictions_gbt.select("probability").toPandas()['probability']))
    names.append(name)
    # With Cross Validation
    name = "Gradient-Boosted Tree Classifier - Cross Validation"
    predictions_gbt_cv, model_gbt_cv, performance = run_ML_gbt_crossValidation(
        gbt, X_train, X_test, performance, name)
    performance = binaryClassificationEvaluator(predictions_gbt_cv, name, performance)
    performance = multiClassClassificationEvaluator(predictions_gbt_cv, name, performance)
    results.append(pre_plot(predictions_gbt_cv.select("probability").toPandas()['probability']))
    names.append(name)

    # DecisionTree model
    name = "Decision Tree Classifier"
    dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
    predictions_dt, model_dt, performance = run_ML(dt, X_train, X_test, performance, name)
    performance = multiClassClassificationEvaluator(predictions_dt, name, performance)
    results.append(pre_plot(predictions_dt.select("probability").toPandas()['probability']))
    names.append(name)
    # With Cross Validation
    name = "Decision Tree Classifier - Cross Validation"
    # BUG FIX: the original passed `gbt` here, cross-validating the
    # gradient-boosted model under the decision-tree label; pass `dt`.
    predictions_dt_cv, model_dt_cv, performance = run_ML_dt_crossValidation(
        dt, X_train, X_test, performance, name)
    performance = multiClassClassificationEvaluator(predictions_dt_cv, name, performance)
    results.append(pre_plot(predictions_dt_cv.select("probability").toPandas()['probability']))
    names.append(name)

    # Multilayer perceptron classifier
    name = "Multilayer perceptron classifier"
    layers = [len(features), 5, 4, 3]
    mpc = MultilayerPerceptronClassifier(labelCol="label", featuresCol="features",
                                         maxIter=100, layers=layers, blockSize=128)
    predictions_mpc, model_mpc, performance = run_ML(mpc, X_train, X_test, performance, name)
    performance = multiClassClassificationEvaluator(predictions_mpc, name, performance)
    results.append(pre_plot(predictions_mpc.select("probability").toPandas()['probability']))
    names.append(name)
    # With Cross Validation
    name = "Multilayer perceptron classifier - Cross Validation"
    predictions_mpc_cv, model_mpc_cv, performance = run_ML_mpc_crossValidation(
        mpc, X_train, X_test, performance, name)
    performance = multiClassClassificationEvaluator(predictions_mpc_cv, name, performance)
    results.append(pre_plot(predictions_mpc_cv.select("probability").toPandas()['probability']))
    names.append(name)

    # Linear Support Vector Machine
    # lsvc = LinearSVC(maxIter=10, regParam=0.1)
    # run_ML(lsvc, X_train, X_test)
    # predictions_lsvc, model_lsvc = run_ML(lsvc, X_train, X_test)
    # multiClassClassificationEvaluator(predictions_lsvc, "Linear Support Vector Machine")
    # results.append(pre_plot(predictions_lsvc.select("probability").toPandas()['probability']))
    # names.append("Linear Support Vector Machine")

    # Naive Bayes
    name = "Naive Bayes"
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol="label",
                    featuresCol="features")
    predictions_nb, model_nb, performance = run_ML(nb, X_train, X_test, performance, name)
    performance = multiClassClassificationEvaluator(predictions_nb, name, performance)
    results.append(pre_plot(predictions_nb.select("probability").toPandas()['probability']))
    names.append(name)

    """ Regression obviously doesn't work here, it's a classification problem
    # Linear regression
    # linr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    # run_ML(linr, X_train, X_test)
    # predictions_linr, model_linr = run_ML(linr, X_train, X_test)
    # multiClassClassificationEvaluator(predictions_linr, "Linear regression")
    # results.append(pre_plot(predictions_linr.select("probability").toPandas()['probability']))

    # Generalized linear regression
    # glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.3)
    # run_ML(glr, X_train, X_test)
    # predictions_glr, model_glr = run_ML(glr, X_train, X_test)
    # multiClassClassificationEvaluator(predictions_glr, "Generalized linear regression")
    # results.append(pre_plot(predictions_glr.select("probability").toPandas()['probability']))

    # Decision tree regression
    dtr = DecisionTreeRegressor(featuresCol="features")
    run_ML(dtr, X_train, X_test)
    predictions_dtr, model_dtr = run_ML(dtr, X_train, X_test)
    regressionEvaluator(predictions_dtr, "Decision tree regression")
    predictions_dtr.show()
    # results.append(pre_plot(predictions_dtr.select("probability").toPandas()['probability']))

    # Random forest regression
    rfr = RandomForestRegressor(featuresCol="features")
    run_ML(rfr, X_train, X_test)
    predictions_rfr, model_rfr = run_ML(rfr, X_train, X_test)
    regressionEvaluator(predictions_rfr, "Random forest regression")
    predictions_rfr.show()
    # results.append(pre_plot(predictions_rfr.select("probability").toPandas()['probability']))

    # Gradient-boosted tree regression
    gbtr = GBTRegressor(featuresCol="features", maxIter=10)
    run_ML(gbtr, X_train, X_test)
    predictions_gbtr, model_gbt = run_ML(gbtr, X_train, X_test)
    regressionEvaluator(predictions_gbtr, "Gradient-boosted tree regression")
    predictions_gbtr.show()
    # results.append(pre_plot(predictions_gbtr.select("probability").toPandas()['probability']))

    # Survival regression
    # quantileProbabilities = [0.3, 0.6]
    # aft = AFTSurvivalRegression(quantileProbabilities=quantileProbabilities, quantilesCol="quantiles")
    # run_ML(aft, "Survival regression", X_train, X_test)
    # predictions_aft, model_aft = run_ML(aft, X_train, X_test)
    # multiClassClassificationEvaluator(predictions_aft, "Survival regression")
    # results.append(pre_plot(predictions_aft.select("probability").toPandas()['probability']))

    # Isotonic regression
    # it = IsotonicRegression()
    # run_ML(it, "Isotonic regression", X_train, X_test)
    # predictions_it, model_it = run_ML(it, X_train, X_test)
    # multiClassClassificationEvaluator(predictions_it, "Isotonic regression")
    # results.append(pre_plot(predictions_it.select("probability").toPandas()['probability']))
    """

    # ================================BOXPLOT ALGORITHM COMPARISON========================================
    fig = plt.figure()
    fig.suptitle('Algorithm Comparison')
    ax = fig.add_subplot(111)
    plt.boxplot(results)
    ax.set_xticklabels(names)
    plt.show()

    final_result = spark.createDataFrame(performance)
    return final_result
train_sents1_lower = train_sents1.withColumn('lower_sents', udf_lower('sentence1') ) # train_sents1_lower.show(5) udf_rv_punc = F.udf(remove_punctuation_re, StringType() ) train_sents1_rv_punc = train_sents1_lower.withColumn('rv_punc_sents', udf_rv_punc('lower_sents') ) tokenizer = Tokenizer(inputCol="rv_punc_sents", outputCol="tokens") remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens") w2v = Word2Vec(vectorSize=300, minCount=0, inputCol="filtered_tokens", outputCol="avg_word_embed") doc2vec_pipeline = Pipeline(stages=[tokenizer,remover,w2v]) doc2vec_model = doc2vec_pipeline.fit(train_sents1_rv_punc) doc2vecs_df = doc2vec_model.transform(train_sents1_rv_punc) w2v_train_df, w2v_test_df = doc2vecs_df.randomSplit([0.8, 0.2]) from pyspark.ml.feature import StringIndexer from pyspark.ml.classification import MultilayerPerceptronClassifier from pyspark.ml.evaluation import MulticlassClassificationEvaluator genre2label = StringIndexer(inputCol="genre", outputCol="label") rf_classifier = MultilayerPerceptronClassifier(labelCol="label", featuresCol="avg_word_embed") rf_classifier_pipeline = Pipeline(stages=[genre2label,rf_classifier]) rf_predictions = rf_classifier_pipeline.fit(w2v_train_df).transform(w2v_test_df) rf_model_evaluator = MulticlassClassificationEvaluator( \ labelCol="label", predictionCol="prediction", metricName="accuracy") accuracy = rf_model_evaluator.evaluate(rf_predictions) print("Accuracy = %g" % (accuracy))
def main():
    """Train an MLP to classify FIFA players into market-value ranges.

    Loads player documents from Mongo, casts the numeric columns to int,
    bins value_eur into 6 ordinal classes, trains a
    MultilayerPerceptronClassifier pipeline, saves the model, and prints
    the validation F1 score.
    """
    # Loading the data
    player_data = spark.read.format("mongo").options(
        collection='players2').load()

    # Conversion to integers - Mongo gets strings.  Every column receives
    # the same cast, so apply it in a loop instead of twelve copy-pasted
    # withColumn calls (behavior-identical refactor).
    int_columns = [
        'age', 'weight_kg', 'overall', 'pace', 'passing', 'physic',
        'movement_agility', 'power_stamina', 'mentality_aggression',
        'shooting', 'dribbling', 'defending',
    ]
    for col_name in int_columns:
        player_data = player_data.withColumn(
            col_name, player_data[col_name].cast(types.IntegerType()))

    # Feature Engineering
    players_data1 = player_data.select(
        'age', 'weight_kg', 'nationality', 'club', 'overall', 'potential',
        'value_eur', 'wage_eur', 'movement_agility', 'power_stamina',
        'mentality_aggression', 'pace', 'physic', 'passing', 'shooting',
        'defending', 'dribbling')
    players_data1 = players_data1.dropna()
    players_data2 = players_data1.drop('club', 'wage_eur')
    # Bin the market value (EUR) into 6 ordinal classes; values outside
    # every range fall into class 0.
    players_data2 = players_data2.withColumn(
        'value_range',
        functions.when((functions.col('value_eur').between(10000, 200000)), 1)
        .when((functions.col('value_eur').between(200000, 400000)), 2)
        .when((functions.col('value_eur').between(400000, 600000)), 3)
        .when((functions.col('value_eur').between(600000, 800000)), 4)
        .when((functions.col('value_eur').between(800000, 1000000)), 5)
        .otherwise(0))

    # ML
    train, validation = players_data2.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()
    # NOTE(review): 'passing' appears twice in this list (13 entries,
    # matching layers[0] == 13) -- possibly one was meant to be
    # 'potential'; kept as-is to preserve the trained input layout.
    feature_vector = VectorAssembler(inputCols=[
        'age', 'weight_kg', 'overall', 'pace', 'passing', 'physic',
        'movement_agility', 'power_stamina', 'mentality_aggression',
        'passing', 'shooting', 'defending', 'dribbling'
    ], outputCol='features')
    # 13 inputs, one hidden layer of 130, 6 output classes (value_range 0-5).
    classifier = MultilayerPerceptronClassifier(layers=[13, 130, 6],
                                                featuresCol='features',
                                                labelCol='value_range',
                                                maxIter=500)
    ml_pipeline = Pipeline(stages=[feature_vector, classifier])
    model = ml_pipeline.fit(train)
    model.write().overwrite().save('wage_modeller')

    prediction = model.transform(validation)
    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  labelCol='value_range',
                                                  metricName='f1')
    score = evaluator.evaluate(prediction)
    print('Validation score for new player wages: %g' % (score, ))
"""-------------------------------------------------------------------------------------------------"""
"""MULTILAYER PERCEPTRON MLP CLASSIFIER """
from pyspark.ml.classification import MultilayerPerceptronClassifier
# We will use cross validation to find the optimal hyperparameters
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

#mlpParamGrid = ParamGridBuilder()\
#    .addGrid(MultilayerPerceptronClassifier.maxIter,[100, 200, 500])\
#    .addGrid(MultilayerPerceptronClassifier.blockSize,[10,20,30])\
#    .addGrid(MultilayerPerceptronClassifier.layers, [[3,6,6,3],[3,20,20,3],[3,100,100,3]])\
#    .build()

# 3 PCA features in, two hidden layers of 20 nodes, 3 output classes.
layers = [3, 20, 20, 3]
#mlpCrossval = CrossValidator(estimator=MultilayerPerceptronClassifier(layers=layers,labelCol="label",featuresCol="pcaFeatures", solver = "l-bfgs", seed = 1234), estimatorParamMaps = mlpParamGrid, evaluator=MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label",metricName="accuracy"), numFolds = 5)

mlp = MultilayerPerceptronClassifier(blockSize=10,
                                     layers=layers,
                                     labelCol="label",
                                     featuresCol="pcaFeatures",
                                     solver="l-bfgs",
                                     seed=1234)

# create the model, timing the fit
import time
mlp_start = time.time()
mlpModel = mlp.fit(trainingData)
mlp_end = time.time()
print("MLP Classifier")
print()
print()
# Predict on the test data
mlppredictions = mlpModel.transform(testData)
mlppredictions.select("prediction", "variety", "label").collect()
.appName("Spark ML KMEANS with dataframes") \ .master("local[4]") \ .getOrCreate() data_frame = spark_session \ .read \ .format("libsvm") \ .load("data/wine.scale.txt") data_frame.printSchema() data_frame.show() (training_data, test_data) = data_frame.randomSplit([0.8, 0.2]) naiveBayes = NaiveBayes(modelType="gaussian") perceptron = MultilayerPerceptronClassifier(seed=123) paramGrid_old = ParamGridBuilder() \ .addGrid(NaiveBayes.smoothing, [0.05, 0.0, 0.1, 0.2, 0.5]) \ .build() paramGrid = ParamGridBuilder() \ .addGrid(perceptron.maxIter, [10, 30, 100, 500])\ .addGrid(perceptron.layers, [[13, 7, 5, 3], [13, 8, 4, 5, 3]])\ .build() evaluator = MulticlassClassificationEvaluator(metricName="accuracy") crossval_old = CrossValidator(estimator=naiveBayes, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)
def _train_model_spark(self, data):
    """Train the configured Spark/Keras model(s) on `data` and return them.

    If `self._train_method` is a dict it maps CHANGE_AMOUNT / CHANGE_DIRECTION
    to a per-target method, and two models are trained (a regressor for the
    amount, a classifier for the direction).  Otherwise a single model is
    trained to predict TARGET_PRICE.  Raises ValueError on an unknown method.
    """
    df = self._prepare_data_spark(data)
    # Feature count = all columns except the label/target columns.
    input_num = len(data.keys().difference({self.CHANGE_AMOUNT, self.CHANGE_DIRECTION, self.TARGET_PRICE, self.TODAY_PRICE}))
    if self.ann_hidden_nodes_num is None:
        # NOTE(review): `/` yields a float on Python 3; Spark/Keras layer sizes
        # must be ints — presumably this file targets Python 2, confirm.
        self.ann_hidden_nodes_num = input_num / 2 + 1

    # Last layer is overwritten below (1 for regression, 2 for classification).
    ann_layers = [input_num,
                  # input_num / 3 * 2,
                  # input_num / 3,
                  self.ann_hidden_nodes_num,
                  2]

    self.logger.info('layer settings are {}'.format(ann_layers))
    self.logger.info('training method is {}'.format(self._train_method))
    self.logger.info('trees num is {}'.format(self.random_forest_tree_number))
    if isinstance(self._train_method, dict):
        # Two-model mode: one model per target (amount + direction).
        # Stop a previously running Keras server before replacing the model.
        if self._model is not None and self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
            self._model[self.CHANGE_AMOUNT].stop_server()
        self._model = {self.CHANGE_AMOUNT: None,
                       self.CHANGE_DIRECTION: None}

        # --- CHANGE_AMOUNT: regression ---
        if self._train_method[self.CHANGE_AMOUNT] == self.LINEAR_REGRESSION:
            lr = LinearRegression(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                  maxIter=self.linear_regression_training_times,
                                  regParam=self.linear_regression_regularization_parameter,
                                  predictionCol='AmountPrediction')
            self._model[self.CHANGE_AMOUNT] = lr.fit(df)
        elif self._train_method[self.CHANGE_AMOUNT] == self.RANDOM_FOREST:
            rfr = RandomForestRegressor(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                        numTrees=self.random_forest_tree_number,
                                        maxDepth=self.random_forest_tree_max_depth,
                                        predictionCol='AmountPrediction')
            self._model[self.CHANGE_AMOUNT] = rfr.fit(df)
        elif self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
            # Single output unit for amount regression.
            ann_layers[-1] = 1
            self._model[self.CHANGE_AMOUNT] = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                                      num_workers=self.spark_worker_numbers,
                                                                      epoch=self.ann_epoch_number,
                                                                      featuresCol="features",
                                                                      labelCol=self.CHANGE_AMOUNT,
                                                                      predictionCol='AmountPrediction'
                                                                      )
            self._model[self.CHANGE_AMOUNT].fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))

        # --- CHANGE_DIRECTION: binary classification ---
        if self._train_method[self.CHANGE_DIRECTION] == self.LOGISTIC_REGRESSION:
            lr = LogisticRegression(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                    maxIter=self.logistic_regression_training_times,
                                    regParam=self.linear_regression_regularization_parameter,
                                    predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = lr.fit(df)
        elif self._train_method[self.CHANGE_DIRECTION] == self.RANDOM_FOREST:
            rfc = RandomForestClassifier(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                         numTrees=self.random_forest_tree_number,
                                         maxDepth=self.random_forest_tree_max_depth,
                                         predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = rfc.fit(df)
        elif self._train_method[self.CHANGE_DIRECTION] == self.ARTIFICIAL_NEURAL_NETWORK:
            # Two output units for direction (up/down) classification.
            ann_layers[-1] = 2
            mlpc = MultilayerPerceptronClassifier(featuresCol="features",
                                                  labelCol=self.CHANGE_DIRECTION,
                                                  layers=ann_layers,
                                                  predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = mlpc.fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))
    else:
        # Single-model mode: predict TARGET_PRICE directly.
        if self._train_method == self.LINEAR_REGRESSION:
            lr = LinearRegression(featuresCol="features", labelCol=self.TARGET_PRICE, predictionCol='prediction',
                                  regParam=self.linear_regression_regularization_parameter,
                                  maxIter=self.linear_regression_training_times)
            self._model = lr.fit(df)
        elif self._train_method == self.RANDOM_FOREST:
            rfr = RandomForestRegressor(featuresCol="features", labelCol=self.TARGET_PRICE, predictionCol='prediction',
                                        numTrees=self.random_forest_tree_number,
                                        maxDepth=self.random_forest_tree_max_depth)
            self._model = rfr.fit(df)
        elif self._train_method == self.ARTIFICIAL_NEURAL_NETWORK:
            ann_layers[-1] = 1
            # Stop a previously running Keras server before replacing it.
            if self._model is not None:
                self._model.stop_server()
            self.logger.warn('layers are {}'.format(ann_layers))
            self._model = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                  num_workers=self.spark_worker_numbers, epoch=100,
                                                  featuresCol="features", labelCol=self.TARGET_PRICE,
                                                  predictionCol='prediction'
                                                  )
            self._model.fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))

    return self._model
# Create PCA model (reduce to 6 principal componentes) pca = PCA(k=6, inputCol="baseFeatures", outputCol="features") timerstart = timeit.default_timer() # Reduce assembled data model = pca.fit(assembledData) reducedData = model.transform(assembledData).select("features", "label") # Specify layers for the neural network: # Input layer of size 43 (features). a hidden layer of size 23 and output of size 2 (classes) layers = [6, 4, 2] # Create the trainer and set its parameters trainer = MultilayerPerceptronClassifier(maxIter=50, layers=layers) # Split the data into training and test sets (30% held out for testing) (trainingData, testData) = reducedData.randomSplit([0.7, 0.3]) # Using PCA #(trainingData, testData) = assembledData.randomSplit([0.7, 0.3]) # Not using PCA # Train model model = trainer.fit(trainingData) timerend = timeit.default_timer() # Make predictions predictions = model.transform(testData) # Select (prediction, true label) and compute metrics f1 = MulticlassClassificationEvaluator(
# List of dataframes for each of the experts dataframes = train_data.randomSplit([1.0 for x in range(num_of_experts)], seed=1234) # Get the models for each expert using the parameters of the best model defined above print("Generating and training experts...") start = time.time() for expert in range(num_of_experts): train_data_experts, test_data_experts = dataframes[expert].randomSplit( [0.8, 0.2]) trainer = MultilayerPerceptronClassifier(maxIter=iters, layers=layers, stepSize=lr, blockSize=128, seed=1234) model = trainer.fit(train_data_experts) dict_of_models[expert] = model # Dictionary to store the predictions of the full dataset for each trained expert dict_of_predictions = dict() # Iterate through the expert and predict the values of each dataset print("Generating predictions...") for expert in range(num_of_experts): dict_of_predictions[expert] = dict_of_models[expert].transform(test_data) # Create a pandas dataframe whose columns are each predictions of each expert evaluations = pd.concat([
########################################################################### ######### Training and Test ######### print("\n======================================================= ") print("==================== NEURAL NETWORK =================== ") print("=======================================================\n") print("\n================== Training ===================\n") #training model MLP num_cols = rescaledData.select( 'features').collect()[0].features.size #vocabulary size layers = [num_cols, 100, 2] trainer_MLP = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234) model_MLP = trainer_MLP.fit(rescaledData) print("Done : Neural Network Training") print("\n========= Test on Brexit labeled data =========\n ") #MLP result_MLP = model_MLP.transform(rescaled_test_df_brexit) predictionAndLabels = result_MLP.select("prediction", "label") evaluator = MulticlassClassificationEvaluator(metricName="accuracy") accuracy_MLP = evaluator.evaluate(predictionAndLabels) print("Accuracy MLP = " + str(accuracy_MLP)) file.write("\n" + "== Results on labeled data (Brexit) ==" + "\n") file.write('-> ACCURACY MLP : ' + str(accuracy_MLP) + '\n')
# Dividimos el dataset en train y test splits = dataset.randomSplit([0.7, 0.3], 1234) train = splits[0] test = splits[1] # Especificamos las capas que tiene la red neuronal (Zeek layers = [9, 9, 9, 10]) # (Argus layers = [15,21,12,10,13,10] o layers = [16,21,12,10,13,10]) layers = [9, 9, 9, 10] # Creamos el modelo de red neuronal, lo entrenamos, lo guardamos y realizamos la prediccion now = datetime.datetime.now() print (now.year, now.month, now.day, now.hour, now.minute, now.second) mpc = MultilayerPerceptronClassifier(layers=layers, labelCol='attack_cat_index', featuresCol='features', seed=1234, predictionCol='prediction') mpc = mpc.fit(train) model_output_path = "{}/data/NeuralNetwork.bin".format( base_path) mpc.write().overwrite().save(model_output_path) now = datetime.datetime.now() print (now.year, now.month, now.day, now.hour, now.minute, now.second) result = mpc.transform(test) #Creamos una funcion para el TPR prediction_list = result.select("attack_cat_index", "prediction").toPandas()[["attack_cat_index","prediction"]].values.tolist() def truePositiveRate(list, label): tot_count = 0 true_count = 0 for a in list:
# COMPARE TO LOGISTIC REGRESSION from pyspark.ml.classification import LogisticRegression lr = LogisticRegression(maxIter=10, regParam=0.5, elasticNetParam=0.8, \ labelCol="indexed", featuresCol="pcaFeatures") lrModel = lr.fit(trainingData) #Predict on the test data lrPredictions = lrModel.transform(testData) lrPredictions.select("prediction", "indexed", "label", "pcaFeatures").collect() evaluator.evaluate(lrPredictions) # COMPARE TO NEURAL NETWORK MULTILAYER PERCEPTRON from pyspark.ml.classification import MultilayerPerceptronClassifier layers = [3, 25, 25, 2] # layers = [input_dim, internal layers, output_dim(number of classe) ] nn = MultilayerPerceptronClassifier(maxIter=100, \ layers=layers, \ blockSize=128, seed=124, labelCol="indexed", \ featuresCol="pcaFeatures") nnModel = nn.fit(trainingData) #Predict on the test data nnPredictions = nnModel.transform(testData) nnPredictions.select("prediction", "indexed", "label", "pcaFeatures").collect() evaluator.evaluate(nnPredictions) """-------------------------------------------------- Modify the code above to: - train a logistic regression with the original vars (5% significant p-value) - from the selected vars above, train 2 logistic models with regParam = [0.01 and 0.5] - train 2 random forest (number of trees = 10 and 100) - compare results """ #Create the model
# MNIST classification pipeline: read train/test CSVs from HDFS (first column
# `_c0` is the label, remaining columns are pixels), assemble features, train
# an MLP with CLI-configured hidden layers, and report test accuracy.
output_path = args.output
train_datafile = 'hdfs://soit-hdp-pro-1.ucc.usyd.edu.au/share/MNIST/Train-label-28x28.csv'
test_datafile = 'hdfs://soit-hdp-pro-1.ucc.usyd.edu.au/share/MNIST/Test-label-28x28.csv'
train_data = spark.read.csv(train_datafile, header=False, inferSchema="true")
test_data = spark.read.csv(test_datafile, header=False, inferSchema="true").repartition(16)
#Assembler: all columns except the label become the feature vector
assembler = VectorAssembler(inputCols=train_data.columns[1:], outputCol="features")
#MLP_trainer: hidden layer sizes come from the CLI as "a,b,c"
# NOTE(review): `layers` is passed as a numpy array — presumably accepted by
# the Param converter; confirm a plain list of ints isn't required.
layers = np.array(args.hiddenLayerSize.split(','), dtype=int)
trainer = MultilayerPerceptronClassifier(labelCol="_c0",featuresCol='features', \
                                         maxIter=100, layers=layers, blockSize=128,seed=1234)
#pipeline: assemble then train
pipeline = Pipeline(stages=[assembler, trainer])
pipelineFit = pipeline.fit(train_data)
prediction = pipelineFit.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol="_c0", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(prediction)
print("Predictions accuracy = %g, Test Error = %g" % (accuracy, (1.0 - accuracy)))
# Spark MLP example on the bundled multiclass sample dataset (libsvm format).
data = sqlContext.read.format("libsvm")\
    .load("data/mllib/sample_multiclass_classification_data.txt")
# Split the data into train and test
data.show()
data.printSchema()
data.select('features').show()
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
print (train.count())
train.show()
test = splits[1]
# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [4, 5, 4, 3]
# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
# train the model
model = trainer.fit(train)
# compute precision on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
# NOTE(review): metricName="precision" exists only in Spark 1.x;
# Spark >= 2.0 raises — use "accuracy" or "weightedPrecision" there.
evaluator = MulticlassClassificationEvaluator(metricName="precision")
print("Precision:" + str(evaluator.evaluate(predictionAndLabels)))
# $example off$
sc.stop()
#Creating the combined DataFrame for Phase 3 def unionAll(*dfs): return reduce(DataFrame.unionAll, dfs) finalDF = unionAll(tpDF, fp1DF, fp2DF) finalDF.count() #Training and Evalaution of Phase 3 (TData, TstData) = finalDF.randomSplit([0.7, 0.3]) layers = [28, 29, 30, 2] trainer1 = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234) pipeline4 = Pipeline(stages=[trainer1]) model4 = pipeline4.fit(TData) predict4 = model4.transform(TstData) evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy") accuracy = evaluator.evaluate(predict4) print("Accuracy = %g" % (accuracy)) predictionAndLabels = predict4.select("prediction", "label") predictionAndLabels.rdd.take(2)
# Subtracting 'train' from original 'customer_complaints_DF' to get the test set
test = customer_complaints_DF.subtract(train)

# Checking distributions of all labels in train and test sets after sampling
train.groupBy("label").count().show()
test.groupBy("label").count().show()

train = train.cache()

# Specify layers for the neural network: input layer of size lexicon_size
# (features), one intermediate layer of size (lexicon_size + 13) // 2, and an
# output layer of size 13 (one unit per class).
layers = [lexicon_size, (lexicon_size + 13) // 2, 13]

# Model definition
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)

# Fit the model on the train set, timing the training run
start_time = time.time()
model = trainer.fit(train)
time_cache = time.time() - start_time

# Compute accuracy on the test set: transform appends a 'prediction' column
# to the test DataFrame.
# BUG FIX: was the redundant double assignment `result = result = ...`.
result = model.transform(test)

# Keep the true labels and the predictions
test_set = df.subtract(train_set) # Get number of documents for each set print('\n\nSize of train set: ', train_set.count(), '\n\n') print('\n\nSize of test set: ', test_set.count(), '\n\n') # Samples per Category for each set train_set.groupBy('category').count().show() test_set.groupBy('category').count().show() # input layer:k size, output layer:unique_cat size layers = [k, 200, len(uniq)] # Trainer trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=64, seed=seed) start_time = time.time() # Train the model model = trainer.fit(train_set) print('\n\n--- Time Elapsed for Training: {:0.2f} seconds ---\n\n'.format( time.time() - start_time)) # compute accuracy on the test set result = model.transform(test_set) predictionAndLabels = result.select('prediction', 'label') evaluator = MulticlassClassificationEvaluator(metricName='accuracy') print('\nTest set accuracy = {:0.2f} %\n'.format( evaluator.evaluate(predictionAndLabels) * 100))
testData = scalerModel.transform(testData) #pdb.set_trace() #model init lr = LogisticRegression(featuresCol='scaledFeatures', maxIter=100, regParam=0.3, elasticNetParam=0.8, tol=0.0001, family="binomial") dt = DecisionTreeClassifier(featuresCol='scaledFeatures', seed=seed) rf = RandomForestClassifier(featuresCol='scaledFeatures', seed=seed) GBDT = GBTClassifier(featuresCol='scaledFeatures', seed=seed) layers = [feature_number, 10, 5, 2] mlp = MultilayerPerceptronClassifier(featuresCol='scaledFeatures', layers=layers, seed=seed) svm = LinearSVC(featuresCol='scaledFeatures', regParam=0.1) nb = NaiveBayes(featuresCol='scaledFeatures', smoothing=1.0) #model training and testing functions def LR(trainingData, testData): Model = lr.fit(trainingData) results = Model.transform(testData) label = results.select("label").toPandas().values predict = results.select("prediction").toPandas().values np.savetxt('res/predictedLR_spark.txt', predict, fmt='%01d') print("[accuracy,precision,recall,f1]")
def main(argv):
    """Train and evaluate several Spark ML classifiers on a parquet dataset.

    argv[0] is the parquet input path; argv[1] names the label column.
    The dataset is downsampled so classes are balanced, then each classifier
    is wrapped in SparkMultiClassClassifier, fitted, and its metrics printed
    as 'name<TAB>value' lines.
    """
    # Name of prediction column
    label = argv[1]

    start = time.time()

    spark = SparkSession.builder \
        .master("local[*]") \
        .appName("datasetClassifier") \
        .getOrCreate()

    data = spark.read.parquet(argv[0]).cache()

    vector = data.first()["features"]
    featureCount = len(vector)
    print(f"Feature count : {featureCount}")

    classCount = int(data.select(label).distinct().count())
    print(f"Class count : {classCount}")

    print(f"Dataset size (unbalanced) : {data.count()}")
    data.groupby(label).count().show(classCount)
    # Downsample so every class contributes (roughly) equally.
    data = datasetBalancer.downsample(data, label, 1)
    print(f"Dataset size (balanced) : {data.count()}")
    data.groupby(label).count().show(classCount)

    testFraction = 0.3
    seed = 123

    # MLP needs explicit layer sizes: input = feature count, output = class count.
    layers = [featureCount, 10, classCount]
    mpc = MultilayerPerceptronClassifier().setLayers(layers) \
        .setBlockSize(128) \
        .setSeed(1234) \
        .setMaxIter(200)

    # DRY: the fit-and-print pattern was copy-pasted once per classifier
    # (with a 'matrics' typo); run them all through one helper instead.
    for estimator in (DecisionTreeClassifier(),
                      RandomForestClassifier(),
                      LogisticRegression(),
                      mpc):
        _fit_and_report(estimator, data, label, testFraction, seed)

    end = time.time()
    print("Time: %f sec." % (end - start))


def _fit_and_report(estimator, data, label, testFraction, seed):
    """Wrap *estimator* in SparkMultiClassClassifier, fit it, print metrics."""
    mcc = SparkMultiClassClassifier(estimator, label, testFraction, seed)
    metrics = mcc.fit(data)
    for k, v in metrics.items():
        print(f"{k}\t{v}")
predictionsPath)
    spark.stop()
    data["predictionsPath"] = predictionsPath
elif config["estimatorType"] == "mpc":
    # MultilayerPerceptronClassifier branch: train/test parquet paths come
    # from the job `data` dict, hyperparameters from the `config` dict.
    train, test = spark.read.parquet(
        data["currentTrain"]), spark.read.parquet(data["currentTest"])
    train.cache()
    test.cache()
    # `layers` arrives as a comma-separated string, e.g. "784,100,10".
    classifier = MultilayerPerceptronClassifier(
        featuresCol=config["featuresCol"],
        labelCol=config["labelCol"],
        maxIter=config["maxIter"],
        layers=[int(x.strip()) for x in config["layers"].split(",")],
        blockSize=config["blockSize"],
        seed=config["seed"])
    # Fit the model
    model = classifier.fit(train)
    predictions = model.transform(test)
    # Predictions are written under <scheme>://<save>/predictions/.
    predictionsPath = data['scheme'] + "://" + data[
        'save'] + "/predictions/"
    if "partitionCol" in data and data[
            'partitionCol'] in predictions.schema.names:
        test.write.partitionBy(data['partitionCol']).format(
def mpc_core(df, condition):
    """Train a Spark MultilayerPerceptronClassifier and persist the model.

    :param df: input DataFrame holding the label and feature columns.
    :param condition: dict of training settings, for example::

        {
            "label": "标签",
            "features": ["数量", "折扣", "利润", "装运成本"],
            "iterations": 20,
            "seed": 1,
            "layers": [4, 2, 2],
            "stepSize": 0.03,
            "tol": 0.000001,
            "blockSize": 128,
            "solver": "l-bfgs"
        }

    :return: filesystem path the trained model was saved under.
    """
    # MultilayerPerceptronClassifier defaults, for reference:
    # maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128,
    # stepSize=0.03, solver="l-bfgs"
    label_index = condition['label']        # label column (name or index)
    feature_indexs = condition['features']  # feature columns (names or indices)
    iterations = condition['iterations']    # max number of iterations (default 100)
    tol = condition['tol']                  # convergence tolerance, >= 0 (default 1e-06)
    seed = condition['seed']                # random seed
    layers = condition[
        'layers']                           # sizes of layers, input through output
    blockSize = condition[
        'blockSize']                        # block size for stacking input data in matrices
    stepSize = condition['stepSize']        # step size (default 0.03)
    solver = condition['solver']            # optimizer: "l-bfgs" (default) or "gd"

    # Coerce string-typed settings (e.g. from a web request) to numbers.
    if isinstance(iterations, str):
        iterations = int(iterations)
    if isinstance(tol, str):
        tol = float(tol)
    if isinstance(seed, str):
        seed = int(seed)
    if isinstance(layers, list):
        # Build a new list instead of mutating the caller's condition dict.
        layers = [int(size) if isinstance(size, str) else size
                  for size in layers]
    if isinstance(blockSize, str):
        blockSize = int(blockSize)
    if isinstance(stepSize, str):
        stepSize = float(stepSize)

    # 1. Prepare the data: one Row per input row with a dense feature vector.
    def func(x):
        features_data = []
        for feature in feature_indexs:
            features_data.append(x[feature])
        return Row(label=x[label_index], features=Vectors.dense(features_data))

    training_set = df.rdd.map(lambda x: func(x)).toDF()

    # 2. Train the model.
    mpc_param = MultilayerPerceptronClassifier(maxIter=iterations, tol=tol,
                                               seed=seed, layers=layers,
                                               blockSize=blockSize,
                                               stepSize=stepSize,
                                               solver=solver)
    mpc_model = mpc_param.fit(training_set)

    # 3. Save the model under a fresh UUID.
    mpc_model_path = model_url() + '/mpc/' + str(uuid.uuid1())
    deltree(mpc_model_path)  # remove a pre-existing model directory, if any
    mpc_model.write().overwrite().save(mpc_model_path)
    return mpc_model_path