def clasificador_PerceptronMulticapa(dataFrame, capas, NumIter, TamLote):
    """Train a multilayer perceptron and print accuracy and AUC on a test split.

    Parameters:
        dataFrame: DataFrame with 'label' and 'features' columns.
        capas: list of layer sizes (input, hidden..., output).
        NumIter: maximum number of training iterations.
        TamLote: block size for stacking input data in matrices.
    """
    # Split into training and test sets (fixed seed for reproducibility).
    trainData, testData = dataFrame.randomSplit([0.7, 0.3], 1234)
    # Layers of the neural network.
    layers = capas
    # Build the trainer with the requested hyper-parameters.
    trainer = MultilayerPerceptronClassifier(maxIter=NumIter, layers=layers,
                                             blockSize=TamLote, seed=1234)
    # Fit the model.
    model = trainer.fit(trainData)
    # Compute accuracy on the test set.
    result = model.transform(testData)
    predictionAndLabels = result.select('prediction', 'label')
    accuracy_evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
    accuracy = accuracy_evaluator.evaluate(predictionAndLabels)
    print('Test Error = %g ' % (1.0 - accuracy))
    print('Accuracy = ', accuracy)
    # Compute AUC. Fix: use the continuous 'rawPrediction' scores rather than
    # the hard 0/1 'prediction' column (which yields a degenerate ROC curve),
    # and reuse `result` instead of transforming the test set a second time.
    auc_evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
    evaluation = auc_evaluator.evaluate(result)
    print('AUC:', evaluation)
    print('Perceptron Multicapa: maxIter:' + str(NumIter) + ' Layers: ' +
          str(layers) + ' blockSize: ' + str(TamLote))
def test_mlp_classification_summary(self):
    """Verify the MLP classification training summary exposes the expected
    metrics with the expected types, and that evaluate() on the training
    data reproduces them."""
    # Four-row, two-feature binary dataset (XOR-like labels).
    df = self.spark.createDataFrame([(0.0, Vectors.dense([0.0, 0.0])),
                                     (1.0, Vectors.dense([0.0, 1.0])),
                                     (1.0, Vectors.dense([1.0, 0.0])),
                                     (0.0, Vectors.dense([1.0, 1.0]))
                                     ], ["label", "features"])
    mlp = MultilayerPerceptronClassifier(layers=[2, 2, 2], seed=123)
    model = mlp.fit(df)
    self.assertTrue(model.hasSummary)
    # NOTE(review): `summary` is a property on several PySpark model classes;
    # confirm that calling it as a method is correct for the targeted version.
    s = model.summary()
    # test that api is callable and returns expected types
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.predictionCol, "prediction")
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.labels, list))
    self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
    self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
    self.assertTrue(isinstance(s.precisionByLabel, list))
    self.assertTrue(isinstance(s.recallByLabel, list))
    self.assertTrue(isinstance(s.fMeasureByLabel(), list))
    self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
    # The tiny dataset is perfectly separable, so weighted metrics are ~1.
    self.assertAlmostEqual(s.accuracy, 1.0, 2)
    self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2)
    self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2)
    self.assertAlmostEqual(s.weightedRecall, 1.0, 2)
    self.assertAlmostEqual(s.weightedPrecision, 1.0, 2)
    self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2)
    self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2)
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertTrue(isinstance(sameSummary, MultilayerPerceptronClassificationSummary))
    self.assertAlmostEqual(sameSummary.accuracy, s.accuracy)
def GetFScore(self, i, ratio):
    """Train and score a classifier on "dataset.csv".

    If i > 0 the work is delegated to self.BC on an RDD of LabeledPoints;
    otherwise an MLP is trained on a DataFrame split and its accuracy on the
    held-out portion is returned.

    Parameters:
        i: classifier selector; values > 0 are forwarded to self.BC.
        ratio: fraction of rows used for training.
    """
    spark = SparkSession.builder.getOrCreate()
    sc = spark.sparkContext
    maldataset = sc.textFile("dataset.csv")
    # Drop the CSV header, then parse the remaining lines as CSV rows.
    trainHeader = maldataset.first()
    maldataset = maldataset.filter(lambda line: line != trainHeader ).mapPartitions(lambda x: csv.reader(x))
    maldataset = maldataset.map(lambda l: self.toint(l))
    # (label, features) pairs for the DataFrame-based MLP path below.
    df = maldataset.map(lambda l: (l[-1], Vectors.dense(l[0:-1])))
    # LabeledPoint RDD for the self.BC path (last column is the label).
    maldataset = maldataset.map( lambda line: LabeledPoint(line[-1], [line[0:len(line) - 1]]))
    trainData, testData = maldataset.randomSplit([ratio, 1 - ratio])
    if i > 0:
        return self.BC(trainData, testData, i)
    df = spark.createDataFrame(df.collect(), ["label", "features"])
    splits = df.randomSplit([ratio, 1 - ratio], 1234)
    train = splits[0]
    test = splits[1]
    # NOTE(review): the last entry of `layers` is the output layer; 100 output
    # units implies 100 classes -- confirm this matches the label cardinality.
    mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[35, 100, 100], blockSize=1, seed=123)
    model = mlp.fit(train)
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    # NOTE(review): despite the function name, this returns accuracy, not an F-score.
    return evaluator.evaluate(predictionAndLabels)
def test_raw_and_probability_prediction(self):
    """MLP model should emit the expected prediction, probability and
    rawPrediction values for a known input row."""
    data_path = "data/mllib/sample_multiclass_classification_data.txt"
    df = self.spark.read.format("libsvm").load(data_path)
    mlp = MultilayerPerceptronClassifier(
        maxIter=100, layers=[4, 5, 4, 3], blockSize=128, seed=123
    )
    model = mlp.fit(df)
    test = self.sc.parallelize([Row(features=Vectors.dense(0.1, 0.1, 0.25, 0.25))]).toDF()
    result = model.transform(test).head()
    expected_prediction = 2.0
    expected_probability = [0.0, 0.0, 1.0]
    expected_rawPrediction = [-11.6081922998, -8.15827998691, 22.17757045]
    # Fix: assertTrue(x, y) treats y as a failure message and passes for any
    # truthy x; an equality assertion was intended.
    self.assertEqual(result.prediction, expected_prediction)
    self.assertTrue(np.allclose(result.probability, expected_probability, atol=1e-4))
    # Use `assert_allclose` to show the value of `result.rawPrediction` in the assertion error
    # message
    np.testing.assert_allclose(
        result.rawPrediction,
        expected_rawPrediction,
        rtol=0.3,
        # Use the same default value as `np.allclose`
        atol=1e-08,
    )
def TrainMLP(trainingData, testData, layers):
    """Fit a multilayer perceptron and report its test-set accuracy.

    `layers` lists the network layer sizes: input (features), hidden layers,
    and output (classes). Returns the fitted model.
    """
    classifier = MultilayerPerceptronClassifier(
        maxIter=100, layers=layers, blockSize=128)
    # Time the fit so training cost shows up in the logs.
    t0 = time.time()
    fitted = classifier.fit(trainingData)
    t1 = time.time()
    print('Training MLP model took', t1 - t0)
    # Score the held-out data and measure accuracy.
    scored = fitted.transform(testData)
    acc_evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy")
    acc = acc_evaluator.evaluate(scored)
    print("Test Error = %g, accuracy = %g" % (1.0 - acc, acc))
    return fitted
def main(args):
    """Benchmark entry point: train an MLP on a LIBSVM dataset and record
    elapsed time.

    args[1] -- application name, args[2] -- Spark master URL,
    args[3] -- path to the LIBSVM input file.
    """
    spark=SparkSession\
        .builder\
        .master(args[2])\
        .appName(args[1])\
        .getOrCreate()
    start_computing_time = time.time()
    # Load the data stored in LIBSVM format as a DataFrame.
    data = spark.read.format("libsvm").load(args[3])
    (trainingData, testData) = data.randomSplit([0.7, 0.3],seed=1234)
    # specify layers for the neural network:
    # input layer of size 4 (features), two intermediate of size 5 and 4
    # and output of size 3 (classes)
    layers = [4, 5, 4, 3]
    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
    # train the model
    model = trainer.fit(trainingData)
    # Transform the test set; the output is not evaluated here -- only the
    # elapsed computation time is recorded for the benchmark.
    result = model.transform(testData)
    appendTime(sys.argv,start_computing_time)
    spark.stop()
def mpc(ss, data, label_index, feature_indexs, project_url):
    """Train, persist, reload and apply a MultilayerPerceptronClassifier.

    Parameters:
        ss: SparkSession (kept for interface compatibility; unused here).
        data: input DataFrame whose rows are addressed positionally.
        label_index: positional index of the label value in each row.
        feature_indexs: positional indexes of the feature columns.
        project_url: base path under which the model is saved.
    """
    # 1. Build the training set of (label, features) rows.
    def func(x):
        features_data = []
        for feature in feature_indexs:
            features_data.append(x[feature])
        # Fix: use the row's value at label_index as the label; the original
        # stored the column index itself as the label for every row.
        return Row(label=x[label_index], features=Vectors.dense(features_data))
    training_set = data.rdd.map(lambda x: func(x)).toDF()

    # 2. Train the model.
    # maxIter=100, tol=1e-6, seed=None, layers=None, blockSize=128, stepSize=0.03, solver="l-bfgs", initialWeights=None
    mpc_param = MultilayerPerceptronClassifier(maxIter=100, tol=1e-6, blockSize=128,
                                               stepSize=0.03, solver="l-bfgs")
    mpc_param.setSeed(1)
    mpc_param.setLayers([4, 2, 2])
    mpc_model = mpc_param.fit(training_set)

    # 3. Save the model.
    model_path = project_url + '/model/multipleClassification/mpc'
    mpc_model.write().overwrite().save(model_path)

    # 4. Reload the model from disk.
    mpc2 = MultilayerPerceptronClassificationModel.load(model_path)

    # 5. Predict (show() prints the frame and returns None).
    result = mpc2.transform(training_set).select("prediction", "features").show()
def clasificar_chi2():
    """Read the oncology dataset, select the top-3 features by chi-squared,
    train an MLP on them and report test accuracy."""
    # Read the data and cast every column value to float.
    conf = SparkConf().setAppName("NN_1").setMaster("local")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    rdd = sqlContext.read.csv(
        "/home/ulima-azure/data/Enfermedad_Oncologica_T3.csv", header=True).rdd
    rdd = rdd.map(lambda x: (float(x[0]), float(x[1]), float(x[2]), float(x[
        3]), float(x[4]), float(x[5]), float(x[6]), float(x[7]), float(x[8]),
        float(x[9])))
    df = rdd.toDF([
        "Cellenght", "Cellsize", "Cellshape", "mgadhesion", "sepics",
        "bnuclei", "bchromatin", "nucleos", "mitoses", "P_Benigno"
    ])
    # Assemble the candidate feature columns into a single vector.
    assembler = VectorAssembler(inputCols=[
        "Cellenght", "Cellsize", "Cellshape", "nucleos", "bchromatin",
        "mitoses"
    ], outputCol="featuresChi2")
    df_chi2 = assembler.transform(df)
    df_chi2 = df_chi2.select("featuresChi2", "P_Benigno")
    # Keep the 3 features most associated with the label (chi-squared test).
    selector = ChiSqSelector(numTopFeatures=3, featuresCol="featuresChi2",
                             labelCol="P_Benigno", outputCol="featuresSelected")
    df_result = selector.fit(df_chi2).transform(df_chi2)
    # Split the data into training and test sets.
    (df_training, df_test) = df_result.randomSplit([0.7, 0.3])
    # Network architecture (hyperparameter): 3 selected inputs, 2 classes.
    capas = [3, 4, 6, 2]
    # Build the trainer. Hyperparameter: maxIter.
    entrenador = MultilayerPerceptronClassifier(featuresCol="featuresSelected",
                                                labelCol="P_Benigno",
                                                maxIter=1000, layers=capas)
    # Train the model.
    modelo = entrenador.fit(df_training)
    # Validate the model on the held-out data.
    df_predictions = modelo.transform(df_test)
    evaluador = MulticlassClassificationEvaluator(labelCol="P_Benigno",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluador.evaluate(df_predictions)
    print(f"Accuracy: {accuracy}")
    df_predictions.select("prediction", "rawPrediction", "probability").show()
    # Show how many 0s and 1s were predicted.
    df_predictions.groupby('prediction').count().show()
def neuralNetwork_model(train, x, y, feature_count):
    """Fit an MLP whose hidden layers scale with the feature count.

    Architecture: [features, 3*features, 2*features, 2] for a binary output.
    Returns the fitted model.
    """
    topology = [feature_count, feature_count * 3, feature_count * 2, 2]
    classifier = MultilayerPerceptronClassifier(
        featuresCol=x,
        labelCol=y,
        maxIter=100,
        layers=topology,
        blockSize=512,
        seed=12345,
    )
    return classifier.fit(train)
def _get_mlp_model(feat_train):
    """Fit a small gradient-descent MLP on the prepared training features.

    Relies on the module-level `num_features` global for the input layer
    width; architecture is [num_features, 10, 10, 2] (binary output).
    """
    from pyspark.ml.classification import MultilayerPerceptronClassifier
    global num_features
    layers = [num_features, 10, 10, 2]
    # Few iterations (10) with plain gradient descent and a small step size.
    mlp_trainer = MultilayerPerceptronClassifier(maxIter=10, layers=layers,
                                                 seed=123, stepSize=0.005,
                                                 solver='gd',
                                                 featuresCol="features",
                                                 labelCol="label")
    mlp_model = mlp_trainer.fit(feat_train)
    return mlp_model
def make_model(train,val): layers = [100, 100, 2] # create the trainer and set its parameters trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234) model = trainer.fit(train) result = model.transform(val) predictionAndLabels = result.select("prediction", "label") #predictionAndLabels.where(predictionAndLabels['prediction'] == 0 ).show() evaluator = MulticlassClassificationEvaluator(metricName="accuracy") print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels))) #save model mlp_path = "s3://projfakenews/mlp" model.save(mlp_path)
def price_predict(path, windows=5, spark_contest=None, sql_context=None):
    """Train two MLPs on windowed price history (open and close series) and
    print an evaluation metric for each.

    Parameters:
        path: input data location consumed by DataParser.
        windows: history window size in days.
        spark_contest / sql_context: optional pre-built Spark handles;
            created via load_spark_context() when absent.
    """
    if spark_contest is None:
        spark_contest, sql_context = load_spark_context()
    input_data = DataParser(path=path, window_size=windows)
    close_train_df, close_test_df, open_train_df, open_test_df = input_data.get_n_days_history_data(
        data_type=DATA_FRAME, spark_context=spark_contest, sql_context=sql_context)
    # NOTE(review): metricName is set from the PREDICTION constant -- confirm
    # it resolves to a valid MulticlassClassificationEvaluator metric name.
    evaluator = MulticlassClassificationEvaluator(metricName=PREDICTION)

    # handle open data
    # NOTE(review): maxIter=1 here vs 100 for the close model -- confirm this
    # is intentional and not a leftover debug setting.
    open_trainer = MultilayerPerceptronClassifier(maxIter=1, layers=[4, 5, 4, 3],
                                                  blockSize=128, featuresCol=FEATURES,
                                                  labelCol=LABEL, seed=1234)
    open_model = open_trainer.fit(open_train_df)
    open_result = open_model.transform(open_test_df)
    open_prediction_labels = open_result.select(PREDICTION, LABEL)
    print("Precision:" + str(evaluator.evaluate(open_prediction_labels)))

    # handle close data
    close_trainer = MultilayerPerceptronClassifier(maxIter=100, layers=[4, 5, 4, 3],
                                                   blockSize=128, featuresCol=FEATURES,
                                                   labelCol=LABEL, seed=1234)
    close_model = close_trainer.fit(close_train_df)
    close_result = close_model.transform(close_test_df)
    close_prediction_labels = close_result.select(PREDICTION, LABEL)
    print("Precision:" + str(evaluator.evaluate(close_prediction_labels)))
def perceptron_multicapa(train, test, capas, num_iter, tamlot):
    """Train an MLP with the given layer sizes and return its test accuracy.

    Parameters:
        train / test: DataFrames with 'label' and 'features' columns.
        capas: list of layer sizes.
        num_iter: maximum training iterations.
        tamlot: block size for input stacking.
    """
    modelo = MultilayerPerceptronClassifier(
        maxIter=num_iter,
        layers=capas,
        blockSize=tamlot,
        seed=13,
    ).fit(train)
    # Score the held-out set and compute accuracy on it.
    predicciones = modelo.transform(test).select('prediction', 'label')
    evaluador = MulticlassClassificationEvaluator(metricName='accuracy')
    return evaluador.evaluate(predicciones)
def MLPclf(trainingData, testData):
    """Train a gradient-descent MLP, write predictions to disk and return
    evaluation metrics as [accuracy, precision, recall, f1].

    NOTE(review): `layers` is read from enclosing/module scope -- it must be
    defined before this function is called.
    """
    mlp = MultilayerPerceptronClassifier().setFeaturesCol(
        "features").setLabelCol("label").setLayers(layers).setSolver(
        "gd").setStepSize(0.3).setMaxIter(1000)
    mlpModel = mlp.fit(trainingData)
    results = mlpModel.transform(testData)
    # Collect labels and predictions locally for the custom evaluator.
    label = results.select("label").toPandas().values
    predict = results.select("prediction").toPandas().values
    np.savetxt('res/predictedMLP_spark.txt', predict, fmt='%01d')
    print("[accuracy,precision,recall,f1]")
    # print(evaluate(label,predict))
    return evaluate(label, predict)
def test_raw_and_probability_prediction(self):
    """MLP model should emit the expected prediction, probability and
    rawPrediction values for a known input row."""
    data_path = "data/mllib/sample_multiclass_classification_data.txt"
    df = self.spark.read.format("libsvm").load(data_path)
    mlp = MultilayerPerceptronClassifier(maxIter=100, layers=[4, 5, 4, 3],
                                         blockSize=128, seed=123)
    model = mlp.fit(df)
    test = self.sc.parallelize([Row(features=Vectors.dense(0.1, 0.1, 0.25, 0.25))]).toDF()
    result = model.transform(test).head()
    expected_prediction = 2.0
    expected_probability = [0.0, 0.0, 1.0]
    expected_rawPrediction = [57.3955, -124.5462, 67.9943]
    # Fix: assertTrue(x, y) only checks the truthiness of x (y is the failure
    # message); an equality assertion was intended.
    self.assertEqual(result.prediction, expected_prediction)
    self.assertTrue(np.allclose(result.probability, expected_probability, atol=1E-4))
    self.assertTrue(np.allclose(result.rawPrediction, expected_rawPrediction, atol=1E-4))
def run_nlp(train_df, test_df):
    """Train an MLP text classifier and print its test-set accuracy.

    The input layer width is read from the 'features' column metadata, so the
    network adapts to the upstream featurization. Relies on a module-level
    `seed`. Returns the fitted model.
    """
    # len(features)
    # layers = [21, 80, 3]
    layers = [train_df.schema["features"].metadata["ml_attr"]["num_attrs"],
              160, 150, 50, 10, 2]
    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(featuresCol='features', labelCol='label',
                                             predictionCol='prediction', maxIter=200,
                                             layers=layers, stepSize=0.0003,
                                             blockSize=30, tol=0.00001, seed=seed)
    # train the model
    model = trainer.fit(train_df)
    # compute accuracy on the test set
    predictTest = model.transform(test_df)
    predictionAndLabels = predictTest.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Test set accuracy of MLP = " + str(evaluator.evaluate(predictionAndLabels)))
    return model
def test_multilayer_load(self):
    """Persisted MLP models should round-trip through save/load with the
    solver setting and transform output preserved."""
    import shutil
    df = self.spark.createDataFrame([(0.0, Vectors.dense([0.0, 0.0])),
                                     (1.0, Vectors.dense([0.0, 1.0])),
                                     (1.0, Vectors.dense([1.0, 0.0])),
                                     (0.0, Vectors.dense([1.0, 1.0]))],
                                    ["label", "features"])
    mlp = MultilayerPerceptronClassifier(layers=[2, 2, 2], seed=123)
    model = mlp.fit(df)
    self.assertEqual(model.getSolver(), "l-bfgs")
    transformed1 = model.transform(df)
    path = tempfile.mkdtemp()
    try:
        model_path = path + "/mlp"
        model.save(model_path)
        model2 = MultilayerPerceptronClassificationModel.load(model_path)
        self.assertEqual(model2.getSolver(), "l-bfgs")
        transformed2 = model2.transform(df)
        self.assertEqual(transformed1.take(4), transformed2.take(4))
    finally:
        # Fix: remove the temporary directory so repeated test runs do not
        # leak disk space.
        shutil.rmtree(path, ignore_errors=True)
def MLP_train(training):
    """Train a feed-forward network on the normalized tweet-term training set.

    The input layer width is taken from the feature vector size (vocabulary
    size); one hidden layer of 100 units feeds a binary output layer.
    Returns the fitted model.
    """
    # Vocabulary size == width of the first row's feature vector.
    vocab_size = training.select('features').collect()[0].features.size
    architecture = [vocab_size, 100, 2]
    trainer = MultilayerPerceptronClassifier(
        maxIter=100,
        layers=architecture,
        blockSize=128,
        seed=1234,
    )
    return trainer.fit(training)
def train_model(train_data, num_features):
    """Train the multilayer perceptron model.

    Params:
    - train_data (pyspark.rdd.RDD): The training dataset partition
    - num_features (int): The number of features

    Returns:
    - model (pyspark.ml.MultilayerPerceptronModel): The trained MLP model
    """
    # Architecture: input layer sized to the feature set, hidden layers of
    # 100/50/10 units, binary output. blockSize=1 disables input stacking.
    multilayer_perceptron = MultilayerPerceptronClassifier(
        blockSize=1,
        featuresCol="presence_feature_set",
        labelCol="label",
        predictionCol="prediction",
        layers=[num_features, 100, 50, 10, 2])
    model = multilayer_perceptron.fit(train_data)
    return model
def multilayer_perceptron_classify(comment_preprocessed):
    """Train and evaluate an MLP over preprocessed comment vectors.

    Creates its own SparkContext, splits the data 70/30, trains, runs the
    shared evaluation helper and then the external-tool comparison.
    """
    sc = SparkContext(appName="Classification")
    sql_context = SQLContext(sc)
    data = sql_context.createDataFrame(comment_preprocessed)
    train, test = data.randomSplit([0.7, 0.3], 1234)
    # Input width taken from the first row's feature vector; one hidden
    # layer of 11 units (see sizing note below), binary output.
    layers = [len(comment_preprocessed[0].features), 11, 2]
    # sqrt(2000) = 45, sqrt(4000) = 63, log(2000, 2) = 11
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers,
                                             blockSize=128, seed=1234)
    model = trainer.fit(train)
    predictions = model.transform(test)
    evaluate_classification(predictions)
    time.sleep(1)
    # predict_comment(sql_context, model)
    compare_classification_with_tool(sql_context, model)
def naiveBayeseian():
    """Train an MLP on a two-class CSV feature set and print test precision.

    NOTE(review): despite the name, this trains a multilayer perceptron, not
    a naive Bayes model.
    """
    def parseLine(line):
        # Each CSV line is a comma-separated list of floats.
        keys = [float(x) for x in line.split(",")]
        #return LabeledPoint(keys[0],keys[1:])
        return keys
    scdata1 = sc.textFile("/home/ubantu/TwoClassfeatureSet.csv")
    data= scdata1.map(parseLine)
    splits = data.randomSplit([0.8, 0.2], 1234)
    train = splits[0]
    test = splits[1]
    # 30 input features, two hidden layers of 20 units, binary output.
    layers = [30, 20, 20, 2]
    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
    # train the model
    model = trainer.fit(train)
    # compute precision on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    # NOTE(review): metricName="precision" only exists on Spark 1.x; Spark 2+
    # removed it in favor of "accuracy"/"weightedPrecision" -- confirm the
    # targeted Spark version.
    evaluator = MulticlassClassificationEvaluator(metricName="precision")
    print("Precision:" + str(evaluator.evaluate(predictionAndLabels)))
def multilayer_perceptron_classifier(trainingDataFrame, maxIter=100, tol=1e-6,
                                     seed=None, layers=None, blockSize=128,
                                     stepSize=0.03, solver="gd",
                                     initialWeights=None):
    """Fit a MultilayerPerceptronClassifier and return it wrapped in a dict.

    Every keyword argument is forwarded verbatim to the underlying estimator;
    the fitted model is returned under the "model" key.
    """
    estimator = MultilayerPerceptronClassifier(
        maxIter=maxIter,
        tol=tol,
        seed=seed,
        layers=layers,
        blockSize=blockSize,
        stepSize=stepSize,
        solver=solver,
        initialWeights=initialWeights,
    )
    fitted = estimator.fit(trainingDataFrame)
    return {"model": fitted}
def train_evaluate(train, test, hidden_layers, num_columns, num_classes, labelCol):
    """Train an MLP and return its accuracy on the test set.

    The input layer has num_columns - 1 units (all columns except the label),
    followed by the given hidden layers and an output layer of at least 2
    units.
    """
    from pyspark.ml.classification import MultilayerPerceptronClassifier
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator

    # Assemble the full topology: input, hidden..., output.
    topology = [num_columns - 1, *hidden_layers, max(2, num_classes)]
    model = MultilayerPerceptronClassifier(
        labelCol=labelCol, maxIter=500, layers=topology).fit(train)
    # Score the held-out set and measure accuracy.
    scored = model.transform(test).select("prediction", labelCol)
    evaluator = MulticlassClassificationEvaluator(
        labelCol=labelCol, metricName="accuracy")
    return evaluator.evaluate(scored)
def get_results(data):
    """For each dataset in `data`, train an MLP on an 80/20 split and return
    an RDD of accuracy strings (one per dataset).

    Each element of `data` must support randomSplit and toDF with
    (label, features) columns. Relies on a module-level `sc`.
    """
    results = []
    for i in data:
        splits = i.randomSplit([0.8, 0.2], 1234)
        training = splits[0]
        testing = splits[1]
        training = training.toDF(["label", "features"])
        testing = testing.toDF(["label", "features"])
        numFeatures = training.take(1)[0].features.size
        #First layers has to be the number of the features of the data
        layers = [numFeatures, 4, 5, 2]
        trainer = MultilayerPerceptronClassifier(maxIter=1000, layers=layers, blockSize=128, seed=1234)
        model = trainer.fit(training)
        result = model.transform(testing)
        predictionAndLabels = result.select("prediction", "label")
        evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
        # NOTE(review): the message says "precision" but the metric computed
        # is accuracy.
        answer = "Test set precision = " + str(evaluator.evaluate(predictionAndLabels)) + '\n'
        results.append(answer)
    return sc.parallelize(results)
def FeedforwardNeuralNet(input_size):
    """Train a feed-forward network on the module-level `train` DataFrame,
    save it, and print accuracy on the module-level `test` DataFrame.

    Parameters:
        input_size: width of the input layer (feature vector size).
    """
    # Architecture: input layer, hidden layers of 100 and 20, binary output.
    layers = [input_size, 100, 20, 2]
    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
    # train the model (reads the global `train` DataFrame)
    model = trainer.fit(train)
    model.write().overwrite().save("save/tencent2vec_nn")
    # compute accuracy on the test set (global `test` DataFrame)
    result = model.transform(test)
    # result.select("prediction", "label").show(400)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))
def run(data, headers_feature):
    """Train a large MLP classifier and print timing and test accuracy."""
    print('-*-*-*- Iniciando la red neuronal -*-*-*-')
    start_time = datetime.datetime.now()
    print('Tiempo inicial', start_time)
    # Build the training and test sets.
    train_data, test_data = prepare_dataset(data, headers_feature)
    train_data.show()
    # Network layers:
    # input of 5000 (features),
    # 5 hidden layers of 50 neurons each,
    # and output of 101 (classes).
    layers = [5000, 50, 50, 50, 50, 50, 101]
    # Configure the multilayer perceptron classifier.
    mlpc = MultilayerPerceptronClassifier(
        maxIter=100,
        layers=layers,
        blockSize=128,
        seed=1234
    )
    # Fit the classification model.
    mlpc_model = mlpc.fit(train_data)
    # print("Coeficientes: " + str(lr_model.coefficients))
    # print("Intercepto: " + str(lr_model.intercept))
    data_to_validate = mlpc_model.transform(test_data)
    # Measure prediction accuracy on the test data.
    prediction = data_to_validate.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
    print("Precision de la prueba = " + str(evaluator.evaluate(prediction)))
    end_time = datetime.datetime.now()
    print('Tiempo final', end_time)
    print('Tiempo transcurrido para red neuronal', end_time - start_time)
# Random-forest baseline: fit on the training split, predict on the test split.
rf = RandomForestClassifier(labelCol="label", featuresCol="features")
rf_model = rf.fit(train_df)
rf_predictions = rf_model.transform(test_df)
rf_predictions.take(1)

# COMMAND ----------

from pyspark.ml.classification import MultilayerPerceptronClassifier

# Optional MLP model, gated by the `enabled` feature flags.
if enabled[3]:
    # Input layer sized to the assembled feature columns; binary output.
    layers = [len(inputCols), 5, 4, 2]
    mp = MultilayerPerceptronClassifier(maxIter=100, layers=layers, labelCol="label", featuresCol="features")
    mp_model = mp.fit(train_df)
    mp_predictions = mp_model.transform(test_df)
    mp_predictions.take(1)

# COMMAND ----------

from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Binary evaluator (area under ROC by default) shared by the comparisons below.
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction")
accuracies = []
if enabled[0]:
    dt_accuracy = evaluator.evaluate(dt_predictions)
# Training/test partitions from the split produced earlier in the script.
train = splits[0]
test = splits[1]

# Specify layers for the neural network:
# input layer of size 6 (features), one hidden layer of size 10
# and output of size 2 (classes).
layers = [6, 10, 2]

# Create the trainer and set its parameters.
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128,
                                         seed=1234)
# Train the model.
model = trainer.fit(train)

# Compute accuracy on the test set.
result = model.transform(test)
predictionAndLabels = result.select('prediction', 'label')
evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
print('Test set accuracy = ' + str(evaluator.evaluate(predictionAndLabels)))

# Compute AUC. Fix: use the continuous 'rawPrediction' scores rather than the
# hard 0/1 'prediction' column (which yields a degenerate ROC curve), and
# reuse `result` instead of transforming the test set a second time.
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')
evaluation = evaluator.evaluate(result)
print('AUC:', evaluation)

# Stop the SparkContext.
sc.stop()
print(riskdata.stat.crosstab("bad","reason").show()) ################################################################# # Multilayer Perceptron Classifier ################################################################# # specify layers for the neural network: # input layer of size 10 (features), two intermediate of size 3 and 2 # and output of size 2 (classes) layers = [10, 3, 2, 2] # create the trainer and set its parameters MLPtrainer = MultilayerPerceptronClassifier(maxIter = 100, layers = layers, labelCol = "bad", featuresCol = "predictors", predictionCol = "prediction", blockSize = 1000, seed = 1234) # train the model MLP_model = MLPtrainer.fit(train) # compute precision on the test set MLP_result = MLP_model.transform(test) MLP_predictionAndLabels = MLP_result.select("prediction", "bad") MLP_evaluator = MulticlassClassificationEvaluator(metricName="precision") #print(MLP_model) #print(str(MLP_result.show())) # Print first 20 rows result to output file (plain text) """" ################################################################# # Decision Tree Classification ################################################################# # Train a DecisionTree model. dt_model_spec = DecisionTreeClassifier(labelCol="bad", featuresCol="predictors")
# Load the multiclass sample dataset stored in LIBSVM format.
data = sqlContext.read.format("libsvm")\
    .load("data/mllib/sample_multiclass_classification_data.txt")
# Split the data into train and test
data.show()
data.printSchema()
data.select('features').show()
splits = data.randomSplit([0.6, 0.4], 1234)
train = splits[0]
print (train.count())
train.show()
test = splits[1]
# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [4, 5, 4, 3]
# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
# train the model
model = trainer.fit(train)
# compute precision on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
# NOTE(review): metricName="precision" only exists on Spark 1.x; later
# versions raise -- confirm the targeted Spark version.
evaluator = MulticlassClassificationEvaluator(metricName="precision")
print("Precision:" + str(evaluator.evaluate(predictionAndLabels)))
# $example off$
sc.stop()
# NOTE(review): this line closes a trainer construction that begins in an
# earlier chunk not visible here.
seed=1234)

# Get the models for each expert using the parameters of the best model defined above
print("Generating and training experts...")
start = time.time()
for expert in range(num_of_experts):
    # Fresh 80/20 split for each expert's dataset.
    train_data_experts, test_data_experts = dataframes[expert].randomSplit(
        [0.8, 0.2])
    trainer = MultilayerPerceptronClassifier(maxIter=iters, layers=layers,
                                             stepSize=lr, blockSize=128,
                                             seed=1234)
    model = trainer.fit(train_data_experts)
    dict_of_models[expert] = model

# Dictionary to store the predictions of the full dataset for each trained expert
dict_of_predictions = dict()

# Iterate through the expert and predict the values of each dataset
print("Generating predictions...")
for expert in range(num_of_experts):
    dict_of_predictions[expert] = dict_of_models[expert].transform(test_data)

# Create a pandas dataframe whose columns are each predictions of each expert
evaluations = pd.concat([
    dict_of_predictions[x].toPandas().prediction
    for x in range(num_of_experts)
], axis=1)
# NOTE(review): these keyword arguments close a model construction begun in
# an earlier chunk not visible here.
labelCol="indexed", featuresCol="pcaFeatures")
lrModel = lr.fit(trainingData)

#Predict on the test data
lrPredictions = lrModel.transform(testData)
lrPredictions.select("prediction", "indexed", "label", "pcaFeatures").collect()
evaluator.evaluate(lrPredictions)

# COMPARE TO NEURAL NETWORK MULTILAYER PERCEPTRON
from pyspark.ml.classification import MultilayerPerceptronClassifier

layers = [3, 25, 25, 2]  # layers = [input_dim, internal layers, output_dim(number of classe) ]
nn = MultilayerPerceptronClassifier(maxIter=100, \
                                    layers=layers, \
                                    blockSize=128, seed=124, labelCol="indexed", \
                                    featuresCol="pcaFeatures")
nnModel = nn.fit(trainingData)

#Predict on the test data
nnPredictions = nnModel.transform(testData)
nnPredictions.select("prediction", "indexed", "label", "pcaFeatures").collect()
evaluator.evaluate(nnPredictions)

"""--------------------------------------------------
Modify the code above to:
- train a logistic regression with the original vars (5% significant p-value)
- from the selected vars above, train 2 logistic models with regParam = [0.01 and 0.5]
- train 2 random forest (number of trees = 10 and 100)
- compare results
"""

#Create the model
rmClassifer10 = RandomForestClassifier(labelCol="indexed", \
                                       featuresCol="pcaFeatures", numTrees=10)
def _train_model_spark(self, data):
    """Build and fit the configured model(s) on `data`.

    When self._train_method is a dict, two models are trained: one for the
    change amount (regression, keyed by CHANGE_AMOUNT) and one for the change
    direction (classification, keyed by CHANGE_DIRECTION). Otherwise a single
    model predicting TARGET_PRICE is trained. Returns self._model.

    Raises:
        ValueError: if an unsupported training method is configured.
    """
    df = self._prepare_data_spark(data)
    # Input layer width = all columns except the derived target columns.
    input_num = len(data.keys().difference({self.CHANGE_AMOUNT, self.CHANGE_DIRECTION,
                                            self.TARGET_PRICE, self.TODAY_PRICE}))
    if self.ann_hidden_nodes_num is None:
        # NOTE(review): on Python 3 `/` yields a float; confirm whether
        # integer division (`//`) was intended for a layer size.
        self.ann_hidden_nodes_num = input_num / 2 + 1
    ann_layers = [input_num,
                  # input_num / 3 * 2,
                  # input_num / 3,
                  self.ann_hidden_nodes_num,
                  2]
    self.logger.info('layer settings are {}'.format(ann_layers))
    self.logger.info('training method is {}'.format(self._train_method))
    self.logger.info('trees num is {}'.format(self.random_forest_tree_number))
    if isinstance(self._train_method, dict):
        # Two-model mode: separate amount and direction predictors.
        if self._model is not None and self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
            # Release the previous Keras worker before replacing the model.
            self._model[self.CHANGE_AMOUNT].stop_server()
        self._model = {self.CHANGE_AMOUNT: None,
                       self.CHANGE_DIRECTION: None}

        if self._train_method[self.CHANGE_AMOUNT] == self.LINEAR_REGRESSION:
            lr = LinearRegression(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                  maxIter=self.linear_regression_training_times,
                                  regParam=self.linear_regression_regularization_parameter,
                                  predictionCol='AmountPrediction')
            self._model[self.CHANGE_AMOUNT] = lr.fit(df)
        elif self._train_method[self.CHANGE_AMOUNT] == self.RANDOM_FOREST:
            rfr = RandomForestRegressor(featuresCol="features", labelCol=self.CHANGE_AMOUNT,
                                        numTrees=self.random_forest_tree_number,
                                        maxDepth=self.random_forest_tree_max_depth,
                                        predictionCol='AmountPrediction')
            self._model[self.CHANGE_AMOUNT] = rfr.fit(df)
        elif self._train_method[self.CHANGE_AMOUNT] == self.ARTIFICIAL_NEURAL_NETWORK:
            # Regression head: a single output unit.
            ann_layers[-1] = 1
            self._model[self.CHANGE_AMOUNT] = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                                      num_workers=self.spark_worker_numbers,
                                                                      epoch=self.ann_epoch_number,
                                                                      featuresCol="features",
                                                                      labelCol=self.CHANGE_AMOUNT,
                                                                      predictionCol='AmountPrediction'
                                                                      )
            self._model[self.CHANGE_AMOUNT].fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))

        if self._train_method[self.CHANGE_DIRECTION] == self.LOGISTIC_REGRESSION:
            lr = LogisticRegression(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                    maxIter=self.logistic_regression_training_times,
                                    regParam=self.linear_regression_regularization_parameter,
                                    predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = lr.fit(df)
        elif self._train_method[self.CHANGE_DIRECTION] == self.RANDOM_FOREST:
            rfc = RandomForestClassifier(featuresCol="features", labelCol=self.CHANGE_DIRECTION,
                                         numTrees=self.random_forest_tree_number,
                                         maxDepth=self.random_forest_tree_max_depth,
                                         predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = rfc.fit(df)
        elif self._train_method[self.CHANGE_DIRECTION] == self.ARTIFICIAL_NEURAL_NETWORK:
            # Classification head: two output units.
            ann_layers[-1] = 2
            mlpc = MultilayerPerceptronClassifier(featuresCol="features",
                                                  labelCol=self.CHANGE_DIRECTION,
                                                  layers=ann_layers,
                                                  predictionCol='DirPrediction')
            self._model[self.CHANGE_DIRECTION] = mlpc.fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))
    else:
        # Single-model mode: predict the target price directly.
        if self._train_method == self.LINEAR_REGRESSION:
            lr = LinearRegression(featuresCol="features", labelCol=self.TARGET_PRICE,
                                  predictionCol='prediction',
                                  regParam=self.linear_regression_regularization_parameter,
                                  maxIter=self.linear_regression_training_times)
            self._model = lr.fit(df)
        elif self._train_method == self.RANDOM_FOREST:
            rfr = RandomForestRegressor(featuresCol="features", labelCol=self.TARGET_PRICE,
                                        predictionCol='prediction',
                                        numTrees=self.random_forest_tree_number,
                                        maxDepth=self.random_forest_tree_max_depth)
            self._model = rfr.fit(df)
        elif self._train_method == self.ARTIFICIAL_NEURAL_NETWORK:
            # Regression head: a single output unit.
            ann_layers[-1] = 1
            if self._model is not None:
                # Release the previous Keras worker before replacing the model.
                self._model.stop_server()
            self.logger.warn('layers are {}'.format(ann_layers))
            self._model = KerasNeuralNetworkSpark(layers=ann_layers, spark=self._spark,
                                                  num_workers=self.spark_worker_numbers, epoch=100,
                                                  featuresCol="features", labelCol=self.TARGET_PRICE,
                                                  predictionCol='prediction'
                                                  )
            self._model.fit(df)
        else:
            self.logger.warn('Unsupported training method {}'.format(self._train_method))
            raise ValueError('Unsupported training method {}'.format(self._train_method))

    return self._model
######### Training and Test ######### print("\n======================================================= ") print("==================== NEURAL NETWORK =================== ") print("=======================================================\n") print("\n================== Training ===================\n") #training model MLP num_cols = rescaledData.select( 'features').collect()[0].features.size #vocabulary size layers = [num_cols, 100, 2] trainer_MLP = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234) model_MLP = trainer_MLP.fit(rescaledData) print("Done : Neural Network Training") print("\n========= Test on Brexit labeled data =========\n ") #MLP result_MLP = model_MLP.transform(rescaled_test_df_brexit) predictionAndLabels = result_MLP.select("prediction", "label") evaluator = MulticlassClassificationEvaluator(metricName="accuracy") accuracy_MLP = evaluator.evaluate(predictionAndLabels) print("Accuracy MLP = " + str(accuracy_MLP)) file.write("\n" + "== Results on labeled data (Brexit) ==" + "\n") file.write('-> ACCURACY MLP : ' + str(accuracy_MLP) + '\n')
# define parameters input_layer = 200 # number of features output_layer = 10 # output 0~9 hidden_1 = 150 hidden_2 = 150 layers = [input_layer, hidden_1, hidden_2, output_layer] MPC = MultilayerPerceptronClassifier(featuresCol='feature', labelCol='label', predictionCol='prediction', maxIter=400, layers=layers, blockSize=128, seed=123) model = MPC.fit(pca_train_result) result = model.transform(pca_test_result).select("label", "prediction") result_lp = result.selectExpr("label", "cast (prediction as int) prediction") final_result = result_lp.rdd count = final_result.count() # calculate the accuracy neutral_zero_value = 0 def seqOp(a, b): if b[0] == b[1]: return a else: