# (Imports added for completeness - sensorNames and modelData are defined
#  earlier in the notebook.)
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, IndexToString

# Now we need to convert our feature columns (sensor names) into a vector for each row
va = VectorAssembler(inputCols=sensorNames, outputCol="features") \
    .transform(modelData)

# Index the labels (maintenance type)
li = StringIndexer(inputCol='maintenanceType', outputCol='label') \
    .fit(va)

# #### Model Training
# We split the data into 2 subsets - one to train the model, and one to test/evaluate it.
# We then build a pipeline of all steps involved in running the model on some data. The
# pipeline will have the following steps:
#   1. StringIndexer - convert the maintenance type strings to a numeric index
#   2. RandomForestClassifier - classify the data into one of the different indexes
#   3. IndexToString - convert the maintenance type indexes back to strings
(trainingData, testData) = va.randomSplit([0.7, 0.3])
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
i2s = IndexToString(inputCol="prediction", outputCol="predictedLabel", labels=li.labels)
pipeline = Pipeline(stages=[li, rf, i2s])
model = pipeline.fit(trainingData)

# #### Model Evaluation
# The training data was used to fit the model (ie. train it); now we can test the model
# using the test subset and calculate the accuracy. The test error printed below is
# 1 - accuracy, ie. the misclassification rate.
predictions = model.transform(testData)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
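# A single accuracy number can hide per-class behaviour. As a quick sanity
# check, the sketch below (an addition, not part of the original notebook)
# cross-tabulates actual vs. predicted maintenance types using the
# `predictions` frame from above.
predictions.groupBy("maintenanceType", "predictedLabel") \
    .count() \
    .orderBy("maintenanceType", "predictedLabel") \
    .show()

# F1 is often more informative than raw accuracy when the classes are imbalanced.
f1 = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1").evaluate(predictions)
print("Test F1 = %g" % f1)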
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from sklearn.datasets import load_iris

import mlflow

spark = SparkSession.builder.getOrCreate()

# Load iris as a pandas frame, rename the target column, and convert to a Spark DataFrame
df = load_iris(as_frame=True).frame.rename(columns={"target": "label"})
df = spark.createDataFrame(df)

# Assemble every column except the label into a single feature vector
df = VectorAssembler(inputCols=df.columns[:-1], outputCol="features").transform(df)
train, test = df.randomSplit([0.8, 0.2])

# Automatically log params, metrics, and the fitted model for pyspark.ml estimators
mlflow.pyspark.ml.autolog()

lor = LogisticRegression(maxIter=5)
with mlflow.start_run():
    lorModel = lor.fit(train)

pred = lorModel.transform(test)
pred.select(lorModel.getPredictionCol()).show(10)

spark.stop()
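# What autolog captured can be read back through the MLflow client. A minimal
# sketch (an addition - run it before spark.stop()), assuming a recent MLflow
# version and the default local tracking store:
run = mlflow.last_active_run()
print("run_id:", run.info.run_id)
print("params:", run.data.params)
print("metrics:", run.data.metrics)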
# (The excerpt begins mid-statement; the builder chain is completed minimally here.)
spark = SparkSession.builder \
    .getOrCreate()

# Prepare data
logs = spark.read.csv("hdfs://devenv/user/spark/web_logs_analysis/data/",
                      header=True, inferSchema=True)

# Preprocessing and feature engineering
feature_prep = logs.select("product_category_id", "device_type",
                           "connection_type", "gender")

final_data = VectorAssembler(
    inputCols=["product_category_id", "device_type", "connection_type"],
    outputCol="features").transform(feature_prep)

# Split data into train and test sets
train_data, test_data = final_data.randomSplit([0.7, 0.3])

# Model training
classifier = RandomForestClassifier(featuresCol="features", labelCol="gender",
                                    numTrees=10, maxDepth=10)
model = classifier.fit(train_data)

# Transform the test data using the model to get predictions
predicted_test_data = model.transform(test_data)

# Evaluate the model performance
evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol='gender', predictionCol='prediction', metricName='f1')
print("F1 score: {}".format(evaluator_f1.evaluate(predicted_test_data)))
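# F1 alone does not show which class the model favours. A minimal follow-up
# sketch (an addition), reusing predicted_test_data from above:
evaluator_acc = MulticlassClassificationEvaluator(
    labelCol='gender', predictionCol='prediction', metricName='accuracy')
print("Accuracy: {}".format(evaluator_acc.evaluate(predicted_test_data)))

# Cross-tab of true labels vs. predictions
predicted_test_data.groupBy('gender', 'prediction').count().show()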
# Backward feature elimination: refit the GLM and drop the least significant
# feature until every remaining p-value is at most 0.05.
# (`glr`, `data`, and `features_names` are defined earlier in the script; the
#  loop header and the re-assembly step are reconstructed - the excerpt begins
#  mid-loop. Note that Spark's GLM summary appends the intercept's p-value
#  last when fitIntercept is true.)
while True:
    feature_data = VectorAssembler(inputCols=features_names,
                                   outputCol='features').transform(data)
    model = glr.fit(feature_data)
    print(model.summary)

    maxPValue = max(model.summary.pValues)
    if maxPValue > 0.05:
        i = model.summary.pValues.index(maxPValue)
        print(features_names[i])
        del features_names[i]
    else:
        break

print("final feature set: %s" % features_names)

# Run logistic regression on the selected features with different thresholds
final_data = VectorAssembler(inputCols=features_names,
                             outputCol='features').transform(data)
train, test = final_data.randomSplit([0.8, 0.2])

thresholds = [0.5, 0.4, 0.3, 0.2, 0.1]
for t in thresholds:
    lr = LogisticRegression(threshold=t)
    model = lr.fit(train)
    testSummary = model.evaluate(test)
    print("With threshold %s, Accuracy is %s" % (t, testSummary.accuracy))
    print("With threshold %s, Area under ROC is %s" % (t, testSummary.areaUnderROC))
    print("With threshold %s, Specificity is %s" % (t, testSummary.recallByLabel[0]))
    print("With threshold %s, Sensitivity is %s" % (t, testSummary.recallByLabel[1]))

spark.stop()
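# Instead of scanning a fixed threshold list, the training summary exposes the
# full F-measure-by-threshold curve, from which a threshold can be picked
# directly. A minimal sketch (an addition), assuming a fitted binary `model`
# like the ones in the loop above:
trainingSummary = model.summary
fMeasure = trainingSummary.fMeasureByThreshold
best = fMeasure.orderBy(fMeasure['F-Measure'].desc()).first()
print("Best threshold: %s (F-measure %s)" % (best['threshold'], best['F-Measure']))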
# Modeling
# (Imports added for completeness - nova_vzorka is defined earlier in the script.)
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler

SVM_df = nova_vzorka

atr_without_severity = [
    "Number_of_Vehicles", "Number_of_Casualties", "1st_Road_Class",
    "Speed_limit", "Junction_Detail", "Light_Conditions",
    "Urban_or_Rural_Area", "Did_Police_Officer_Attend_Scene_of_Accident",
    "Junction_Location", "Skidding_and_Overturning",
    "Hit_Object_off_Carriageway"
]

SVM_vector_data = VectorAssembler(inputCols=atr_without_severity,
                                  outputCol="features").transform(SVM_df)

training_data, test_data = SVM_vector_data.randomSplit([0.7, 0.3], seed=123)

# Modeling
print("---------------------------------------------------------------------")
print("------------------------------Modeling-------------------------------")
print("---------------------------------------------------------------------")

# Decision tree classifier
print("-------------------------------------------------")
print("------------------DECISION TREE-----------------")
print("-------------------------------------------------")
tree_classifier = DecisionTreeClassifier(featuresCol="features",
                                         labelCol="Accident_Severity",
                                         impurity="entropy",
                                         maxDepth=10)
# (The excerpt was cut off mid-argument-list; the call is closed minimally here.)
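# The excerpt ends before the tree is fitted. A minimal sketch of the usual
# next steps (an addition), reusing the splits defined above:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

tree_model = tree_classifier.fit(training_data)
predictions = tree_model.transform(test_data)

evaluator = MulticlassClassificationEvaluator(
    labelCol="Accident_Severity", predictionCol="prediction",
    metricName="accuracy")
print("Decision tree accuracy: %s" % evaluator.evaluate(predictions))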
# (Imports added for completeness - `data` is loaded earlier in the script.)
import os
from time import time

from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.feature import VectorAssembler, VectorIndexer

columnaInicial = int(os.environ.get('COLUMNA_INICIAL'))
columnaFinal = int(os.environ.get('COLUMNA_FINAL'))

array = data.columns

start_time = time()  # Start timing

data = VectorAssembler(inputCols=array[columnaInicial:columnaFinal],
                       outputCol="features").transform(data)

# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 12 distinct values are treated as continuous.
featureIndexer = \
    VectorIndexer(inputCol="features", outputCol="indexedFeatures",
                  maxCategories=12).fit(data)

# Split the data into train and test
splits = data.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1]

# Create the trainer and set its parameters.
# (Note: as in the original, the label column array[columnaInicial] also falls
#  inside the assembled feature range above, so the label leaks into the
#  features - this may be unintended.)
nb = NaiveBayes(smoothing=1.0, labelCol=array[columnaInicial],
                modelType="multinomial", featuresCol="indexedFeatures")

# Pipeline
pipeline = Pipeline(stages=[featureIndexer, nb])

# Train the model
model = pipeline.fit(train)

elapsed_time = time() - start_time
elapsed_time = format(elapsed_time, '.6f')
salida = 'Execution time: ' + str(elapsed_time) + ' seconds'
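# The timing string is built but never printed, and the model is never scored.
# A minimal sketch of both (an addition), reusing the variables above:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

print(salida)

predictions = model.transform(test)
evaluator = MulticlassClassificationEvaluator(
    labelCol=array[columnaInicial], predictionCol="prediction",
    metricName="accuracy")
print("Test accuracy: %s" % evaluator.evaluate(predictions))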
# (The excerpt begins mid-statement; the dangling tail below belongs to an
#  earlier withColumn on cast_vzorky - presumably assigning SVM_df - that
#  remaps some Accident_Severity values to 0. It is kept commented out rather
#  than guessed at.)
#     0).otherwise(cast_vzorky["Accident_Severity"]))

# Collapse severity 2 into class 1 to make the label binary
SVM_df = SVM_df.withColumn(
    "Accident_Severity",
    when(SVM_df["Accident_Severity"] == 2, 1).otherwise(SVM_df["Accident_Severity"]))

SVM_df.describe("Accident_Severity").collect()

SVM_df = VectorAssembler(inputCols=[
    "Number_of_Vehicles", "Number_of_Casualties", "Junction_Detail",
    "Junction_Control", "Did_Police_Officer_Attend_Scene_of_Accident",
    "Junction_Location", "Skidding_and_Overturning",
    "Hit_Object_off_Carriageway"
], outputCol="features").transform(SVM_df)

# Split the data into training and test sets
training_data, test_data = SVM_df.randomSplit([0.6, 0.4], seed=123)
SVM_df.count()

# Modeling
print("---------------------------------------------------------------------")
print("------------------------------Modeling-------------------------------")
print("---------------------------------------------------------------------")

# Decision tree classifier
print("-------------------------------------------------")
print("------------------DECISION TREE-----------------")
print("-------------------------------------------------")
tree_classifier = DecisionTreeClassifier(featuresCol="features",
                                         labelCol="Accident_Severity",
                                         impurity="entropy")
# (The excerpt was cut off mid-argument-list; the call is closed minimally here.)
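# After collapsing the label to binary, the class balance is worth checking -
# a skewed split makes plain accuracy misleading. A minimal sketch (an
# addition), reusing the variables above:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

SVM_df.groupBy("Accident_Severity").count().show()

# Fit and score the tree, mirroring the cut-off modeling step
tree_model = tree_classifier.fit(training_data)
predictions = tree_model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(
    labelCol="Accident_Severity", predictionCol="prediction", metricName="f1")
print("Decision tree F1: %s" % evaluator.evaluate(predictions))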