Example #1

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import IndexToString, StringIndexer, VectorAssembler

# Now we need to convert our feature columns (sensor names) into a vector for each row
va = VectorAssembler(inputCols=sensorNames, outputCol="features")\
  .transform(modelData)
# Index the labels (maintenance type)
li = StringIndexer(inputCol='maintenanceType', outputCol='label')\
  .fit(va)

# #### Model Training
# We split the data into 2 subsets - one to train the model, and one to test/evaluate it 
# We then build a pipeline of all steps involved in running the model on some data. The
# pipeline will have the following steps:
# 1. StringIndexer - convert the maintenance type strings to a numeric index
# 2. RandomForestClassifier - classify the data into one of the different indexes
# 3. IndexToString - convert the predicted maintenance-type indexes back to strings
(trainingData, testData) = va.randomSplit([0.7, 0.3])
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
i2s = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                    labels=li.labels)
pipeline = Pipeline(stages=[li, rf, i2s])
model = pipeline.fit(trainingData)
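
# A minimal sketch (the path below is illustrative, not from the original):
# persist the fitted pipeline so it can be reloaded later for scoring
model.write().overwrite().save("/tmp/maintenance_rf_model")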

# #### Model Evaluation
# The training data was used to fit the model (i.e. train it); now we can run the model
# on the held-out test subset and calculate its accuracy. The test error printed below
# is 1 - accuracy, i.e. the false prediction rate.
predictions = model.transform(testData)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
Example #2
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from sklearn.datasets import load_iris

import mlflow

spark = SparkSession.builder.getOrCreate()

# Load the iris data as a pandas DataFrame and convert it to a Spark DataFrame
df = load_iris(as_frame=True).frame.rename(columns={"target": "label"})
df = spark.createDataFrame(df)
# Assemble the four measurement columns into a single "features" vector
df = VectorAssembler(inputCols=df.columns[:-1],
                     outputCol="features").transform(df)
train, test = df.randomSplit([0.8, 0.2])

# Automatically log params, metrics, and the fitted model to MLflow
mlflow.pyspark.ml.autolog()
lor = LogisticRegression(maxIter=5)

with mlflow.start_run():
    lorModel = lor.fit(train)

pred = lorModel.transform(test)
pred.select(lorModel.getPredictionCol()).show(10)
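
# A minimal sketch (assumes MLflow >= 1.25, which provides mlflow.last_active_run):
# peek at what autologging recorded for the run above
run = mlflow.last_active_run()
print(run.info.run_id)
print(run.data.params)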

spark.stop()
Example #3
    spark = SparkSession.builder.getOrCreate()

    # Prepare data
    logs = spark.read.csv("hdfs://devenv/user/spark/web_logs_analysis/data/",
                          header=True,
                          inferSchema=True)

    # Preprocessing and feature engineering
    feature_prep = logs.select("product_category_id", "device_type", "connection_type", "gender")

    final_data = VectorAssembler(
        inputCols=["product_category_id", "device_type", "connection_type"],
        outputCol="features").transform(feature_prep)

    # Split data into train and test sets
    train_data, test_data = final_data.randomSplit([0.7, 0.3])

    # Model training
    classifier = RandomForestClassifier(featuresCol="features",
                                        labelCol="gender",
                                        numTrees=10,
                                        maxDepth=10)
    model = classifier.fit(train_data)

    # Transform the test data using the model to get predictions
    predicted_test_data = model.transform(test_data)

    # Evaluate the model performance
    evaluator_f1 = MulticlassClassificationEvaluator(
        labelCol='gender', predictionCol='prediction', metricName='f1')
    print("F1 score: {}", evaluator_f1.evaluate(predicted_test_data))
Example #4
    # Backward elimination: reassemble the features, refit the GLM, and drop
    # the feature with the largest p-value until all remaining ones are significant
    while True:
        feature_data = VectorAssembler(inputCols=features_names,
                                       outputCol='features').transform(data)
        model = glr.fit(feature_data)
        print(model.summary)
        # pValues has one entry per coefficient plus, when an intercept is
        # fitted, the intercept's p-value last -- so only consider the
        # coefficient entries
        pvals = model.summary.pValues[:len(features_names)]
        maxPValue = max(pvals)
        if maxPValue > 0.05:
            i = pvals.index(maxPValue)
            print(features_names[i])
            del features_names[i]
        else:
            break

    print("final features sets: %s" % features_names)

    # run logistic regression on the selected features with different threshold
    final_data = VectorAssembler(inputCols=features_names,
                                 outputCol='features').transform(data)
    train, test = final_data.randomSplit([0.8, 0.2])

    thresholds = [0.5, 0.4, 0.3, 0.2, 0.1]
    for t in thresholds:
        lr = LogisticRegression(threshold=t)
        model = lr.fit(train)
        testSummary = model.evaluate(test)
        print("With threshold %s, accuracy is %s" % (t, testSummary.accuracy))
        print("With threshold %s, area under ROC is %s" %
              (t, testSummary.areaUnderROC))
        print("With threshold %s, specificity is %s" %
              (t, testSummary.recallByLabel[0]))
        print("With threshold %s, sensitivity is %s" %
              (t, testSummary.recallByLabel[1]))
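
    # A minimal sketch (assumes Spark 3.0+, where LogisticRegressionModel
    # exposes setThreshold): the threshold only changes the decision rule,
    # not the fitted coefficients, so a single fit can serve the whole sweep
    lr_model = LogisticRegression().fit(train)
    for t in thresholds:
        lr_model.setThreshold(t)
        print("threshold %s -> accuracy %s" % (t, lr_model.evaluate(test).accuracy))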

    spark.stop()
Example #5
File: zadanie.py Project: kmbee/TSVD

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import when

# Modeling
SVM_df = nova_vzorka

atr_without_severity = [
    "Number_of_Vehicles", "Number_of_Casualties", "1st_Road_Class",
    "Speed_limit", "Junction_Detail", "Light_Conditions",
    "Urban_or_Rural_Area", "Did_Police_Officer_Attend_Scene_of_Accident",
    "Junction_Location", "Skidding_and_Overturning",
    "Hit_Object_off_Carriageway"
]

SVM_vector_data = VectorAssembler(inputCols=atr_without_severity,
                                  outputCol="features").transform(SVM_df)

training_data, test_data = SVM_vector_data.randomSplit([0.7, 0.3], seed=123)

# Modeling
print("---------------------------------------------------------------------")
print("-------------------------------Modeling-------------------------------")
print("---------------------------------------------------------------------")

# Decision tree classifier
print("-------------------------------------------------")
print("-----------------DECISION TREE-------------------")
print("-------------------------------------------------")

tree_classifier = DecisionTreeClassifier(featuresCol="features",
                                         labelCol="Accident_Severity",
                                         impurity="entropy",
                                         maxDepth=10)
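
# A minimal sketch of the next step (not part of this excerpt): fit the tree
# and measure accuracy on the held-out split
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

tree_model = tree_classifier.fit(training_data)
tree_predictions = tree_model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol="Accident_Severity",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy: %s" % evaluator.evaluate(tree_predictions))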
Example #6

    columnaInicial = int(os.environ.get('COLUMNA_INICIAL'))
    columnaFinal = int(os.environ.get('COLUMNA_FINAL'))
    array = data.columns

    start_time = time()  # start the execution timer

    data = VectorAssembler(inputCols=array[columnaInicial:columnaFinal], outputCol="features").transform(data)

    # Automatically identify categorical features, and index them.
    # We specify maxCategories so features with > 12 distinct values are treated as continuous.
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=12).fit(data)

    # Split the data into train and test
    splits = data.randomSplit([0.7, 0.3])
    train = splits[0]
    test = splits[1]

    # create the trainer and set its parameters
    nb = NaiveBayes(smoothing=1.0, labelCol=array[columnaInicial], modelType="multinomial", featuresCol="indexedFeatures")

    # Pipeline
    pipeline = Pipeline(stages=[featureIndexer, nb])

    # train the model
    model = pipeline.fit(train)

    elapsed_time = time() - start_time
    elapsed_time = format(elapsed_time, '.6f')
    salida = 'Execution time: ' + str(elapsed_time) + ' seconds'

Example #7
File: zadanie.py Project: kmbee/TSVD

# Collapse Accident_Severity into a binary label: 3 -> 0, 2 -> 1, 1 unchanged
SVM_df = cast_vzorky.withColumn(
    "Accident_Severity",
    when(cast_vzorky["Accident_Severity"] == 3,
         0).otherwise(cast_vzorky["Accident_Severity"]))
SVM_df = SVM_df.withColumn(
    "Accident_Severity",
    when(SVM_df["Accident_Severity"] == 2,
         1).otherwise(SVM_df["Accident_Severity"]))

SVM_df.describe("Accident_Severity").collect()
SVM_df = VectorAssembler(inputCols=[
    "Number_of_Vehicles", "Number_of_Casualties", "Junction_Detail",
    "Junction_Control", "Did_Police_Officer_Attend_Scene_of_Accident",
    "Junction_Location", "Skidding_and_Overturning",
    "Hit_Object_off_Carriageway"
],
                         outputCol="features").transform(SVM_df)
# Split the data into training and test sets
training_data, test_data = SVM_df.randomSplit([0.6, 0.4], seed=123)
SVM_df.count()

# Modeling
print("---------------------------------------------------------------------")
print("-------------------------------Modeling-------------------------------")
print("---------------------------------------------------------------------")

# Decision tree classifier
print("-------------------------------------------------")
print("-----------------DECISION TREE-------------------")
print("-------------------------------------------------")

tree_classifier = DecisionTreeClassifier(featuresCol="features",
                                         labelCol="Accident_Severity",
                                         impurity="entropy",