from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.mllib.evaluation import BinaryClassificationMetrics


def main(config):
    # Cookie-cutter sequence of the steps needed to run the pipeline,
    # following the general workflow outlined in Spark's MLlib docs:
    # https://spark.apache.org/docs/latest/ml-pipeline.html
    spark = spark_initiate()

    # some data / transformer
    raw_data = config['base']['train_df']
    structure_schema = model_structure()
    data = load_data(spark, raw_data, 'df', structure_schema)
    # data.show()
    df, cat_dict = transformer(data)

    # Index every string-typed feature column, then drop the originals
    datatype_dict = dict(df.dtypes)
    features = config['base']['featuresCol'].split(',')
    list_str = []  # list of string columns
    for feature in features:
        if datatype_dict[feature] == 'string':
            list_str.append(feature)
            df = StringIndexer(inputCol=feature, outputCol=feature + '_index') \
                .fit(df) \
                .transform(df)
    df = df.drop(*list_str)
    df.show()

    # Wrap the label name in a list so set() holds the column name,
    # not its individual characters
    features = list(set(df.columns) - set([config['base']['labelCol']]))
    assembler = VectorAssembler(inputCols=features, outputCol='features')
    df = assembler.transform(df)
    (trainingData, testData) = df.randomSplit([0.7, 0.3])

    # estimator
    model = estimators(config)
    fitted_model = model.fit(trainingData)
    testData = fitted_model.transform(testData)

    # BinaryClassificationMetrics expects (score, label) pairs, where the
    # score is the predicted probability of the positive class
    predictionAndLabels = testData.select('probability', 'Survived') \
        .rdd.map(lambda x: (float(x[0][1]), float(x[1])))
    metrics = BinaryClassificationMetrics(predictionAndLabels)

    # Area under precision-recall curve
    print("Area under PR = %s" % metrics.areaUnderPR)
    # Area under ROC curve
    print("Area under ROC = %s" % metrics.areaUnderROC)

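# The per-column indexing and assembling above can also be chained with
# pyspark.ml.Pipeline, as in the ml-pipeline docs linked in main(). A minimal
# sketch under assumptions: the column names 'Sex' and 'Embarked' and the
# LogisticRegression estimator are illustrative, not taken from the code above.
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer, VectorAssembler

indexers = [StringIndexer(inputCol=c, outputCol=c + '_index')
            for c in ['Sex', 'Embarked']]  # hypothetical string columns
assembler = VectorAssembler(inputCols=['Sex_index', 'Embarked_index'],
                            outputCol='features')
lr = LogisticRegression(labelCol='Survived', featuresCol='features')
pipeline = Pipeline(stages=indexers + [assembler, lr])
# fitted = pipeline.fit(trainingData)  # one fit() call runs every stage in order
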
import time

from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

sentenceData = spark.createDataFrame(tranform_data, ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

# Compute TF-IDF
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=3000)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("label", "features").show()

# Index the string labels, then split into train and test sets
forData = StringIndexer().setInputCol("label").setOutputCol("indexed") \
    .fit(rescaledData).transform(rescaledData)
(trainingData, testData) = forData.randomSplit([0.8, 0.2], seed=0)
print(trainingData.take(1))

# Train a multinomial Naive Bayes classifier and time the fit
nb = NaiveBayes(smoothing=1.0, modelType="multinomial", labelCol="indexed")
start_time = time.time()
modelClassifier = nb.fit(trainingData)
end_time = time.time()
print(end_time - start_time)

# Evaluate accuracy on the held-out set
predictionsClassifier = modelClassifier.transform(testData)
evaluator = MulticlassClassificationEvaluator() \
    .setLabelCol("indexed").setPredictionCol("prediction")
print("accuracy = ",
      evaluator.evaluate(predictionsClassifier, {evaluator.metricName: "accuracy"}))

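# A minimal sketch (not in the original snippet) of scoring unseen text with
# the fitted stages above. The example sentence is made up; any new document
# must pass through the same tokenizer -> hashingTF -> idfModel chain before
# the classifier can score it:
newDoc = spark.createDataFrame([("unknown", "some example sentence to score")],
                               ["label", "sentence"])
newFeatures = idfModel.transform(hashingTF.transform(tokenizer.transform(newDoc)))
modelClassifier.transform(newFeatures).select("sentence", "prediction").show()
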
inputCol="Survived", outputCol="indexedSurvived").fit(dataTitanic).transform(dataTitanic) # One Hot Encoder on indexed features dataTitanic = OneHotEncoder(inputCol="indexedSex", outputCol="sexVec").transform(dataTitanic) dataTitanic = OneHotEncoder(inputCol="indexedEmbarked", outputCol="embarkedVec").transform(dataTitanic) # Feature assembler as a vector dataTitanic = VectorAssembler( inputCols=["Pclass", "sexVec", "embarkedVec", "Age", "SibSp", "Fare"], outputCol="features").transform(dataTitanic) # Spliting in train and test set. Beware : It sorts the dataset (trainDF, testDF) = dataTitanic.randomSplit([0.7, 0.3], seed=42) rf = RandomForestClassifier(labelCol="indexedSurvived", featuresCol="features") time_start = time.time() model_rf = rf.fit(trainDF) time_end = time.time() time_rf = (time_end - time_start) print("RF takes %d s" % (time_rf)) predictions = model_rf.transform(testDF) # Select example rows to display. predictions.select( col("prediction"),
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml.classification import LogisticRegression

coviddeath = spark.sql("SELECT * FROM uscasestemp1_csv")

# COMMAND ----------

# Binarize the label: 1.0 when there are more than two cases, else 0.0
data = coviddeath.select("Year", "Date", "Day", "Temp", "Lat", "Long",
                         "Admin2", "Province",
                         ((col("Case") > 2).cast("Double").alias("label")))
data = StringIndexer(inputCol='Admin2',
                     outputCol='Admin2' + "_index").fit(data).transform(data)
data = StringIndexer(inputCol='Province',
                     outputCol='Province' + "_index").fit(data).transform(data)
data.show(5)

# COMMAND ----------

splits = data.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1].withColumnRenamed("label", "trueLabel")
train_rows = train.count()
test_rows = test.count()
print("Training Rows:", train_rows, " Testing Rows:", test_rows)

# COMMAND ----------

from pyspark.ml.classification import RandomForestClassifier

assembler = VectorAssembler(
    inputCols=["Day", "Temp", "Lat", "Province_index", "Admin2_index"],
    outputCol="normfeatures")
minMax = MinMaxScaler(inputCol=assembler.getOutputCol(), outputCol="nfeatures")
featVect = VectorAssembler(inputCols=["nfeatures"], outputCol="features")
# (trailing arguments assumed; the source is cut off mid-call)
lr = LogisticRegression(labelCol="label", featuresCol="features")

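# COMMAND ----------

# A minimal sketch (not in the source notebook) chaining the stages defined
# above into one Pipeline, fitting on the training split, and scoring the
# test split against its renamed 'trueLabel' column:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[assembler, minMax, featVect, lr])
model = pipeline.fit(train)
predicted = model.transform(test)
predicted.select("prediction", "trueLabel").show(5)
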
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

# Check first five records
flights.show(5)

# Create an assembler object
assembler = VectorAssembler(inputCols=[
    "mon", "dom", "dow", "carrier_idx", "org_idx", "km", "depart", "duration"
], outputCol='features')

# Consolidate predictor columns
flights_assembled = assembler.transform(flights)

# Keep only the resulting features column and the label
flights = flights_assembled.select('features', 'xdelay')

# Split into training and testing sets in an 80:20 ratio
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=17)

# Create a classifier object and fit it to the training data
tree = DecisionTreeClassifier(labelCol="xdelay")
tree_model = tree.fit(flights_train)

# Create predictions for the testing data and take a look at them
prediction = tree_model.transform(flights_test)
predictions = prediction.select('xdelay', 'prediction', 'probability')
print(predictions.toPandas().sample(12))

spark.stop()

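# A small follow-on sketch (not in the original; it would need to run before
# the spark.stop() call above) for inspecting the fitted tree. depth,
# numNodes, and toDebugString are standard properties of
# DecisionTreeClassificationModel:
print("depth = %d, nodes = %d" % (tree_model.depth, tree_model.numNodes))
print(tree_model.toDebugString)
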
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# (head of this statement reconstructed from the column list below; the
# source begins mid-call)
df = df.select('follow', 'pr_score_scr', 'pr_score_dst', 'jaccard',
               'user_sim', 'hash_sim')

# Cast every model column to float
train_col = [
    'follow', 'pr_score_scr', 'pr_score_dst', 'jaccard', 'user_sim', 'hash_sim'
]
for i in train_col:
    df = df.withColumn(i, df[i].cast('float'))

assembler = VectorAssembler(inputCols=[
    'pr_score_scr', 'pr_score_dst', 'jaccard', 'user_sim', 'hash_sim'
], outputCol="features")
df = assembler.transform(df)
df = StringIndexer(inputCol="follow",
                   outputCol="label").fit(df).transform(df).select(
                       'features', 'label')

print('Split train and test dataset...')
train_df, test_df = df.randomSplit([0.7, 0.3], seed=0)

print('Train RandomForest model...')
rf = RandomForestClassifier(numTrees=10, maxDepth=5, labelCol="label", seed=0)
model = rf.fit(train_df)

print('Evaluation...')
prediction = model.transform(test_df).select('label', 'probability', 'prediction')
evaluator = BinaryClassificationEvaluator(rawPredictionCol="probability")
print('Test Area Under ROC', evaluator.evaluate(prediction))

sc.stop()

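# A hedged sketch (not in the original; it would need to run before the
# sc.stop() call above) pairing the forest's featureImportances vector with
# the assembler's input columns to see which signal drives the model:
for name, score in zip(assembler.getInputCols(),
                       model.featureImportances.toArray()):
    print('%-14s %.4f' % (name, float(score)))
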
from pyspark.ml.feature import VectorAssembler

# (leading input columns assumed from the matching snippet above; the source
# begins mid-list)
assembler = VectorAssembler(inputCols=[
    'mon', 'dom', 'dow', 'carrier_idx', 'org_idx', 'km', 'depart', 'duration'
], outputCol='features')

# Consolidate predictor columns
flights_assembled = assembler.transform(flights)

# Check the resulting column
flights_assembled.select('features', 'delay').show(5, truncate=False)

# Split train/test
# Specify a seed for reproducibility
cars_train, cars_test = cars.randomSplit([0.8, 0.2], seed=23)
print([cars_train.count(), cars_test.count()])

# Build a Decision Tree model
from pyspark.ml.classification import DecisionTreeClassifier

# Create a Decision Tree classifier
tree = DecisionTreeClassifier()

# Learn from the training data
tree_model = tree.fit(cars_train)

# Evaluating
prediction = tree_model.transform(cars_test)

# Confusion matrix: a table describing the model's performance on testing data
prediction.groupBy('label', 'prediction').count().show()

# Accuracy = (TN + TP) / (TN + TP + FN + FP) - proportion of correct predictions

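# A short hedged sketch (not in the original notes) computing the accuracy
# described in the comment above directly from the prediction DataFrame:
correct = prediction.filter(prediction.label == prediction.prediction).count()
total = prediction.count()
print('Accuracy = %.3f' % (correct / total))
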
outputCol=i + "_Vec").transform(data) test = OneHotEncoder(dropLast=False, inputCol=i, outputCol=i + "_Vec").transform(test) assembler_input_col = [] for i in encoder_input_col: assembler_input_col.append(i + '_Vec') # # Assembel all features into 'features' data = VectorAssembler(inputCols=assembler_input_col, outputCol='features').transform(data) test = VectorAssembler(inputCols=assembler_input_col, outputCol='features').transform(test) # # Split the data into training and test sets (30% held out for testing) (data_train, data_test) = data.randomSplit([0.7, 0.3]) print('==TRAINING MODEL== \n') rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=10) model = rf.fit(data_train) predictions = model.transform(test) test_predictions = model.transform(test) # Select example rows to display. predictions.select(["prediction", "probability"]).show(5)