Example #1
modelURI

# COMMAND ----------

# MAGIC %md
# MAGIC ### Step 2: Load our model and apply predictions
# MAGIC We load the registered model with the MLflow Spark flavor and apply it to the `max_db.bank_marketing_train_set` table to generate predictions.

# COMMAND ----------

import mlflow.spark

# COMMAND ----------

## Note: loading the model can take several minutes because MLflow downloads the model artifacts and reconstructs the Spark pipeline.
spark_model = mlflow.spark.load_model(modelURI)

# COMMAND ----------

df = spark.sql("select * from max_db.bank_marketing_train_set")

# COMMAND ----------

resultDF = spark_model.transform(df.drop("label"))

# COMMAND ----------

display(resultDF.drop("features", "rawPrediction"))

# COMMAND ----------
Example #2
            model_stage = "Staging"
    # move the model to the appropriate stage.
    client.transition_model_version_stage(name=model_name,
                                          version=model_version,
                                          stage=model_stage)

    predicted_inference_DF = pipelineModel.transform(df_inference)
    # The idea now is to return the predicted delay for each model version and save the results in a table such as the one in notebook 06 (RandomForest with Time & Weather); a sketch of saving the output to a Delta table follows below.
    return predicted_inference_DF


# COMMAND ----------

inputs = spark.sql("""
SELECT * 
FROM bronze_air_traffic_cleaned_v3 
WHERE ORIGIN IN ("JFK","SEA","BOS","ATL","LAX","SFO","DEN","DFW","ORD","CVG","CLT","DCA","IAH")
AND DEST IN ("JFK","SEA","BOS","ATL","LAX","SFO","DEN","DFW","ORD","CVG","CLT","DCA","IAH")
""")

# COMMAND ----------

display(inputs)

# COMMAND ----------

predicted_inference_DF = train_model(inputs, 6, 50)
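
# COMMAND ----------

# A hedged sketch (not part of the original notebook) of persisting the scored
# DataFrame, as suggested in the comment inside train_model above; the Delta
# table name "flight_delay_predictions" is a hypothetical placeholder.
predicted_inference_DF.write.format("delta").mode("append") \
    .saveAsTable("flight_delay_predictions")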

# COMMAND ----------

predicted_inference_DF.select("SCHEDULED_DEP_TIME", "ARR_DELAY",
                              "prediction").display()
Example #3
    maxIter = int(sys.argv[2])
    regParam = float(sys.argv[3])
    elasticNetParam = float(sys.argv[4])

    log_param("modelType", modelType)
    log_param("maxIter", maxIter)
    log_param("regParam", regParam)
    log_param("elasticNetParam", elasticNetParam)

    spark = SparkSession \
        .builder \
        .appName("Python Spark MLFlow basic example") \
        .enableHiveSupport() \
        .getOrCreate()

    df_102 = spark.sql("SELECT * from default.nyc_trips_final_102").na.drop()
    df_102 = df_102.withColumnRenamed("fare_amt", "label")
    df_102 = df_102.withColumn("day_of_week_new",
                               df_102.day_of_week.cast("int"))

    paymentIndexer = StringIndexer(
        inputCol="payment_type",
        outputCol="payment_indexed").setHandleInvalid("skip")
    vendorIndexer = StringIndexer(
        inputCol="vendor_name",
        outputCol="vendor_indexed").setHandleInvalid("skip")

    assembler = VectorAssembler(inputCols=[
        "passenger_count", "trip_distance", "hour", "day_of_week_new",
        "start_cluster", "payment_indexed", "vendor_indexed"
    ],
Example #4
    dbutils.fs.ls("/mnt/%s" % mount_name)
except:
    print("bucket isn't mounted, mount the demo bucket under %s" % mount_name)
    dbutils.fs.mount("s3a://%s" % aws_bucket_name, "/mnt/%s" % mount_name)

# COMMAND ----------

current_user = dbutils.notebook.entry_point.getDbutils().notebook().getContext(
).tags().apply('user')
dbName = re.sub(r'\W+', '_', current_user)
path = "/Users/{}/demo".format(current_user)
dbutils.widgets.text("path", path, "path")
dbutils.widgets.text("dbName", dbName, "dbName")
print("using path {}".format(path))
spark.sql(
    """create database if not exists {} LOCATION '{}/global_demo/tables' """.
    format(dbName, path))
spark.sql("""USE {}""".format(dbName))

# COMMAND ----------

tables = [
    "turbine_bronze", "turbine_silver", "turbine_gold", "turbine_power",
    "turbine_schema_evolution"
]
reset_all = dbutils.widgets.get("reset_all_data") == "true" or any([
    not spark.catalog._jcatalog.tableExists(table)
    for table in ["turbine_power"]
])
if reset_all:
    print("resetting data")
Example #5
# COMMAND ----------

# MAGIC %sql
# MAGIC select * from tempFlightsWeatherB

# COMMAND ----------

dff2 = spark.table('tempFlightsWeatherB')

dff2 = dff2.withColumnRenamed('prcp', 'dest_prcp')
dff2 = dff2.drop('date')
dff2 = dff2.drop('iata')

# COMMAND ----------

display(dff2)

# COMMAND ----------

dff2.write.format("delta").save('/mnt/delta/clemens/airaugmented')
spark.sql(
    "create table clemens.flightdelays_augmented using delta location '/mnt/delta/clemens/airaugmented'"
)

# COMMAND ----------

# MAGIC %sql
# MAGIC select * from clemens.flightdelays_augmented

# COMMAND ----------
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

import mlflow
import mlflow.spark  # avoid "from mlflow import spark", which would shadow the SparkSession `spark`
import mlflow.mleap  # does this work with Spark 3.0?
#import mlflow.pyfunc
import mleap.pyspark

# COMMAND ----------

#df = spark.sql("SELECT trip_duration,start_station_id,birth_year,unknown_gender,male_gender,female_gender,Subscriber,Customer,real_distance,((real_distance / trip_duration)* 3.6) as vitesse, DATE(start_time) as date,HOUR(start_time) as hour FROM CitibikeNY NATURAL JOIN citybike_station_distance")

df = spark.sql(
    "SELECT trip_duration,start_station_id,birth_year,unknown_gender,male_gender,female_gender,Subscriber,Customer,distance_bwn_stations,(((distance_bwn_stations * 1000) / trip_duration)* 3.6) as vitesse, DATE(start_time) as date,HOUR(start_time) as hour FROM CitibikeNY2 NATURAL JOIN citybike_station_distance"
)

# COMMAND ----------

df = df.filter((df.vitesse > 13) & (df.vitesse < 32))

# COMMAND ----------

df = spark.sql("SELECT * FROM tab_nycitibike")

# COMMAND ----------

display(df)

# COMMAND ----------
Example #7
# the below experiment id needs subscription
mlflow_experiment_id = 0

# Including MLflow
import mlflow
import mlflow.spark

import os

print("MLflow Version: %s" % mlflow.__version__)

# COMMAND ----------

# Create df DataFrame which contains our simulated financial fraud detection dataset
df = spark.sql(
    "select step, type, amount, nameOrig, oldbalanceOrg, newbalanceOrig, nameDest, oldbalanceDest, newbalanceDest, isFraud from sim_fin_fraud_detection"
)

# COMMAND ----------

# Review the schema of your data
df.printSchema()

# COMMAND ----------

# MAGIC %md
# MAGIC ### Calculate Differences between Originating and Destination Balances
# MAGIC With the following PySpark DataFrame query, we will calculate the following columns:
# MAGIC
# MAGIC | New Column | Definition |
# MAGIC | ---------- | ---------- |
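
# COMMAND ----------

# A hedged sketch (the original query is cut off in this snippet) of adding
# balance-difference columns to df; the column names orgDiff and destDiff are
# assumptions, not taken from the original notebook.
from pyspark.sql.functions import col

df = (df
      .withColumn("orgDiff", col("newbalanceOrig") - col("oldbalanceOrg"))
      .withColumn("destDiff", col("newbalanceDest") - col("oldbalanceDest")))
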
# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 1: Upload and Read Sensor Dataset
# MAGIC
# MAGIC For the training dataset, you will need to upload some data to the Databricks File System (DBFS). Go to File > Upload Data and click "Browse" in the middle box to open your local file explorer. Navigate to the folder where you downloaded the artifacts for this workshop, open the `/Datasets` folder, and choose `sensordata.csv`. Once you see a green checkmark, press **Next** and then **Done** on the next screen.
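
# COMMAND ----------

# Optional sanity check (not part of the original walkthrough): list the
# default target folder of File > Upload Data to confirm that sensordata.csv
# arrived in DBFS.
username = spark.sql("SELECT current_user()").collect()[0][0]
display(dbutils.fs.ls(f"dbfs:/FileStore/shared_uploads/{username}/"))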

# COMMAND ----------

# MAGIC %md
# MAGIC Here we will be creating a database to store some of the tables that we will create during this workshop. The first table will be a Delta Lake table that will hold our uploaded sensor data.

# COMMAND ----------

MODEL_PROJECT_NAME = dbutils.widgets.get("model_project_name")
spark.sql(f"CREATE DATABASE IF NOT EXISTS {MODEL_PROJECT_NAME}")
username = spark.sql("SELECT current_user()").collect()[0][0]
sensorData = spark.read.csv(
    f"dbfs:/FileStore/shared_uploads/{username}/sensordata.csv",
    header=True,
    inferSchema=True)
sensorData.write.saveAsTable(f"{MODEL_PROJECT_NAME}.sensor",
                             format="delta",
                             mode="overwrite")

from pyspark.sql.functions import col

dataDf = spark.table(f"{MODEL_PROJECT_NAME}.sensor").where(col('Device') == 'Device001')

# COMMAND ----------

# MAGIC %md
# MAGIC With our sensor data table saved, we can create an MLflow experiment to house the metrics that we log during our training runs.
# MAGIC
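
# COMMAND ----------

# A minimal sketch (not shown in the original snippet) of creating and
# activating an MLflow experiment for the training runs; the experiment path
# is a hypothetical placeholder under the current user's workspace folder.
import mlflow

mlflow.set_experiment(f"/Users/{username}/sensor_training_experiment")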
Example #9
import matplotlib.pyplot as plt
import pyspark.sql.functions as sqlf
from pyspark.sql.functions import col, to_date

# COMMAND ----------

# Load Departure Dataframe
#silverArrDF_2 = spark.sql("""
#  SELECT *
#  FROM {}.silverarr_delta
#  WHERE FL_DATE
#  """.format(GROUP_DBNAME))

silverArrDF_2 = spark.sql("""
  SELECT *
  FROM {0}.silverdep_delta
  WHERE FL_DATE BETWEEN '{1}' AND '{2}'
  """.format(GROUP_DBNAME, training_start_date, training_end_date))

# Clean Arrival Data
silverArrDF_2 = silverArrDF_2.filter(col('ARR_DELAY').isNotNull())
silverArrDF_2 = silverArrDF_2.filter(col('DEP_DELAY').isNotNull())
silverArrDF_2 = silverArrDF_2.drop('FL_DATE')
silverArrDF_2 = drop_null_columns(silverArrDF_2)

#Transform into Pandas Dataframe
silverArrDF_2 = silverArrDF_2.toPandas()
display(silverArrDF_2.head())

# COMMAND ----------
Example #10
def getProdModelURI(modelRegistryName):
  models = client.search_model_versions("name='%s'" % modelRegistryName)
  source = [model for model in models if model.current_stage == "Production"][0].source
  return source

modelURI = getProdModelURI(modelRegistryName)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Step 2: Load our model and apply predictions
# MAGIC We can use the MLflow Spark flavor to load the LightGBM model. This may sound counterintuitive, but it works because MMLSpark's LightGBM is implemented as a Spark ML estimator under the hood.

# COMMAND ----------

import mlflow.lightgbm
import mlflow.spark
from mmlspark import LightGBMClassifier

# COMMAND ----------

LGB_model = mlflow.spark.load_model(modelURI)

# COMMAND ----------

df = spark.sql("select * from global_temp.globalTempTestData")
resultDF = LGB_model.transform(df)

# COMMAND ----------

display(resultDF)
Example #11
# MAGIC from training_rwd.patient_encounters
# MAGIC group by encounterclass
# MAGIC order by count desc

# COMMAND ----------

pt_encounters = spark.sql('select * from patient_encounters')

# COMMAND ----------

encounters.describe('cost').show()

# COMMAND ----------

# get the list of patients with the target condition (cases)
condition_patients = spark.sql("SELECT DISTINCT PATIENT FROM training_rwd.patient_encounters WHERE lower(REASONDESCRIPTION) LIKE '%" + dbutils.widgets.get('condition') + "%'")

# COMMAND ----------

# DBTITLE 1,List of patients with the condition to model
condition_patients = (encounters
  .where(lower(encounters.REASONDESCRIPTION).contains(dbutils.widgets.get('condition')))
  .select('PATIENT').dropDuplicates()
)

# COMMAND ----------

condition_patients.count()

# COMMAND ----------
Example #12
# MAGIC * `mlflow.log_metric` tells MLflow to track a particular variable as a *metric* of the run
# MAGIC * `mlflow.<flavor>.log_model` (optional) tells MLflow to log a model including its dependencies
# MAGIC * `mlflow.log_artifact` (optional) tells MLflow to log a file from local disk (e.g. an image, config file, or dataset); a short sketch of these calls follows below
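
# COMMAND ----------

# A minimal sketch (not from the original notebook) of the logging calls
# described above inside a single MLflow run; the metric value and file name
# are hypothetical placeholders.
import mlflow

with mlflow.start_run():
    mlflow.log_metric("rmse", 0.42)              # track a metric for this run
    with open("/tmp/run_notes.txt", "w") as f:
        f.write("example artifact")
    mlflow.log_artifact("/tmp/run_notes.txt")    # log a file from local disk
    # mlflow.spark.log_model(pipelineModel, "model")  # flavor-specific model logging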

# COMMAND ----------

import mlflow
import mlflow.mleap
import mlflow.spark

from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

# Pull our data into a Spark dataframe
df = spark.sql("select * from sensor_readings")

# Extract the columns that we want in our feature vector
featureColumns = df.drop("timestamp", "Sensor-Predict").columns


def trainLRModel(data, maxIter, regParam, elasticNetParam):
    def evalMetrics(summary):
        rmse = summary.rootMeanSquaredError
        r2 = summary.r2
        return (rmse, r2)

    with mlflow.start_run() as run:
        # Split our dataset into training and testing
        (train, test) = data.randomSplit([0.7, 0.3])
Example #13
import pandas as pd
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StringIndexer, IndexToString, VectorIndexer
from pyspark.ml.linalg import Vector
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = SparkSession.builder.appName('Project').getOrCreate()

dataset = spark.read.csv("reviews.tbl", inferSchema=True, header=True, sep='|')

dataset.createTempView("product_reviews")
q = "SELECT CASE pr_rating WHEN 1 THEN '0' WHEN 2 THEN '0' WHEN 3 THEN '1' WHEN 4 THEN '3' WHEN 5 THEN '3' END AS pr_r_rating, pr_content FROM product_reviews WHERE pmod(pr_review_id, 5) IN (1,2,3)"
df = spark.sql(q).toDF("label", "sentence")
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(df)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)

idf = IDF(inputCol="rawFeatures", outputCol="userFeatures")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

df = rescaledData.select(rescaledData["label"].cast("double"),
                         (rescaledData["userFeatures"]))

from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=["userFeatures"], outputCol="features")
Example #14
    dbutils.fs.mount("s3a://%s" % aws_bucket_name, "/mnt/%s" % mount_name)

# COMMAND ----------

# DBTITLE 1,Create User-Specific database
current_user = dbutils.notebook.entry_point.getDbutils().notebook().getContext(
).tags().apply('user')
print("Created variables:")
print("current_user: {}".format(current_user))
dbName = re.sub(r'\W+', '_', current_user)
path = "/Users/{}/demo".format(current_user)
dbutils.widgets.text("path", path, "path")
dbutils.widgets.text("dbName", dbName, "dbName")
print("path (default path): {}".format(path))
spark.sql(
    """create database if not exists {} LOCATION '{}/global_demo/tables' """.
    format(dbName, path))
spark.sql("""USE {}""".format(dbName))
print("dbName (using database): {}".format(dbName))

# COMMAND ----------

# DBTITLE 1,Reset tables in user's database
tables = [
    "turbine_bronze", "turbine_silver", "turbine_gold", "turbine_power",
    "turbine_schema_evolution"
]
reset_all = dbutils.widgets.get("reset_all_data") == "true" or any([
    not spark.catalog._jcatalog.tableExists(table)
    for table in ["turbine_power"]
])