def train_eval_register_model(ws, experiment_name, model_name, full_X, full_Y,
                              training_set_percentage):

    # start a training run by defining an experiment
    myexperiment = Experiment(ws, experiment_name)
    run = myexperiment.start_logging()

    train_X, test_X, train_Y, test_Y = train_test_split(
        full_X, full_Y, train_size=training_set_percentage, random_state=42)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(train_X)
    clf = linear_model.LogisticRegression(C=1)
    clf.fit(X_scaled, train_Y)

    scaled_inputs = scaler.transform(test_X)
    predictions = clf.predict(scaled_inputs)
    score = accuracy_score(test_Y, predictions)

    print("With %0.2f percent of data, model accuracy reached %0.4f." %
          (training_set_percentage, score))

    # Log the training metrics to Azure Machine Learning service run history
    run.log("Training_Set_Percentage", training_set_percentage)
    run.log("Accuracy", score)

    # Serialize the model to a pickle file in the outputs folder
    output_model_path = 'outputs/' + model_name + '.pkl'
    with open(output_model_path, 'wb') as model_file:
        pickle.dump(clf, model_file)
    print('Exported model to', output_model_path)

    # Serialize the scaler as a pickle file in the same folder as the model
    output_scaler_path = 'outputs/scaler.pkl'
    with open(output_scaler_path, 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)
    print('Exported scaler to', output_scaler_path)

    # Note: for model_path we supply the name of the outputs folder without a
    # trailing slash; this ensures both the model and the scaler are uploaded.
    registered_model = Model.register(model_path='outputs',
                                      model_name=model_name,
                                      workspace=ws)

    print(registered_model.name,
          registered_model.id,
          registered_model.version,
          sep='\t')

    run.complete()

    return (registered_model, clf, scaler, score, run)
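
# A minimal usage sketch; `ws`, `full_X`, and `full_Y` are assumed to be
# prepared earlier, as in the surrounding examples on this page:
registered_model, clf, scaler, score, run = train_eval_register_model(
    ws, "UsedCars_Experiment", "usedcarsmodel", full_X, full_Y,
    training_set_percentage=0.75)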
def train_eval_register_model(experiment_name, full_X, full_Y,
                              training_set_percentage):

    # start a training run by defining an experiment
    myexperiment = Experiment(ws, experiment_name)
    run = myexperiment.start_logging()

    train_X, test_X, train_Y, test_Y = train_test_split(
        full_X, full_Y, train_size=training_set_percentage, random_state=42)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(train_X)
    clf = linear_model.LogisticRegression(C=1)
    clf.fit(X_scaled, train_Y)

    scaled_inputs = scaler.transform(test_X)
    predictions = clf.predict(scaled_inputs)
    score = accuracy_score(test_Y, predictions)

    print("With %0.2f percent of data, model accuracy reached %0.4f." %
          (training_set_percentage, score))

    # Log the training metrics to Azure Machine Learning service run history
    run.log("Training_Set_Percentage", training_set_percentage)
    run.log("Accuracy", score)
    run.complete()

    model_name = experiment_name + '.pkl'
    output_model_path = './outputs/' + model_name
    with open(output_model_path, 'wb') as model_file:
        pickle.dump(clf, model_file)

    # Upload and register this version of the model with Azure Machine Learning service
    destination_path = 'outputs/' + model_name
    run.upload_file(destination_path, output_model_path)  # destination, source
    registered_model = run.register_model(model_name='usedcarsmodel',
                                          model_path=destination_path)

    print(registered_model.name,
          registered_model.id,
          registered_model.version,
          sep='\t')

    return (clf, score)
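
# A hedged sketch of fetching the registered model back from the workspace in
# a later session (assumes the same `ws` and the 'usedcarsmodel' name
# registered above):
from azureml.core.model import Model
import pickle

saved_model = Model(ws, name='usedcarsmodel')
local_path = saved_model.download(exist_ok=True)  # path to the downloaded .pkl
with open(local_path, 'rb') as model_file:
    clf_restored = pickle.load(model_file)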
def trigger_training_job():

    # Define Vars < Change the vars>.
    # In a production situation, don't put secrets in source code, but as secret variables,
    # see https://docs.microsoft.com/en-us/azure/devops/pipelines/process/variables?view=azure-devops&tabs=yaml%2Cbatch#secret-variables
    workspace = "<Name of your workspace>"
    subscription_id = "<Subscription id>"
    resource_grp = "<Name of your resource group where aml service is created>"

    domain = "westeurope.azuredatabricks.net"  # change location in case databricks instance is not in westeurope
    dbr_pat_token_raw = "<<your Databricks Personal Access Token>>"

    DBR_PAT_TOKEN = bytes(dbr_pat_token_raw, encoding='utf-8')  # encode as bytes for the Authorization header
    notebookRemote = "/3_IncomeNotebookDevops"
    experiment_name = "experiment_model_release"
    model_name_run = datetime.datetime.now().strftime(
        "%Y%m%d%H%M%S"
    ) + "_dbrmod.mml"  # in case you want to change the name, keep the .mml extension
    model_name = "databricksmodel.mml"  # in case you want to change the name, keep the .mml extension
    db_compute_name = "dbr-amls-comp"

    #
    # Step 1: Run notebook using Databricks Compute in AML SDK
    #
    cli_auth = AzureCliAuthentication()

    ws = Workspace(workspace_name=workspace,
                   subscription_id=subscription_id,
                   resource_group=resource_grp,
                   auth=cli_auth)
    ws.get_details()

    #
    # Step 2: Create job and attach it to cluster
    #
    # In this step, secrets are added as job parameters (spn_tenant, spn_clientid, spn_clientsecret).
    # Never do this in a production situation; use a secret scope backed by Key Vault instead,
    # see https://docs.azuredatabricks.net/user-guide/secrets/secret-scopes.html#azure-key-vault-backed-scopes
    response = requests.post(
        'https://%s/api/2.0/jobs/create' % domain,
        headers={'Authorization': b"Bearer " + DBR_PAT_TOKEN},
        json={
            "name": "Run AzureDevopsNotebook Job",
            "new_cluster": {
                "spark_version": "4.0.x-scala2.11",
                "node_type_id": "Standard_D3_v2",
                "spark_env_vars": {
                    'PYSPARK_PYTHON': '/databricks/python3/bin/python3',
                },
                "autoscale": {
                    "min_workers": 1,
                    "max_workers": 2
                }
            },
            "libraries": [{
                "pypi": {
                    "package": "azureml-sdk[databricks]"
                }
            }],
            "notebook_task": {
                "notebook_path":
                notebookRemote,
                "base_parameters": [{
                    "key": "subscription_id",
                    "value": subscription_id
                }, {
                    "key": "resource_group",
                    "value": resource_grp
                }, {
                    "key": "workspace_name",
                    "value": workspace
                }, {
                    "key": "model_name",
                    "value": model_name_run
                }]
            }
        })

    if response.status_code != 200:
        print("Error launching cluster: %s: %s" %
              (response.json()["error_code"], response.json()["message"]))
        exit(2)

    #
    # Step 3: Start job
    #
    databricks_job_id = response.json()['job_id']

    response = requests.post(
        'https://%s/api/2.0/jobs/run-now' % domain,
        headers={'Authorization': b"Bearer " + DBR_PAT_TOKEN},
        json={"job_id": +databricks_job_id})

    if response.status_code != 200:
        print("Error launching cluster: %s: %s" %
              (response.json()["error_code"], response.json()["message"]))
        exit(3)

    print(response.json()['run_id'])

    #
    # Step 4: Wait until job is finished
    #
    databricks_run_id = response.json()['run_id']
    scriptRun = 1
    count = 0
    while scriptRun == 1:
        response = requests.get(
            'https://%s/api/2.0/jobs/runs/get?run_id=%s' %
            (domain, databricks_run_id),
            headers={'Authorization': b"Bearer " + DBR_PAT_TOKEN},
        )

        state = response.json()['state']
        life_cycle_state = state['life_cycle_state']
        print(state)

        if life_cycle_state in ["TERMINATED", "SKIPPED", "INTERNAL_ERROR"]:
            result_state = state['result_state']
            if result_state == "SUCCESS":
                print("run ok")
                scriptRun = 0
            else:
                exit(4)
        elif count > 180:
            print("timed out after 90 minutes")
            exit(5)
        else:
            count += 1
            time.sleep(30)  # wait 30 seconds before next status update

    #
    # Step 5: Retrieve model from dbfs
    #
    mdl, ext = model_name_run.split(".")
    model_zip_run = mdl + ".zip"

    response = requests.get(
        'https://%s/api/2.0/dbfs/read?path=/%s' % (domain, model_zip_run),
        headers={'Authorization': b"Bearer " + DBR_PAT_TOKEN})
    if response.status_code != 200:
        print("Error copying dbfs results: %s: %s" %
              (response.json()["error_code"], response.json()["message"]))
        exit(1)

    model_output = base64.b64decode(response.json()['data'])

    # download model in deploy folder
    os.chdir("deploy")
    with open(model_zip_run, "wb") as outfile:
        outfile.write(model_output)
    print("Downloaded model {} to Project root directory".format(model_name))

    #
    # Step 6: Retrieve model metrics from dbfs
    #
    mdl, ext = model_name_run.split(".")
    model_metrics_json_run = mdl + "_metrics.json"

    response = requests.get(
        'https://%s/api/2.0/dbfs/read?path=/%s' %
        (domain, model_metrics_json_run),
        headers={'Authorization': b"Bearer " + DBR_PAT_TOKEN})
    if response.status_code != 200:
        print("Error copying dbfs results: %s: %s" %
              (response.json()["error_code"], response.json()["message"]))
        exit(2)

    model_metrics_output = json.loads(base64.b64decode(
        response.json()['data']))

    #
    # Step 7: Put model and metrics to Azure ML Service
    #

    # start a training run by defining an experiment
    myexperiment = Experiment(ws, experiment_name)
    run = myexperiment.start_logging()
    run.upload_file("outputs/" + model_zip_run, model_zip_run)

    #run.log("pipeline_run", pipeline_run.id)
    run.log("au_roc", model_metrics_output["Area_Under_ROC"])
    run.log("au_prc", model_metrics_output["Area_Under_PR"])
    run.log("truePostive", model_metrics_output["True_Positives"])
    run.log("falsePostive", model_metrics_output["False_Positives"])
    run.log("trueNegative", model_metrics_output["True_Negatives"])
    run.log("falseNegative", model_metrics_output["False_Negatives"])

    run.complete()
    run_id = run.id
    print("run id:", run_id)

    # unzip file to model_name_run
    shutil.unpack_archive(model_zip_run, model_name_run)

    model = Model.register(
        model_path=model_name_run,  # this points to the unpacked local model folder
        model_name=model_name,  # this is the name the model is registered as
        tags={
            "area": "spar",
            "type": "regression",
            "run_id": run_id
        },
        description="Medium blog test model",
        workspace=ws,
    )
    print("Model registered: {} \nModel Description: {} \nModel Version: {}".
          format(model.name, model.description, model.version))

    # Step 8. Finally, writing the registered model details to conf/model.json
    model_json = {}
    model_json["model_name"] = model.name
    model_json["model_version"] = model.version
    model_json["run_id"] = run_id
    model_json["model_name_run"] = model_name_run
    with open("../conf/model.json", "w") as outfile:
        json.dump(model_json, outfile)
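
    # A hedged sketch of how a downstream release step could read those
    # details back (paths are relative to the deploy folder, because of the
    # os.chdir("deploy") above):
    with open("../conf/model.json") as infile:
        model_info = json.load(infile)
    print("Registered {} v{} from run {}".format(model_info["model_name"],
                                                 model_info["model_version"],
                                                 model_info["run_id"]))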
# Provision (or fetch) the AML workspace; `workspace_name` and
# `workspace_region` are assumed to be defined earlier in the original script:
ws = Workspace.create(name=workspace_name,
                      subscription_id=subscription_id,
                      resource_group=resource_group,
                      location=workspace_region,
                      exist_ok=True)

print("Workspace Provisioning complete.")

#%%
# Step 10 - Create an experiment and log metrics for multiple training runs
###########################################################################
from azureml.core.run import Run
from azureml.core.experiment import Experiment

# start a training run by defining an experiment
myexperiment = Experiment(ws, "UsedCars_Experiment")
root_run = myexperiment.start_logging()

training_set_percentage = 0.25
run = root_run.child_run("Training_Set_Percentage-%0.5F" %
                         training_set_percentage)
model, score = train_eval_model(full_X, full_Y, training_set_percentage)
print("With %0.2f percent of data, model accuracy reached %0.4f." %
      (training_set_percentage, score))
run.log("Training_Set_Percentage", training_set_percentage)
run.log("Accuracy", score)
run.complete()

training_set_percentage = 0.5
run = root_run.child_run("Training_Set_Percentage-%0.5F" %
                         training_set_percentage)
model, score = train_eval_model(full_X, full_Y, training_set_percentage)
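print("With %0.2f%% of the data, model accuracy reached %0.4f." %
      (training_set_percentage * 100, score))
run.log("Training_Set_Percentage", training_set_percentage)
run.log("Accuracy", score)
run.complete()

# Close out the parent run once the child runs are complete (a hedged
# completion mirroring the 0.25 run above)
root_run.complete()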
Example #5
import os
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline, PipelineModel
from azureml.core.run import Run
from azureml.core.experiment import Experiment

# define some variables
model_name = "smsspamclassif_runs.mml"
model_dbfs = os.path.join("/dbfs", model_name)

#these are the different regularization parameter values we are going to test
regs = [0.0001, 0.001, 0.01, 0.1]

myexperiment = Experiment(ws, "SMS_Spam_Classifier")
main_run = myexperiment.start_logging()

for reg in regs:
    print("Regularization rate: {}".format(reg))
    with main_run.child_run("reg-" + str(reg)) as run:
        lr = LogisticRegression(featuresCol="features",
                                labelCol='label',
                                regParam=reg)
        # the other pipeline stages (stringIndexer, tokenizer, stopwordsRemover,
        # hashingTF, idf) are assumed to be defined earlier in the original notebook
        pipe = Pipeline(stages=[
            stringIndexer, tokenizer, stopwordsRemover, hashingTF, idf, lr
        ])
        model_p = pipe.fit(training_data)

        # make prediction on test_data
        pred = model_p.transform(test_data)
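
        # Evaluate AUC on the test predictions and log it to the child run
        # (a hedged completion mirroring the metrics pattern used elsewhere
        # on this page; 'rawPrediction' is the evaluator's default column)
        evaluator = BinaryClassificationEvaluator(labelCol='label')
        auc = evaluator.evaluate(pred)
        run.log("AUC", auc)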
# CLI Authentication (cli_auth is an AzureCliAuthentication instance defined earlier)
ws = Workspace(workspace_name=workspace_name,
               subscription_id=subscription_id,
               resource_group=resource_group,
               auth=cli_auth)

ws.write_config()
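
# In a later session, the saved config lets you reload the same workspace
# without repeating the IDs (a standard SDK pattern):
# ws = Workspace.from_config()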
############# Experiment gbr-turbofan ######################
experiment = Experiment(ws, 'gbr-turbofan')

train = pd.read_csv("data/turbofan.csv")

X = train.drop('rul', axis=1)
y = pd.Series(train.rul)

run = experiment.start_logging()
run.tag("python version", sys.version[0:6])

# Log the model hyperparameters to the run
# (max_depth and n_estimators are assumed to be set earlier in the original script)
run.log('max_depth', max_depth)
run.log('n_estimators', n_estimators)

X_train, X_test, y_train, y_test = train_test_split(X, y)

regression_model = GradientBoostingRegressor(max_depth=max_depth,
                                             n_estimators=n_estimators,
                                             learning_rate=.5)

regression_model.fit(X_train, y_train)

y_pred = regression_model.predict(X_test)
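
# A hedged completion sketch: score the model, log the metric, and close
# the run (mirrors the run-logging pattern used elsewhere on this page)
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, y_pred)
run.log('mse', mse)
run.complete()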
Example #7
def trigger_training_job():

    # get the parameter values
    workspace = sys.argv[1]
    subscription_id = sys.argv[2]
    resource_grp = sys.argv[3]

    domain = sys.argv[4]
    DBR_PAT_TOKEN = bytes(sys.argv[5], encoding='utf-8')  # encode as bytes for the Authorization header

    stor2_name = sys.argv[6]
    stor2_container = sys.argv[7]
    secret_scope = sys.argv[8]

    train_dataset = "p_train.csv"
    test_dataset = "p_test.csv"

    notebook_remote_path = "/lgb_eq_sec"
    experiment_name = "experiment_model_release"
    model_name_run = datetime.datetime.now().strftime(
        "%Y%m%d%H%M%S"
    ) + "_dbrmod.mml"  # in case you want to change the name, keep the .mml extension
    model_name = "databricksmodel.mml"
    #
    # Step 1: Create job and attach it to cluster
    #
    response = requests.post(
        'https://%s/api/2.0/jobs/create' % domain,
        headers={'Authorization': b"Bearer " + DBR_PAT_TOKEN},
        json={
            "name": "Run AzureDevopsNotebook Job",
            "existing_cluster_id": "0626-030203-fie285",
            "notebook_task": {
                "notebook_path":
                notebook_remote_path,
                "base_parameters": [{
                    "key": "model_name",
                    "value": model_name_run
                }, {
                    "key": "stor2_name",
                    "value": stor2_name
                }, {
                    "key": "stor2_container",
                    "value": stor2_container
                }, {
                    "key": "stor2_train_file",
                    "value": train_dataset
                }, {
                    "key": "stor2_test_file",
                    "value": test_dataset
                }, {
                    "key": "secret_scope",
                    "value": secret_scope
                }]
            }
        })

    if response.status_code != 200:
        print("Error launching cluster: %s: %s" %
              (response.json()["error_code"], response.json()["message"]))
        exit(2)

    #
    # Step 2: Start job
    #
    databricks_job_id = response.json()['job_id']

    response = requests.post(
        'https://%s/api/2.0/jobs/run-now' % domain,
        headers={'Authorization': b"Bearer " + DBR_PAT_TOKEN},
        json={"job_id": +databricks_job_id})

    if response.status_code != 200:
        print("Error launching cluster: %s: %s" %
              (response.json()["error_code"], response.json()["message"]))
        exit(3)

    print(response.json()['run_id'])

    #
    # Step 3: Wait until job is finished
    #
    databricks_run_id = response.json()['run_id']
    script_run_state = 1
    count = 0
    while script_run_state == 1:
        response = requests.get(
            'https://%s/api/2.0/jobs/runs/get?run_id=%s' %
            (domain, databricks_run_id),
            headers={'Authorization': b"Bearer " + DBR_PAT_TOKEN},
        )

        state = response.json()['state']
        life_cycle_state = state['life_cycle_state']
        print(state)

        if life_cycle_state in ["TERMINATED", "SKIPPED", "INTERNAL_ERROR"]:
            result_state = state['result_state']
            if result_state == "SUCCESS":
                print("run successful")
                script_run_state = 0
            else:
                exit(4)
        elif count > 100:
            print("timed out after about 50 minutes")
            exit(5)
        else:
            count += 1
            time.sleep(30)  # wait 30 seconds before next status update

    #
    # Step 4: Retrieve model from dbfs
    #
    mdl, ext = model_name_run.split(".")
    model_zip_run = mdl + ".zip"

    response = requests.get(
        'https://%s/api/2.0/dbfs/read?path=/%s' % (domain, model_zip_run),
        headers={'Authorization': b"Bearer " + DBR_PAT_TOKEN})
    if response.status_code != 200:
        print("Error copying dbfs results: %s: %s" %
              (response.json()["error_code"], response.json()["message"]))
        exit(6)

    model_output = base64.b64decode(response.json()['data'])

    # download model in deploy folder
    os.chdir("deploy")
    with open(model_zip_run, "wb") as outfile:
        outfile.write(model_output)
    print("Downloaded model {} to Project root directory".format(model_name))

    #
    # Step 5: Put model to Azure ML Service
    #
    cli_auth = AzureCliAuthentication()

    ws = Workspace(workspace_name=workspace,
                   subscription_id=subscription_id,
                   resource_group=resource_grp,
                   auth=cli_auth)
    ws.get_details()

    # start a training run by defining an experiment
    myexperiment = Experiment(ws, experiment_name)
    run = myexperiment.start_logging()
    run.upload_file("outputs/" + model_zip_run, model_zip_run)
    run.complete()
    run_id = run.id
    print("run id:", run_id)

    # unzip file to model_name_run
    shutil.unpack_archive(model_zip_run, model_name_run)

    model = Model.register(
        model_path=model_name_run,  # this points to the unpacked local model folder
        model_name=model_name,  # the name the model is registered as
        tags={
            "area": "spar",
            "type": "regression",
            "run_id": run_id
        },
        description=
        "LightGBM model from Kaggle 1st place, for Earthquake prediction",
        workspace=ws,
    )
    print("Model registered: {} \nModel Description: {} \nModel Version: {}".
          format(model.name, model.description, model.version))

    # Step 6: Finally, writing the registered model details to conf/model.json
    model_json = {}
    model_json["model_name"] = model.name
    model_json["model_version"] = model.version
    model_json["run_id"] = run_id
    model_json["model_name_run"] = model_name_run
    with open("../conf/model.json", "w") as outfile:
        json.dump(model_json, outfile)
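

# Invocation sketch (assumed entry point: an Azure DevOps task passes the
# eight positional arguments read at the top of trigger_training_job):
if __name__ == "__main__":
    trigger_training_job()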