Example #1
def RunAutoML():   
    automl_settings = {
        "name": "AutoML_Demo_Experiment",
        "iteration_timeout_minutes": 15,
        "iterations": 3,
        "n_cross_validations": 5,
        "primary_metric": 'AUC_weighted',  # classification metric, matching the task below
        "preprocess": True,
        "max_concurrent_iterations": 8,
        "verbosity": logging.INFO
    }
    subscription_id = request.json['subscription_id']
    print(userData)
    print(userData[subscription_id])
    #return "ok"
    try:
        automl_config = AutoMLConfig(task="classification",
                                     X=userData[subscription_id][1],
                                     y=userData[subscription_id][2],
                                     debug_log='automl_errors.log',
                                     **automl_settings)
        experiment = Experiment(userData[subscription_id][0], 'automl_remote')
        run = experiment.submit(automl_config, show_output=True)
        best_run, fitted_model = run.get_output()

        return 'ok'
    except:
        return 'error'  
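The handler reads request.json and a module-level userData cache, which suggests it is meant to be exposed as a Flask view; a minimal wiring sketch under that assumption (the route path and app name are illustrative):

from flask import Flask

app = Flask(__name__)
# Expose the handler as a POST endpoint so a client can trigger the AutoML run.
app.add_url_rule('/run_automl', view_func=RunAutoML, methods=['POST'])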
Example #2
def auto_train_model(ws, experiment_name, model_name, full_X, full_Y,training_set_percentage, training_target_accuracy):

    # start a training run by defining an experiment
    experiment = Experiment(ws, experiment_name)
    
    train_X, test_X, train_Y, test_Y = train_test_split(full_X, full_Y, train_size=training_set_percentage, random_state=42)

    train_Y_array = train_Y.values.flatten()

    # Configure the automated ML job
    # The model training is configured to run on the local machine
    # The values for all settings are documented at https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train
    # Notice we no longer have to scale the input values, as Auto ML will try various data scaling approaches automatically
    Automl_config = AutoMLConfig(task = 'classification',
                                 primary_metric = 'accuracy',
                                 max_time_sec = 12000,
                                 iterations = 20,
                                 n_cross_validations = 3,
                                 exit_score = training_target_accuracy,
                                 blacklist_algos = ['kNN','LinearSVM'],
                                 X = train_X,
                                 y = train_Y_array,
                                 path='./04-automl/outputs')

    # Execute the job
    run = experiment.submit(Automl_config, show_output=True)

    # Get the run with the highest accuracy value.
    best_run, best_model = run.get_output()

    return (best_model, run, best_run)
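A minimal call sketch, assuming a Workspace ws and DataFrames full_X / full_Y are already loaded (the experiment and model names are illustrative):

best_model, run, best_run = auto_train_model(ws,
                                             experiment_name='usedcars_automl',
                                             model_name='usedcars-model',
                                             full_X=full_X,
                                             full_Y=full_Y,
                                             training_set_percentage=0.75,
                                             training_target_accuracy=0.95)

# Register the winning iteration's serialized model from the usual AutoML outputs path.
best_run.register_model(model_name='usedcars-model', model_path='outputs/model.pkl')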
Example #3
    def get_latest_model(self, experiment_name):
        """
        This function finds the experiment associated with the Data Labelling
        Project and finds the best model and downloads the train artifacts. Note,
        at the time of writing no SDK support is available for data labelling projects
        :param experiment_name:
        :return:
        """
        success = False

        ws = self.get_workspace()

        logging.info(f"Connected to Workspace {ws.name}")
        experiment = Experiment(workspace=ws, name=experiment_name)
        list_runs = experiment.get_runs()
        for run in list_runs:
            logging.info(f"Getting last run {run.id}")
            tags = run.get_tags()
            if tags.get('model_explain_run') == 'best_run':
                # Get the latest run
                logging.info(f"Getting last best child run {tags['automl_best_child_run_id']}")
                child_run = run.get(ws, tags['automl_best_child_run_id'])
                metrics = run.get_metrics()
                logging.info(f"Accuracy (class) {metrics['accuracy']}")
                file_names = child_run.get_file_names()
                if "train_artifacts/model.pt" in file_names:
                    logging.info('Found a trained model.pt')
                    child_run.download_files(prefix='train_artifacts',
                                             output_directory='/usr/src/api/models')
                    success = True
                    break

        return success
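A call sketch, assuming the method lives on a small helper class, here hypothetically named ModelManager, that also provides get_workspace():

manager = ModelManager()
if manager.get_latest_model('labelling-project-experiment'):
    logging.info('model.pt downloaded to /usr/src/api/models')
else:
    logging.warning('no best run with a trained model.pt was found')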
Example #4
def RunAutoML():
        subscription_id = request.json['subscription_id']
        resource_group = request.json['resource_group']
        workspace_name = request.json['workspace_name']
        file_name = request.json['file_name']
        #location = request.json['location']
    
        ws = Workspace(subscription_id=subscription_id,
                                  resource_group=resource_group,
                                  workspace_name=workspace_name)
                                            
        print("Found workspace {} at location {}".format(ws.name, ws.location))
        print('Found existing Workspace.')
            
        dataset_name = file_name

        # Get a dataset by name
        df = Dataset.get_by_name(workspace=ws, name=dataset_name)
        stock_dataset_df = df.to_pandas_dataframe()
        print('file successfully received.')
        stock_dataset_df.head()
        #stock_dataset_json = stock_dataset_df.to_json(orient='split')
        #print(stock_dataset_json)
        y_df = stock_dataset_df['ActionTaken'].values
        x_df = stock_dataset_df.drop(['ActionTaken'], axis=1)
        
        ExperimentName = request.json['ExperimentName']       
        tasks = request.json['tasks']
        iterations = request.json['iterations']
        iteration_timeout_minutes = request.json['iteration_timeout_minutes']
        primary_metric = request.json['primary_metric']
        
        #n_cross_validations = request.json['n_cross_validations']
        
        try:
            automl_config = AutoMLConfig(
                task=tasks,
                X=x_df,
                y=y_df,
                iterations=iterations,
                iteration_timeout_minutes=iteration_timeout_minutes,
                primary_metric=primary_metric,
                #n_cross_validations=n_cross_validations,
                preprocess=True,
                )
            experiment = Experiment(ws, ExperimentName)
            run = experiment.submit(config=automl_config, show_output=True)
    
            best_model,fitted_model = run.get_output()

            return 'ok'
        except:

            return 'error'
Example #5
def main(train_path, pred_path, n_pred, dt, target, time_limit_min):
    df_train = pd.read_csv(train_path)
    df_train[dt] = pd.to_datetime(df_train[dt])

    time_series_settings = {
        "time_column_name": dt,
        "max_horizon": n_pred,
        "target_lags": "auto",
        "target_rolling_window_size": "auto"
    }
    automl_config = AutoMLConfig(task="forecasting",
                                 training_data=df_train,
                                 label_column_name=target,
                                 n_cross_validations=5,
                                 max_cores_per_iteration=-1,
                                 path=os.environ["SCRATCH"],
                                 experiment_timeout_minutes=time_limit_min,
                                 ensemble_download_models_timeout_sec=3600,
                                 **time_series_settings)
    ws = Workspace.from_config()
    experiment = Experiment(ws, "experiment")
    best_run, fitted_model = experiment.submit(automl_config,
                                               show_output=True).get_output()

    print("Best pipeline:")
    try:
        ensemble = vars(fitted_model.steps[1][1])["_wrappedEnsemble"]
        print(ensemble.__class__)
        steps = ensemble.estimators_
    except:
        steps = fitted_model.steps
    best_pipeline = ""
    for i, step in enumerate(steps):
        best_pipeline += f"{i}. {str(step)}\n"
    print(best_pipeline)

    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_colwidth', None)
    print(fitted_model.named_steps["timeseriestransformer"].
          get_engineered_feature_names())
    featurization_summary = fitted_model.named_steps[
        "timeseriestransformer"].get_featurization_summary()
    print(pd.DataFrame.from_records(featurization_summary))

    x_pred = pd.date_range(df_train[dt].iloc[-1],
                           periods=n_pred + 1,
                           freq=pd.infer_freq(df_train[dt]))[1:]
    y_pred = fitted_model.forecast(forecast_destination=x_pred[-1])[0]
    #     y_pred = fitted_model.forecast(pd.DataFrame({dt: x_pred}))[0]

    df_pred = pd.DataFrame({dt: x_pred, target: y_pred})
    df_pred.to_csv(pred_path, index=False)
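A direct call sketch, assuming the SCRATCH environment variable and an Azure ML config.json are in place; the file and column names below are illustrative:

if __name__ == "__main__":
    main(train_path="sales_train.csv",    # historical series with a datetime column
         pred_path="sales_forecast.csv",  # where the forecast is written
         n_pred=14,                       # forecast horizon in periods
         dt="date",                       # name of the datetime column
         target="sales",                  # name of the target column
         time_limit_min=60)               # experiment_timeout_minutes for AutoML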
Example #6
def train_model(data_file, random_seed):
    """Train the automl model."""
    target = "utilization"
    df = pd.read_parquet(data_file)

    x = df.loc[:, [c for c in df if c != target]].values
    y = df[target].values
    project_folder = "./automl"

    automl_config = AutoMLConfig(
        task="regression",
        iteration_timeout_minutes=5,
        iterations=10,
        primary_metric="spearman_correlation",
        n_cross_validations=5,
        debug_log="automl.log",
        verbosity=logging.INFO,
        X=x,
        y=y,
        path=project_folder,
    )

    load_dotenv(find_dotenv())
    ws = Workspace(
        workspace_name=getenv("AML_WORKSPACE_NAME"),
        subscription_id=getenv("AML_SUBSCRIPTION_ID"),
        resource_group=getenv("AML_RESOURCE_GROUP"),
    )
    experiment = Experiment(ws, getenv("AML_EXPERIMENT_NAME"))

    local_run = experiment.submit(automl_config, show_output=True)

    sub_runs = list(local_run.get_children())

    best_run = None
    best_score = 0

    for sub_run in sub_runs:
        props = sub_run.get_properties()
        if props["run_algorithm"] != "Ensemble":
            if float(props["score"]) > best_score:
                best_score = float(props["score"])
                best_run = sub_run

    model_name = "Automl{}".format(str(uuid.uuid4()).replace("-", ""))[:20]
    best_run.register_model(model_name=model_name,
                            model_path="outputs/model.pkl")

    # best_run, fitted_model = local_run.get_output()
    # local_run.register_model(
    #     description="automl meetup best model"
    # )
    print("Model name is {}".format(model_name))
def train_eval_register_model(ws, experiment_name, model_name, full_X, full_Y,
                              training_set_percentage):

    # start a training run by defining an experiment
    myexperiment = Experiment(ws, experiment_name)
    run = myexperiment.start_logging()

    train_X, test_X, train_Y, test_Y = train_test_split(
        full_X, full_Y, train_size=training_set_percentage, random_state=42)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(train_X)
    clf = linear_model.LogisticRegression(C=1)
    clf.fit(X_scaled, train_Y)

    scaled_inputs = scaler.transform(test_X)
    predictions = clf.predict(scaled_inputs)
    score = accuracy_score(test_Y, predictions)

    print("With %0.2f percent of data, model accuracy reached %0.4f." %
          (training_set_percentage, score))

    # Log the training metrics to Azure Machine Learning service run history
    run.log("Training_Set_Percentage", training_set_percentage)
    run.log("Accuracy", score)

    # Serialize the model to a pickle file in the outputs folder
    output_model_path = 'outputs/' + model_name + '.pkl'
    pickle.dump(clf, open(output_model_path, 'wb'))
    print('Exported model to ', output_model_path)

    # Serialize the scaler as a pickle file in the same folder as the model
    output_scaler_path = 'outputs/' + 'scaler' + '.pkl'
    pickle.dump(scaler, open(output_scaler_path, 'wb'))
    print('Exported scaler to ', output_scaler_path)

    # notice for the model_path, we supply the name of the outputs folder without a trailing slash
    # this will ensure both the model and the scaler get uploaded.
    registered_model = Model.register(model_path='outputs',
                                      model_name=model_name,
                                      workspace=ws)

    print(registered_model.name,
          registered_model.id,
          registered_model.version,
          sep='\t')

    run.complete()

    return (registered_model, clf, scaler, score, run)
Example #8
    def submit(self,
               dispatcher: CollectingDispatcher,
               tracker: Tracker,
               domain: Dict[Text, Any],) -> List[Dict]:
        """Define what the form has to do
        after all required slots are filled"""
        task=tracker.get_slot('task')
        data=tracker.get_slot('data')
        column_name=tracker.get_slot('column_name')
        dispatcher.utter_message(template="utter_doing_task", task=tracker.get_slot('task'),data=tracker.get_slot('data'),
                                 column_name=tracker.get_slot('column_name'))
        # Load the workspace from the saved config file
        ws = Workspace.from_config()
        print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

        
        df = pd.read_csv(data)
        train_data, test_data = train_test_split(df, test_size=0.1, random_state=42)
        label = column_name
        automl_config = AutoMLConfig(name='Automated ML Experiment',
                             task= task,
                             compute_target='local',
                             training_data = train_data,
                             validation_data = test_data,
                             label_column_name= label,
                             experiment_timeout_minutes=30,
                             iterations=6,
                             primary_metric = 'AUC_weighted',
                             featurization='auto',
                             )
        automl_experiment = Experiment(ws, 'mslearn-diabetes-automl')
        automl_run = automl_experiment.submit(automl_config, show_output=True)
        best_run, fitted_model = automl_run.get_output()
        best_run_metrics = best_run.get_metrics()
        metric_list = []
        for metric_name in best_run_metrics:
            metric = best_run_metrics[metric_name]
            metric_list.append((metric_name, metric))

        print("The best model pipeline for the data is")
        dispatcher.utter_message(text="The best model pipeline for the data is")
        print(fitted_model)
        dispatcher.utter_message(text=str(fitted_model))
        print("The different metrics are")
        dispatcher.utter_message(text="The different metrics are")
        print(metric_list)
        dispatcher.utter_message(text=str(metric_list))

        return []
Example #9
    def run_pipeline(self, params):
        """
        run_pipeline - Submit a pipeline job.

        :param Workspace ws: AML Workspace.
        :param Pipeline pipeline: AML pipeline.
        :param str pipeline_name: Directory of the source files.
        :param dict params: Pipeline parameteters.

        :returns:                               An AML experiment
        :rtype:                                 Experiment
        """
        # Submit the pipeline to be run
        exp = Experiment(self.ws, self.pipeline_name)
        exp_id = exp.submit(self.pipeline, pipeline_parameters=params)
        return exp_id
Example #10
 def fit(self, X, y, sample_weight=None):
     # fit implementation for a single output model.
     # Create experiment for specified workspace
     automl_config = copy.deepcopy(self._automl_config)
     current_time = time.localtime()
     current_time_string = time.strftime('%y_%m_%d-%H_%M_%S', current_time)
     experiment_name = self._experiment_name_prefix + "_" + current_time_string
     self._experiment = Experiment(self._workspace, experiment_name)
     # Configure automl_config with training set information.
     automl_config.user_settings['X'] = X
     automl_config.user_settings['y'] = y
     automl_config.user_settings['sample_weight'] = sample_weight
     # Wait for the run to complete, then set the model
     print("Experiment " + experiment_name + " has started.")
     local_run = self._experiment.submit(automl_config, show_output=self._show_output)
     print("Experiment " + experiment_name + " completed.")
     _, self._model = local_run.get_output()
Example #11
def train_eval_register_model(experiment_name, full_X, full_Y,
                              training_set_percentage):

    # start a training run by defining an experiment
    myexperiment = Experiment(ws, experiment_name)
    run = myexperiment.start_logging()

    train_X, test_X, train_Y, test_Y = train_test_split(
        full_X, full_Y, train_size=training_set_percentage, random_state=42)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(train_X)
    clf = linear_model.LogisticRegression(C=1)
    clf.fit(X_scaled, train_Y)

    scaled_inputs = scaler.transform(test_X)
    predictions = clf.predict(scaled_inputs)
    score = accuracy_score(test_Y, predictions)

    print("With %0.2f percent of data, model accuracy reached %0.4f." %
          (training_set_percentage, score))

    # Log the training metrics to Azure Machine Learning service run history
    run.log("Training_Set_Percentage", training_set_percentage)
    run.log("Accuracy", score)
    run.complete()

    model_name = experiment_name + '.pkl'
    output_model_path = './outputs/' + model_name
    pickle.dump(clf, open(output_model_path, 'wb'))

    # Upload and register this version of the model with Azure Machine Learning service
    destination_path = 'outputs/' + model_name
    run.upload_file(destination_path, output_model_path)  # destination, source
    registered_model = run.register_model(model_name='usedcarsmodel',
                                          model_path=destination_path)

    print(registered_model.name,
          registered_model.id,
          registered_model.version,
          sep='\t')

    return (clf, score)
Example #12
def main(force_model_register: bool, skip_model_register: bool,
         submit_pipeline: bool, publish_pipeline: bool, experiment_name: str,
         debug_run: bool, dbx_cluster_name: str, aml_compute_name: str,
         input_dataset_name: str, validation_dataset_name: str):
    pipeline: Pipeline = create_pipeline(
        debug_run=debug_run,
        dbx_compute=dbx_cluster_name,
        aml_compute=aml_compute_name,
        input_dataset=input_dataset_name,
        validation_dataset=validation_dataset_name)
    pipeline.validate()

    if submit_pipeline and not publish_pipeline:
        exp = Experiment(WS, experiment_name)
        exp.submit(pipeline,
                   pipeline_parameters={
                       "force_registration": str(force_model_register),
                       "skip_registration": str(skip_model_register)
                   })

    if publish_pipeline:
        published_pipeline: PublishedPipeline = pipeline.publish(
            name="Driver Safety Pipeline",
            description="Training Pipeline for new driver safety model")

        if submit_pipeline:
            published_pipeline.submit(workspace=WS,
                                      experiment_name=experiment_name,
                                      pipeline_parameters={
                                          "force_registration":
                                          str(force_model_register),
                                          "skip_registration":
                                          str(skip_model_register)
                                      })

        sys.stdout.write(published_pipeline.id)
Example #13
    def existingModel(self, exp_name, run_id):
        SUBSCRIPTION_ID = '1f6fddae-bfa7-4f33-b9a5-ad3d4f29b8a9'
        RESOURCE_GROUP = 'DECADAAPPS'
        WORKSPACE_NAME = 'kongming-aml'
        TENANT_ID = 'd7802200-0ab3-48a9-a946-c4e20d15c1ca'

        auth = InteractiveLoginAuthentication(tenant_id=TENANT_ID)
        ws = Workspace(subscription_id=SUBSCRIPTION_ID,
               resource_group=RESOURCE_GROUP,
               workspace_name=WORKSPACE_NAME,
               auth=auth)
        exp = Experiment(ws, exp_name)
        run = AutoMLRun(experiment=exp, run_id=run_id)
        _, model = run.get_output()
        return run, model
Example #14
    def run_pipeline(self, experiment_name, tags=None):
        """
        submits batch inference pipeline as an experiment run

        :param str experiment_name: [required] name of the experiment in azureml
        :param dict tags: [optional] dictionary of tags
        :returns: run
        :rtype: Run
        """
        if tags is None:
            tags = self.pipeline_tags
        step_sequence = StepSequence(steps=self.steps)
        pipeline = Pipeline(workspace=self.ws, steps=step_sequence)
        run = Experiment(self.ws, experiment_name).submit(
            pipeline, tags=tags, continue_on_step_failure=False)
        return run
Example #15
    def __init__(self,
                 directory=".",
                 experiment=None,
                 auth=None,
                 _disable_service_check=False):
        """
        Creates the project object using the local project path.
        :param directory: Project path.
        :type directory: str
        :param experiment:
        :type experiment: azureml.core.Experiment
        :param auth: An authentication object of a subclass of azureml.core.authentication.AbstractAuthentication
        :type auth: azureml.core.authentication.AbstractAuthentication
        :return:
        """
        from azureml.core.experiment import Experiment
        if not directory:
            directory = "."
        if experiment:
            self._workspace = experiment.workspace
            self.directory = directory
            self._project_path = os.path.abspath(directory)
            self._experiment = experiment
            self._snapshots_client = SnapshotsClient(
                self._workspace.service_context)

        else:
            if not auth:
                auth = InteractiveLoginAuthentication()

            self._project_path = os.path.abspath(directory)

            info_dict = _commands.get_project_info(auth, self._project_path)

            from azureml.core.workspace import Workspace
            self._workspace = Workspace(
                info_dict[_commands.SUBSCRIPTION_KEY],
                info_dict[_commands.RESOURCE_GROUP_KEY],
                info_dict[_commands.WORKSPACE_KEY],
                auth,
                _disable_service_check=_disable_service_check)
            self._experiment = Experiment(self._workspace,
                                          info_dict[_commands.PROJECT_KEY])
            self._snapshots_client = SnapshotsClient(
                self._workspace.service_context)
Example #16
class _InnerAutomatedMLModel():
    # Inner single-output model that the outer wrapper passes into MultiOutputRegressor
    def __init__(self,
                 automl_config,
                 workspace,
                 experiment_name_prefix="aml_experiment"):
        self._show_output = automl_config._show_output
        self._workspace = workspace
        self._automl_config = automl_config
        self._experiment_name_prefix = experiment_name_prefix

    def get_params(self, deep=True):
        # Must be implemented for MultiOutputRegressor to view _InnerAutomatedMLModel
        # as an sklearn estimator
        return {
            'workspace': self._workspace,
            'automl_config': self._automl_config,
            'experiment_name_prefix': self._experiment_name_prefix
        }

    def fit(self, X, y, sample_weight=None):
        # fit implementation for a single output model.
        # Create experiment for specified workspace
        automl_config = copy.deepcopy(self._automl_config)
        current_time = time.localtime()
        current_time_string = time.strftime('%y_%m_%d-%H_%M_%S', current_time)
        experiment_name = self._experiment_name_prefix + "_" + current_time_string
        self._experiment = Experiment(self._workspace, experiment_name)
        # Configure automl_config with training set information.
        automl_config.user_settings['X'] = X
        automl_config.user_settings['y'] = y
        automl_config.user_settings['sample_weight'] = sample_weight
        # Wait for the run to complete, then set the model
        print("Experiment " + experiment_name + " has started.")
        local_run = self._experiment.submit(automl_config,
                                            show_output=self._show_output)
        print("Experiment " + experiment_name + " completed.")
        _, self._model = local_run.get_output()

    def predict(self, X):
        return self._model.predict(X)

    def predict_proba(self, X):
        return self._model.predict_proba(X)
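As the class comment notes, _InnerAutomatedMLModel is meant to be wrapped by scikit-learn's MultiOutputRegressor; a usage sketch, assuming an AutoMLConfig automl_config, a Workspace ws and 2-D training arrays already exist (all names are illustrative):

from sklearn.multioutput import MultiOutputRegressor

automl_config._show_output = True      # the inner model reads this attribute in __init__ above
inner = _InnerAutomatedMLModel(automl_config=automl_config,
                               workspace=ws,
                               experiment_name_prefix="multi_output_automl")
wrapper = MultiOutputRegressor(inner)  # clones the inner model once per output column
wrapper.fit(X_train, Y_train)          # each clone submits its own AutoML experiment
Y_pred = wrapper.predict(X_test)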
Example #17
def detach(name=None, project=None):
    """Detach compute target"""
    # Set correlation id
    set_correlation_id()

    if not project:
        project = "."

    auth = get_cli_specific_auth()
    project_object = Project(auth=auth, directory=project)
    experiment = Experiment(project_object.workspace,
                            project_object.history.name)
    remove_legacy_compute_target(experiment, project, name)
    command_output = CLICommandOutput(
        "Detaching {} compute target for project "
        "{} successful".format(name, project_object.project_directory))
    command_output.set_do_not_print_dict()

    return get_cli_specific_output(command_output)
    verbosity=logging.INFO,
    spark_context=sc,  # noqa
    whitelist_models=[
        "GradientBoosting",
        "DecisionTree",
        "RandomForest",
        "ExtremeRandomTrees",
        "LightGBM",
    ],
    blacklist_models=["ensemble"],
    X=x,
    y=y,
    path=project_folder,
)

experiment = Experiment(ws, "host-ml-nt-ai-meetup")

db_run = experiment.submit(automl_config, show_output=True)

sub_runs = list(db_run.get_children())

best_run = None
best_score = 0

for sub_run in sub_runs:
    props = sub_run.get_properties()
    if props["run_algorithm"] != "Ensemble":
        if float(props["score"]) > best_score:
            best_score = float(props["score"])
            best_run = sub_run

model_name = "Automl{}".format(str(uuid.uuid4()).replace("-", ""))[:20]
ws = Workspace.create(name=workspace_name,
                      subscription_id=subscription_id,
                      resource_group=resource_group,
                      location=workspace_region,
                      exist_ok=True)

print("Workspace Provisioning complete.")

#%%
# Step 10 - Create an experiment and log metrics for multiple training runs
###########################################################################
from azureml.core.run import Run
from azureml.core.experiment import Experiment

# start a training run by defining an experiment
myexperiment = Experiment(ws, "UsedCars_Experiment")
root_run = myexperiment.start_logging()

training_set_percentage = 0.25
run = root_run.child_run("Training_Set_Percentage-%0.5F" %
                         training_set_percentage)
model, score = train_eval_model(full_X, full_Y, training_set_percentage)
print("With %0.2f percent of data, model accuracy reached %0.4f." %
      (training_set_percentage, score))
run.log("Training_Set_Percentage", training_set_percentage)
run.log("Accuracy", score)
run.complete()

training_set_percentage = 0.5
run = root_run.child_run("Training_Set_Percentage-%0.5F" %
                         training_set_percentage)
# ```
# from azureml.core.authentication import ServicePrincipalAuthentication
# auth = ServicePrincipalAuthentication('mytenantid', 'myappid', 'mypassword')
# ws = Workspace.from_config(auth = auth)
# ```
# For more details, see [aka.ms/aml-notebook-auth](http://aka.ms/aml-notebook-auth)

# In[85]:

ws = Workspace.from_config()

# Choose a name for the experiment and specify the project folder.
experiment_name = 'automl-classification'
project_folder = './sample_projects/automl-classification'

experiment = Experiment(ws, experiment_name)

output = {}
output['SDK version'] = azureml.core.VERSION
output['Subscription ID'] = ws.subscription_id
output['Workspace Name'] = ws.name
output['Resource Group'] = ws.resource_group
output['Location'] = ws.location
output['Project Directory'] = project_folder
output['Experiment Name'] = experiment.name
pd.set_option('display.max_colwidth', None)
outputDf = pd.DataFrame(data=output, index=[''])
outputDf.T

# ## Data
#
Example #21
import shutil
import os
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline, PipelineModel
from azureml.core.run import Run
from azureml.core.experiment import Experiment

# defining some variables
model_name = "smsspamclassif_runs.mml"
model_dbfs = os.path.join("/dbfs", model_name)

#these are the different regularization parameter values we are going to test
regs = [0.0001, 0.001, 0.01, 0.1]

myexperiment = Experiment(ws, "SMS_Spam_Classifier")
main_run = myexperiment.start_logging()

for reg in regs:
    print("Regularization rate: {}".format(reg))
    with main_run.child_run("reg-" + str(reg)) as run:
        lr = LogisticRegression(featuresCol="features",
                                labelCol='label',
                                regParam=reg)
        pipe = Pipeline(stages=[
            stringIndexer, tokenizer, stopwordsRemover, hashingTF, idf, lr
        ])
        model_p = pipe.fit(training_data)

        # make prediction on test_data
        pred = model_p.transform(test_data)
Example #22
from azureml.core.runconfig import RunConfiguration
from azureml.widgets import RunDetails
from checknotebookoutput import checkNotebookOutput

if __name__ == "__main__":
    ws = Workspace.from_config()

    print(ws.resource_group)
    print(ws.subscription_id)

    # choose a name for the run history container in the workspace
    experiment_name = 'automl-remote-attach'
    # project folder
    project_folder = './sample_projects/automl-remote-attach'

    experiment = Experiment(ws, experiment_name)
    automl_runs = list(experiment.get_runs(type='automl'))

    assert (len(automl_runs) == 1)

    compute_name = 'mydsvmb'

    dsvm_compute = ws.compute_targets[compute_name]

    # create a new RunConfig object
    conda_run_config = RunConfiguration(framework="python")

    # Set compute target to the Linux DSVM
    conda_run_config.target = dsvm_compute

    cd = CondaDependencies.create(pip_packages=['azureml-sdk[automl]'],
Example #23
# @author: datacore

from azureml.core.authentication import AzureCliAuthentication
import azure.cli.core
#cli_auth = AzureCliAuthentication()
from azureml.core.workspace import Workspace

ws = Workspace(subscription_id="24075937-2687-4457-bac6-ec16dec514c3",
               resource_group="VstsRG-784AbhijitC-8a31",
               workspace_name="automldc")

from azureml.core.experiment import Experiment
from azureml.core import Run
experiment = Experiment(ws, 'Myexp2_v1_test21')
best_run = Run(experiment=experiment,
               run_id='AutoML_74e9d9dc-f347-4392-b8bb-3edeb4a6afad_8')
fitted_model = Run(experiment=experiment,
                   run_id='AutoML_74e9d9dc-f347-4392-b8bb-3edeb4a6afad_8')
#print(best_run.register_model()
print(fitted_model)

# Get a dataset by name
from azureml.core.dataset import Dataset

file_name = '2018Q4PredictionTrainedSet101.csv'
stock_dataset = Dataset.get_by_name(ws, '2018Q4PredictionTrainedSet101.csv')
#stock_dataset
#dataset = Dataset.Tabular.from_delimited_files(stock_dataset)
stock_dataset.to_pandas_dataframe().describe()
Example #24
import numpy as np
import logging

from azureml.core.workspace import Workspace
from azureml.core.experiment import Experiment
from azureml.train.automl import AutoMLConfig
from azureml.train.automl.run import AutoMLRun
from azureml.automl.core.featurization import FeaturizationConfig

# https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/automated-machine-learning/forecasting-orange-juice-sales/auto-ml-forecasting-orange-juice-sales.ipynb
# Choose a name for the experiment and specify the project folder.
experiment_name = "automl-revenue-region-forecast"

try:

    experiment = Experiment(ws, experiment_name)

    output = {}
    output["SDK version"] = azureml.core.VERSION
    output["Subscription ID"] = ws.subscription_id
    output["Workspace Name"] = ws.name
    output["Resource Group"] = ws.resource_group
    output["Location"] = ws.location
    output["Experiment Name"] = experiment.name
    pd.set_option("display.max_colwidth", -1)
    print(pd.DataFrame(data=output, index=[""]).T)
except Exception as error:
    print(error)
    log_error("{} {}".format(notebook, error))  #log error in sentry
    dbutils.notebook.exit(error)  # stop the notebook run and surface the error
Example #25
def RunAutoML():
    subscription_id = request.json['subscription_id']
    resource_group = request.json['resource_group']
    workspace_name = request.json['workspace_name']
    file_name = request.json['file_name']
    #location = request.json['location']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)

    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')

    dataset_name = file_name

    # Get a dataset by name
    df = Dataset.get_by_name(workspace=ws, name=dataset_name)
    stock_dataset_df = df.to_pandas_dataframe()
    print('file successfully received.')
    stock_dataset_df.head()
    #stock_dataset_json = stock_dataset_df.to_json(orient='split')
    #print(stock_dataset_json)
    y_df = stock_dataset_df['ActionTaken'].values
    x_df = stock_dataset_df.drop(['ActionTaken'], axis=1)
    print(y_df)
    ExperimentName = request.json['ExperimentName']
    tasks = request.json['tasks']
    iterations = request.json['iterations']
    n_cross_validations = request.json['n_cross_validations']
    iteration_timeout_minutes = request.json['iteration_timeout_minutes']
    primary_metric = request.json['primary_metric']
    max_concurrent_iterations = request.json['max_concurrent_iterations']

    #n_cross_validations = request.json['n_cross_validations']

    try:
        automl_settings = {
            "name": ExperimentName,
            "iteration_timeout_minutes": iteration_timeout_minutes,
            "iterations": iterations,
            "n_cross_validations": n_cross_validations,
            "primary_metric": primary_metric,
            "preprocess": True,
            "max_concurrent_iterations": max_concurrent_iterations,
            "verbosity": logging.INFO
        }

        automl_config = AutoMLConfig(
            task=tasks,
            debug_log='automl_errors.log',
            path=os.getcwd(),
            #compute_target = 'Automlvm',
            X=x_df,
            y=y_df,
            **automl_settings,
        )

        experiment = Experiment(ws, 'automl_local_v2')
        remote_run = experiment.submit(automl_config, show_output=True)
        children = list(remote_run.get_children())
        metricslist = {}
        for run in children:
            properties = run.get_properties()
            metrics = {
                k: v
                for k, v in run.get_metrics().items() if isinstance(v, float)
            }
            metricslist[int(properties['iteration'])] = metrics

        rundata = pd.DataFrame(metricslist).sort_index(axis=1)
        rundata_toJson = rundata.to_json(orient='columns')

        return rundata_toJson
    except:

        return 'error'
Example #26
from azureml.train.automl.runtime.automl_explain_utilities import AutoMLExplainerSetupClass, \
    automl_setup_model_explanations, automl_check_model_if_explainable
from azureml.explain.model.mimic.models.lightgbm_model import LGBMExplainableModel
from azureml.explain.model.mimic_wrapper import MimicWrapper
from automl.client.core.common.constants import MODEL_PATH
from azureml.explain.model.scoring.scoring_explainer import TreeScoringExplainer, save

OUTPUT_DIR = './outputs/'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Get workspace from the run context
run = Run.get_context()
ws = run.experiment.workspace

# Get the AutoML run object from the experiment name and the workspace
experiment = Experiment(ws, '<<experiment_name>>')
automl_run = Run(experiment=experiment, run_id='<<run_id>>')

# Check if this AutoML model is explainable
if not automl_check_model_if_explainable(automl_run):
    raise Exception("Model explanations is currently not supported for " +
                    automl_run.get_properties().get('run_algorithm'))

# Download the best model from the artifact store
automl_run.download_file(name=MODEL_PATH, output_file_path='model.pkl')

# Load the AutoML model into memory
fitted_model = joblib.load('model.pkl')

# Get the train dataset from the workspace
train_dataset = Dataset.get_by_name(workspace=ws,
Example #27
from azureml.pipeline.core import PublishedPipeline
from azureml.core.experiment import Experiment
from azureml.core import Workspace

workspace = Workspace.from_config()

published_pipeline_id = ""
is_debug = True
debug_relay_connection_name = "test"

if published_pipeline_id is None or published_pipeline_id == "":
    raise ValueError("Initialize published_pipeline_id")

pipeline_parameters = {"is_debug": is_debug}
if is_debug:
    if debug_relay_connection_name == "":
        raise ValueError("Hybrid connection name cannot be empty!")

    pipeline_parameters.update(
        {"debug_relay_connection_name": debug_relay_connection_name})

experiment = Experiment(workspace, "Pipeline_debug_experiment")
published_pipeline = PublishedPipeline.get(workspace=workspace,
                                           id=published_pipeline_id)
experiment.submit(published_pipeline, pipeline_parameters=pipeline_parameters)
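Experiment.submit returns the pipeline run, so the caller can also capture it and block until it finishes; a short sketch as an alternative to the fire-and-forget call above:

pipeline_run = experiment.submit(published_pipeline,
                                 pipeline_parameters=pipeline_parameters)
pipeline_run.wait_for_completion(show_output=True)
print(pipeline_run.get_status())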
Example #28
def RunAutoML():
    subscription_id = request.json['subscription_id']
    resource_group = request.json['resource_group']
    workspace_name = request.json['workspace_name']
    file_name = request.json['file_name']
    location = request.json['location']
    target_var = request.json['target_var']

    ws = Workspace(subscription_id=subscription_id,
                   resource_group=resource_group,
                   workspace_name=workspace_name)

    print("Found workspace {} at location {}".format(ws.name, ws.location))
    print('Found existing Workspace.')

    dataset_name = file_name

    # Get a dataset by name
    df = Dataset.get_by_name(workspace=ws, name=dataset_name)
    stock_dataset_df = df.to_pandas_dataframe()
    print('file successfully received.')
    stock_dataset_df.head()
    #stock_dataset_json = stock_dataset_df.to_json(orient='split')
    #print(stock_dataset_json)
    y_df = stock_dataset_df[target_var].values
    x_df = stock_dataset_df.drop([target_var], axis=1)
    print(y_df)
    ExperimentName = request.json['ExperimentName']
    tasks = request.json['tasks']
    iterations = request.json['iterations']
    n_cross_validations = request.json['n_cross_validations']
    iteration_timeout_minutes = request.json['iteration_timeout_minutes']
    primary_metric = request.json['primary_metric']
    max_concurrent_iterations = request.json['max_concurrent_iterations']
    best_model = request.json['best_model']

    #n_cross_validations = request.json['n_cross_validations']

    try:
        automl_settings = {
            "name": ExperimentName,
            "iteration_timeout_minutes": iteration_timeout_minutes,
            "iterations": iterations,
            "n_cross_validations": n_cross_validations,
            "primary_metric": primary_metric,
            "preprocess": True,
            "max_concurrent_iterations": max_concurrent_iterations,
            "verbosity": logging.INFO
        }

        automl_config = AutoMLConfig(
            task=tasks,
            debug_log='automl_errors.log',
            path=
            'D:\\Stock_Prediction\\AutoML_Azure\\python\\Flask_API_Azure\\log',
            #compute_target = 'Automlvm',
            X=x_df,
            y=y_df,
            **automl_settings,
        )

        experiment = Experiment(ws, ExperimentName)
        remote_run = experiment.submit(automl_config, show_output=True)
        best_run, fitted_model = remote_run.get_output()
        #print(best_run)
        print(best_run.get_file_names())
        #Register the model
        from datetime import date
        model = best_run.register_model(model_name=best_model +
                                        str(date.today()),
                                        model_path='outputs/model.pkl')
        print(model.name, model.id, model.version, sep='\t')
        children = list(remote_run.get_children())
        metricslist = {}
        for run in children:
            properties = run.get_properties()
            metrics = {
                k: v
                for k, v in run.get_metrics().items() if isinstance(v, float)
            }
            metricslist[int(properties['iteration'])] = metrics

        rundata = pd.DataFrame(metricslist).sort_index(axis=1)
        rundata.rename(columns={
            0: "one",
            1: "two",
            2: "three",
            3: "four",
            4: "five",
            5: "six",
            6: "seven",
            7: "eight",
            8: "nine",
            9: "ten",
        },
                       inplace=True)
        rundata_toJson = rundata.to_json(orient='columns')
        print(rundata_toJson)
        return rundata_toJson
    except:

        return 'error'
Example #29
def trigger_training_job():

    # Define Vars < Change the vars>.
    # In a production situation, don't put secrets in source code, but as secret variables,
    # see https://docs.microsoft.com/en-us/azure/devops/pipelines/process/variables?view=azure-devops&tabs=yaml%2Cbatch#secret-variables
    workspace = "<Name of your workspace>"
    subscription_id = "<Subscription id>"
    resource_grp = "<Name of your resource group where aml service is created>"

    domain = "westeurope.azuredatabricks.net"  # change location in case databricks instance is not in westeurope
    dbr_pat_token_raw = "<<your Databricks Personal Access Token>>"

    DBR_PAT_TOKEN = bytes(dbr_pat_token_raw, encoding='utf-8')  # encode the token as bytes
    notebookRemote = "/3_IncomeNotebookDevops"
    experiment_name = "experiment_model_release"
    model_name_run = datetime.datetime.now().strftime(
        "%Y%m%d%H%M%S"
    ) + "_dbrmod.mml"  # in case you want to change the name, keep the .mml extension
    model_name = "databricksmodel.mml"  # in case you want to change the name, keep the .mml extension
    db_compute_name = "dbr-amls-comp"

    #
    # Step 1: Run notebook using Databricks Compute in AML SDK
    #
    cli_auth = AzureCliAuthentication()

    ws = Workspace(workspace_name=workspace,
                   subscription_id=subscription_id,
                   resource_group=resource_grp,
                   auth=cli_auth)
    ws.get_details()

    #
    # Step 2: Create job and attach it to cluster
    #
    # In this steps, secret are added as parameters (spn_tenant, spn_clientid, spn_clientsecret)
    # Never do this in a production situation, but use secret scope backed by key vault instead
    # See https://docs.azuredatabricks.net/user-guide/secrets/secret-scopes.html#azure-key-vault-backed-scopes
    response = requests.post(
        'https://%s/api/2.0/jobs/create' % domain,
        headers={'Authorization': b"Bearer " + DBR_PAT_TOKEN},
        json={
            "name": "Run AzureDevopsNotebook Job",
            "new_cluster": {
                "spark_version": "4.0.x-scala2.11",
                "node_type_id": "Standard_D3_v2",
                "spark_env_vars": {
                    'PYSPARK_PYTHON': '/databricks/python3/bin/python3',
                },
                "autoscale": {
                    "min_workers": 1,
                    "max_workers": 2
                }
            },
            "libraries": [{
                "pypi": {
                    "package": "azureml-sdk[databricks]"
                }
            }],
            "notebook_task": {
                "notebook_path":
                notebookRemote,
                "base_parameters": [{
                    "key": "subscription_id",
                    "value": subscription_id
                }, {
                    "key": "resource_group",
                    "value": resource_grp
                }, {
                    "key": "workspace_name",
                    "value": workspace
                }, {
                    "key": "model_name",
                    "value": model_name_run
                }]
            }
        })

    if response.status_code != 200:
        print("Error launching cluster: %s: %s" %
              (response.json()["error_code"], response.json()["message"]))
        exit(2)

    #
    # Step 3: Start job
    #
    databricks_job_id = response.json()['job_id']

    response = requests.post(
        'https://%s/api/2.0/jobs/run-now' % domain,
        headers={'Authorization': b"Bearer " + DBR_PAT_TOKEN},
        json={"job_id": +databricks_job_id})

    if response.status_code != 200:
        print("Error launching cluster: %s: %s" %
              (response.json()["error_code"], response.json()["message"]))
        exit(3)

    print(response.json()['run_id'])

    #
    # Step 4: Wait until job is finished
    #
    databricks_run_id = response.json()['run_id']
    scriptRun = 1
    count = 0
    while scriptRun == 1:
        response = requests.get(
            'https://%s/api/2.0/jobs/runs/get?run_id=%s' %
            (domain, databricks_run_id),
            headers={'Authorization': b"Bearer " + DBR_PAT_TOKEN},
        )

        state = response.json()['state']
        life_cycle_state = state['life_cycle_state']
        print(state)

        if life_cycle_state in ["TERMINATED", "SKIPPED", "INTERNAL_ERROR"]:
            result_state = state['result_state']
            if result_state == "SUCCESS":
                print("run ok")
                scriptRun = 0
            #exit(0)
            else:
                exit(4)
        elif count > 180:
            print("time out occurred after 90 minutes")
            exit(5)
        else:
            count += 1
            time.sleep(30)  # wait 30 seconds before next status update

    #
    # Step 5: Retrieve model from dbfs
    #
    mdl, ext = model_name_run.split(".")
    model_zip_run = mdl + ".zip"

    response = requests.get(
        'https://%s/api/2.0/dbfs/read?path=/%s' % (domain, model_zip_run),
        headers={'Authorization': b"Bearer " + DBR_PAT_TOKEN})
    if response.status_code != 200:
        print("Error copying dbfs results: %s: %s" %
              (response.json()["error_code"], response.json()["message"]))
        exit(1)

    model_output = base64.b64decode(response.json()['data'])

    # download model in deploy folder
    os.chdir("deploy")
    with open(model_zip_run, "wb") as outfile:
        outfile.write(model_output)
    print("Downloaded model {} to Project root directory".format(model_name))

    #
    # Step 6: Retrieve model metrics from dbfs
    #
    mdl, ext = model_name_run.split(".")
    model_metrics_json_run = mdl + "_metrics.json"

    response = requests.get(
        'https://%s/api/2.0/dbfs/read?path=/%s' %
        (domain, model_metrics_json_run),
        headers={'Authorization': b"Bearer " + DBR_PAT_TOKEN})
    if response.status_code != 200:
        print("Error copying dbfs results: %s: %s" %
              (response.json()["error_code"], response.json()["message"]))
        exit(2)

    model_metrics_output = json.loads(base64.b64decode(
        response.json()['data']))

    #
    # Step 7: Put model and metrics to Azure ML Service
    #

    # start a training run by defining an experiment
    myexperiment = Experiment(ws, experiment_name)
    run = myexperiment.start_logging()
    run.upload_file("outputs/" + model_zip_run, model_zip_run)

    #run.log("pipeline_run", pipeline_run.id)
    run.log("au_roc", model_metrics_output["Area_Under_ROC"])
    run.log("au_prc", model_metrics_output["Area_Under_PR"])
    run.log("truePostive", model_metrics_output["True_Positives"])
    run.log("falsePostive", model_metrics_output["False_Positives"])
    run.log("trueNegative", model_metrics_output["True_Negatives"])
    run.log("falseNegative", model_metrics_output["False_Negatives"])

    run.complete()
    run_id = run.id
    print("run id:", run_id)

    # unzip file to model_name_run
    shutil.unpack_archive(model_zip_run, model_name_run)

    model = Model.register(
        model_path=model_name_run,  # this points to a local file
        model_name=model_name,  # this is the name the model is registered as
        tags={
            "area": "spar",
            "type": "regression",
            "run_id": run_id
        },
        description="Medium blog test model",
        workspace=ws,
    )
    print("Model registered: {} \nModel Description: {} \nModel Version: {}".
          format(model.name, model.description, model.version))

    # Step 8. Finally, writing the registered model details to conf/model.json
    model_json = {}
    model_json["model_name"] = model.name
    model_json["model_version"] = model.version
    model_json["run_id"] = run_id
    model_json["model_name_run"] = model_name_run
    with open("../conf/model.json", "w") as outfile:
        json.dump(model_json, outfile)
Example #30
from azureml.train.automl.automlexplainer import retrieve_model_explanation
from azureml.core.model import Model
from azureml.core.image import ContainerImage
from azureml.core.image.image import Image
from azureml.core import Webservice
from azureml.core.webservice import AciWebservice

# try:
# setting the local env to handle missing packages
run_user_managed = RunConfiguration()
run_user_managed.environment.python.user_managed_dependencies = False

# Create workspace object for existing one and create an experiment
ws = Workspace.from_config('subscription.json')
print(ws.name, ws.location, ws.resource_group, sep='\t')
experiment = Experiment(workspace=ws, name='experiment1')

# full path to training data,testing data
file_path1 = os.path.join(os.getcwd(), "cumodelwo2014.csv")
dflowtr = dprep.auto_read_file(path=file_path1)
file_path2 = os.path.join(os.getcwd(), "test2014.csv")
dflowte = dprep.auto_read_file(path=file_path2)

# Specifying x(causal) and y(response) attributes in training data
dflowtr_x = dflowtr.keep_columns([
    'cell-ID', 'Soil_Name', 'MEAN_Yld_V', 'COUNT_Yld', 'MEAN_Eleva',
    'RANGE_Elev', 'Crop-Type', 'V.A.T(F)', 'R.A.T(F)', 'M.A.T(F)',
    'V.PET(inch)', 'R.PET(inch)', 'M.PET(inch)', 'V.T.R(inch)', 'R.T.R(inch)',
    'M.T.R(inch)'
])
dflowtr_y = dflowtr.keep_columns('NormalizedYield')