def train_step(train_dir, compute_target):
    max_depth = PipelineParameter(name='max_depth', default_value=5)
    n_estimators = PipelineParameter(name='n_estimators', default_value=500)

    model_dir = PipelineData(name='model_dir',
                             pipeline_output_name='model_dir',
                             datastore=train_dir.datastore,
                             output_mode='mount',
                             is_directory=True)

    outputs = [model_dir]
    outputs_map = {'model_dir': model_dir}

    estimator = SKLearn(source_directory=os.path.dirname(os.path.abspath(__file__)),
                        entry_script='train.py',
                        compute_target=compute_target)

    step = EstimatorStep(estimator=estimator,
                         estimator_entry_script_arguments=[
                             '--train_dir', train_dir,
                             '--output_dir', model_dir,
                             '--max_depth', max_depth,
                             '--n_estimators', n_estimators
                         ],
                         inputs=[train_dir],
                         compute_target=compute_target,
                         outputs=outputs,
                         allow_reuse=False)

    return step, outputs_map
def evaluate_step(model_dir, test_dir, compute_target):
    accuracy_file = PipelineData(name='accuracy_file',
                                 pipeline_output_name='accuracy_file',
                                 datastore=test_dir.datastore,
                                 output_mode='mount',
                                 is_directory=False)

    outputs = [accuracy_file]
    outputs_map = {'accuracy_file': accuracy_file}

    estimator = SKLearn(source_directory=os.path.dirname(os.path.abspath(__file__)),
                        entry_script='evaluate.py',
                        compute_target=compute_target)

    step = EstimatorStep(estimator=estimator,
                         estimator_entry_script_arguments=[
                             '--test_dir', test_dir,
                             '--model_dir', model_dir,
                             '--accuracy_file', accuracy_file
                         ],
                         inputs=[model_dir, test_dir],
                         outputs=outputs,
                         compute_target=compute_target,
                         allow_reuse=True)

    return step, outputs_map
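# A minimal sketch (not part of the original snippets) of how the two step
# builders above could be wired into a pipeline and submitted. The workspace,
# the datastore paths 'train'/'test', the compute name and the experiment
# name are all assumptions for illustration only.
from azureml.core import Experiment, Workspace
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import Pipeline

ws = Workspace.from_config()
compute_target = ws.compute_targets['cpu-cluster']  # assumed compute name

datastore = ws.get_default_datastore()
train_dir = DataReference(datastore=datastore, data_reference_name='train_dir',
                          path_on_datastore='train')  # assumed path
test_dir = DataReference(datastore=datastore, data_reference_name='test_dir',
                         path_on_datastore='test')    # assumed path

train, train_outputs = train_step(train_dir, compute_target)
evaluate, evaluate_outputs = evaluate_step(train_outputs['model_dir'],
                                           test_dir, compute_target)

pipeline = Pipeline(workspace=ws, steps=[train, evaluate])
pipeline_run = Experiment(ws, 'train-and-evaluate').submit(pipeline)
pipeline_run.wait_for_completion(show_output=True)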
def submit(experiment_name: str, compute_name: str, kernel: str, penalty: float):
    print("This notebook was created using version 1.0.83 of the Azure ML SDK")
    print("You are using version", azureml.core.VERSION, "of the SDK")

    # Get a reference to the workspace. Be sure to download the config.json
    # from your workspace and place it in the parent folder.
    ws = Workspace.from_config()
    print('Loaded workspace', ws.name)

    # Reference the experiment
    experiment = Experiment(workspace=ws, name=experiment_name)
    print('Logging to experiment', experiment_name)

    # Get a reference to an existing compute target.
    compute_target = ws.compute_targets[compute_name]

    # Set up an Estimator for submitting the job. An Estimator further wraps
    # RunConfig with additional configuration for specific cases. There are
    # Estimators provided for many common runtimes such as PyTorch and
    # TensorFlow. In this case we use the SKLearn-specific estimator.
    script_params = {
        '--output-dir': "outputs",
        '--kernel': kernel,
        '--penalty': penalty,
    }

    # NOTE: scikit-learn added below until the default image includes v22.1+
    estimator = SKLearn(source_directory=".",
                        entry_script='train.py',
                        script_params=script_params,
                        compute_target=compute_target,
                        pip_packages=['matplotlib', 'scikit-learn'])

    # Submit the experiment to get a run and wait for completion
    run = experiment.submit(estimator)
    print('Submitted, please wait...')
    run.wait_for_completion(show_output=True)

    # Register the trained model
    model = run.register_model(
        model_name='covid-tweets-analyis',
        model_path='outputs/model/covid-tweets-analyis.joblib')

    print('Run number:', run.number)
    print('Run id:', run.id)
    print("Run details are available at:", run.get_portal_url())
    print("Model: {} v{}".format(model.name, model.version))

    if 'azureml.git.dirty' in run.properties:
        if run.properties['azureml.git.dirty']:
            print("WARNING: You have uncommitted changes. To ensure "
                  "reproducibility, check in your code before you train.")
    else:
        print('WARNING: To ensure reproducibility you should be using git!')
def main(experiment, environment, dataset):
    workspace = Workspace.from_config()
    experiment = Experiment(workspace, experiment)
    compute_target = ComputeTarget(workspace, environment)

    # Use the root of the solution as source folder for the run.
    root_folder = Path(__file__).parent.parent

    # Provide each of the datasets to the estimator as a named input.
    # You can access these from within the training script.
    datasets = [Dataset.get_by_name(workspace, ds).as_named_input(ds)
                for ds in dataset]

    estimator = SKLearn(
        source_directory=root_folder,
        entry_script='customer_churn/train.py',
        conda_dependencies_file='conda_dependencies.yml',
        compute_target=compute_target,
        inputs=datasets
    )

    run = experiment.submit(estimator)
    run.wait_for_completion(show_output=True)
# create directory for training scripts
train_folder = os.path.join(os.getcwd(), 'sklearn_classification', 'training')

# create environment for classification
env = Environment('classification_env')
cd = CondaDependencies.create(pip_packages=[
    'azureml-sdk',
    'scikit-learn',
    'azureml-dataprep[pandas,fuse]>=1.1.14'
])
env.python.conda_dependencies = cd

# setup hyper parameter values to tune
regularizations = np.linspace(0.05, 0.95, 10)

# loop over the parameter values
for reg in regularizations:
    # create sklearn estimator
    train_params = {
        '--data-folder': dataset.as_named_input('data').as_mount(),
        '--regularization': reg
    }

    est = SKLearn(source_directory=train_folder,
                  script_params=train_params,
                  compute_target=compute_target,
                  environment_definition=env,
                  entry_script='train.py')

    # submit run for execution
    run = exp.submit(config=est)
# The estimator helps to submit training jobs.
# Here we define a single-node scikit-learn job.
from azureml.train.sklearn import SKLearn

script_params = {
    '--kernel': 'linear',
    '--penalty': 1.0,
}

estimator = SKLearn(source_directory=project_folder,
                    script_params=script_params,
                    compute_target=compute_target,
                    entry_script='train_iris.py',
                    pip_packages=['joblib==0.13.2'])
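# A minimal follow-up sketch, not from the original snippet: submitting the
# estimator above and streaming logs. The workspace object `ws` and the
# experiment name 'train-iris' are assumptions for illustration.
from azureml.core import Experiment

experiment = Experiment(workspace=ws, name='train-iris')
run = experiment.submit(estimator)
run.wait_for_completion(show_output=True)
print(run.get_metrics())  # metrics logged by train_iris.py, if any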
def _start_estimator_training(self, training_name: str, estimator_type: str = None,
                              input_datasets: np.array = None,
                              input_datasets_to_download: np.array = None,
                              compute_target: str = 'local', gpu_compute: bool = False,
                              script_parameters: dict = None, show_widget: bool = True,
                              **kwargs):
    '''
    Will start a new training using an Estimator, taking the training name as the folder of the run
    Args:
        training_name (str): The name of a training. This will be used to create a directory. Can contain a subdirectory
        estimator_type (str): one of these values (tensorflow, sklearn, pytorch)
        input_datasets (np.array): An array of data set names that will be mounted on the compute in a directory of the dataset name
        input_datasets_to_download (np.array): An array of data set names that will be downloaded to the compute in a directory of the dataset name
        compute_target (str): The compute target (default = 'local') on which the training should be executed
        gpu_compute (bool): Indicates if GPU compute is required for this script or not
        script_parameters (dict): A dictionary of key/value parameters that will be passed as arguments to the training script
        show_widget (bool): Will display the live tracking of the submitted Run
    '''
    from azureml.train.estimator import Estimator

    # Check if directory exists
    if not (os.path.exists(training_name) and os.path.isdir(training_name)):
        raise FileNotFoundError(training_name)

    # Check compute target
    if compute_target != 'local':
        self.__check_compute_target(compute_target, gpu_compute)

    # Add datasets, as mount or as download
    datasets = list()
    if input_datasets is not None:
        for ds in input_datasets:
            datasets.append(self.__workspace.datasets[ds].as_named_input(ds).as_mount(path_on_compute=ds))
    if input_datasets_to_download is not None:
        for ds in input_datasets_to_download:
            datasets.append(self.__workspace.datasets[ds].as_named_input(ds).as_download(path_on_compute=ds))

    constructor_parameters = {
        'source_directory': training_name,
        'script_params': script_parameters,
        'inputs': datasets,
        'compute_target': compute_target,
        'entry_script': 'train.py',
        'pip_requirements_file': 'requirements.txt',
        'use_gpu': gpu_compute,
        'use_docker': True}

    print('Creating estimator of type', estimator_type)
    if estimator_type is None:
        # Using default Estimator
        estimator = Estimator(**constructor_parameters)
    elif estimator_type == 'tensorflow':
        from azureml.train.dnn import TensorFlow
        version_par = 'framework_version'
        if version_par not in constructor_parameters.keys():
            print('Defaulting to version 2.0 for TensorFlow')
            constructor_parameters[version_par] = '2.0'
        estimator = TensorFlow(**constructor_parameters)
    elif estimator_type == 'sklearn':
        from azureml.train.sklearn import SKLearn
        estimator = SKLearn(**constructor_parameters)
    elif estimator_type == 'pytorch':
        from azureml.train.dnn import PyTorch
        estimator = PyTorch(**constructor_parameters)

    # Submit training
    self.__current_run = self.__experiment.submit(estimator)
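# A hypothetical usage sketch for the helper above (not part of the original
# code). It assumes `trainer` is an instance of the class that defines
# _start_estimator_training, that a 'customer-churn' folder with train.py and
# requirements.txt exists, and that the dataset and compute names exist in
# the workspace.
trainer._start_estimator_training(
    training_name='customer-churn',         # folder that contains train.py
    estimator_type='sklearn',                # selects the SKLearn branch
    input_datasets=['churn-train'],          # mounted under ./churn-train
    compute_target='cpu-cluster',
    script_parameters={'--regularization': 0.1})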
from azureml.train.sklearn import SKLearn
from azureml.core import Experiment
from azureml.widgets import RunDetails

# Set up the parameters
script_params = {
    '--regularization': 0.1,   # regularization rate
    '--data-folder': data_ref  # data reference to download files from datastore
}

# Create an estimator
estimator = SKLearn(source_directory=experiment_folder,
                    entry_script='diabetes_training.py',
                    script_params=script_params,
                    compute_target='local')

# Create an experiment
experiment_name = 'diabetes-training'
experiment = Experiment(workspace=ws, name=experiment_name)

# Run the experiment
run = experiment.submit(config=estimator)

# Show the run details while running
RunDetails(run).show()
run.wait_for_completion()
def func1(self, tenant_id, service_principal_id, service_principal_password):
    # check core SDK version number
    print("Azure ML SDK Version: ", azureml.core.VERSION)
    logging.info("Azure ML SDK Version: " + str(azureml.core.VERSION))

    svc_pr = ServicePrincipalAuthentication(
        tenant_id, service_principal_id, service_principal_password)

    # load workspace configuration from the config.json file in the current folder.
    ws = Workspace.from_config(auth=svc_pr)
    print(ws.name, ws.location, ws.resource_group, sep='\t')

    from azureml.core import Experiment
    experiment_name = 'sklearn-mnist'
    exp = Experiment(workspace=ws, name=experiment_name)

    # Create a compute
    from azureml.core.compute import AmlCompute
    from azureml.core.compute import ComputeTarget
    import os

    # choose a name for your cluster
    compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "cpucluster")
    compute_min_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0)
    compute_max_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 4)

    # This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
    vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")

    if compute_name in ws.compute_targets:
        compute_target = ws.compute_targets[compute_name]
        if compute_target and type(compute_target) is AmlCompute:
            print('found compute target. just use it. ' + compute_name)
    else:
        print('creating a new compute target...')
        provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                    min_nodes=compute_min_nodes,
                                                                    max_nodes=compute_max_nodes)

        # create the cluster
        compute_target = ComputeTarget.create(
            ws, compute_name, provisioning_config)

        # can poll for a minimum number of nodes and for a specific timeout.
        # if no min node count is provided it will use the scale settings for the cluster
        compute_target.wait_for_completion(
            show_output=True, min_node_count=None, timeout_in_minutes=20)

        # For a more detailed view of current AmlCompute status, use get_status()
        print(compute_target.get_status().serialize())

    from azureml.core.dataset import Dataset

    data_folder = os.path.join(os.getcwd(), 'Data')
    paths = [
        os.path.join(data_folder, "test-images.gz"),
        os.path.join(data_folder, "test-labels.gz"),
        os.path.join(data_folder, "train-images.gz"),
        os.path.join(data_folder, "train-labels.gz")
    ]

    datastore = ws.get_default_datastore()
    datastore.upload(src_dir=data_folder, target_path='mnist',
                     overwrite=True, show_progress=True)
    logging.info("Uploaded")

    datastore_paths = [
        (datastore, "test-images.gz"),
        (datastore, "test-labels.gz"),
        (datastore, "train-images.gz"),
        (datastore, "train-labels.gz"),
    ]
    dataset = Dataset.File.from_files(path=datastore_paths)

    from azureml.core.environment import Environment
    from azureml.core.conda_dependencies import CondaDependencies

    env = Environment('my_env')
    cd = CondaDependencies.create(pip_packages=['azureml-sdk',
                                                'scikit-learn',
                                                'azureml-dataprep[pandas,fuse]>=1.1.14'])
    env.python.conda_dependencies = cd

    from azureml.train.sklearn import SKLearn

    script_params = {
        '--data-folder': dataset.as_named_input('mnist').as_mount(),
        '--regularization': 0.5
    }

    script_folder = os.path.join(os.getcwd(), "scripts")
    est = SKLearn(source_directory=script_folder,
                  script_params=script_params,
                  compute_target=compute_target,
                  environment_definition=env,
                  entry_script='train.py')

    run = exp.submit(config=est)
    run
# multi tenant with my account
from azureml.core.authentication import InteractiveLoginAuthentication

int_auth = InteractiveLoginAuthentication(tenant_id='your_tenant_id')
ws = Workspace.from_config(auth=int_auth)
print(ws.name)

dataset = Dataset.get_by_name(workspace=ws, name='demo_wines_live')

# point to compute target
comp = ComputeTarget(ws, name='compute-instance-demo')

# estimator with SKLearn by default + azureml-sdk package
est = SKLearn(
    source_directory='./scripts',
    entry_script='train.py',
    compute_target=comp,
    inputs=[dataset.as_named_input('train')],  # readable from the script
    pip_packages=['azureml-sdk', 'pyarrow>=0.12.0']
)

exp = Experiment(workspace=ws, name='submitted_wine')
run = exp.submit(est)
run.wait_for_completion(show_output=True)

#%%
%%writefile ./scripts/train.py
from azureml.core import Workspace, Datastore, Dataset
from azureml.core.run import Run
from sklearn import datasets
import pandas as pd
import numpy as np
#%%
import shutil
shutil.copy('utils.py', script_folder)

# create an estimator
#%%
from azureml.train.sklearn import SKLearn

script_params = {
    '--data-folder': ds.path('mnist').as_mount(),
    '--regularization': 0.5
}

est = SKLearn(source_directory=script_folder,
              script_params=script_params,
              compute_target=compute_target,
              entry_script='train.py')

# Submit the job
#%%
run = exp.submit(config=est)
run

# Monitor the run
#%%
from azureml.widgets import RunDetails
RunDetails(run).show()
# Specify parameter sampler
from azureml.train.hyperdrive.parameter_expressions import choice, uniform

ps = RandomParameterSampling({
    "--C": uniform(0.1, 1),
    "--max_iter": choice(50, 100, 150, 200)
})

# Specify a Policy
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=1, delay_evaluation=5)

if "training" not in os.listdir():
    os.mkdir("./training")

# Create a SKLearn estimator for use with train.py
est = SKLearn(source_directory='.',
              entry_script='train.py',
              compute_target=cpu_cluster)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(hyperparameter_sampling=ps,
                                     primary_metric_name='Accuracy',
                                     max_total_runs=5,
                                     max_concurrent_runs=2,
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     policy=policy,
                                     estimator=est)

# In[63]:

# Submit your hyperdrive run to the experiment and show run details with the widget.
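# A minimal sketch of the submission step described in the comment above; it
# is not from the original notebook. The experiment object `experiment` is
# assumed to exist already.
from azureml.widgets import RunDetails

hyperdrive_run = experiment.submit(config=hyperdrive_config)
RunDetails(hyperdrive_run).show()
hyperdrive_run.wait_for_completion(show_output=True)

# Pick the best child run by the primary metric ('Accuracy' above).
best_run = hyperdrive_run.get_best_run_by_primary_metric()
print(best_run.get_metrics())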
# +
# Specify parameter sampler, using Bayesian sampling to quickly choose the most promising combinations
ps = BayesianParameterSampling({
    "--model": choice('linear', 'mlpregressor', 'gradientboosting'),
    "--lrf": choice(1.0, 0.1, 0.25, 0.5, 2.0),
    "--iterations": choice(100, 200),
    "--complexity": choice(1.0, 0.25, 0.5, 2.0)
})

# Create a SKLearn estimator for use with train.py
est = SKLearn(source_directory=script_path,
              entry_script=script_file,
              compute_target=compute_target)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=None,
    primary_metric_name="r2_score",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=50,
    max_concurrent_runs=5)
# -

# ### Setup experiment and submit run

experiment_name = 'AzureMLCapstoneExperiment_HyperDrive'
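# A hedged sketch of the "Setup experiment and submit run" step that follows;
# it is not part of the original file and assumes `ws` is the current Workspace.
from azureml.core import Experiment

experiment = Experiment(workspace=ws, name=experiment_name)
hyperdrive_run = experiment.submit(config=hyperdrive_config)
hyperdrive_run.wait_for_completion(show_output=True)

# Inspect the best child run found by Bayesian sampling.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
print(best_run.id)
print(best_run.get_metrics())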
joblib.dump(value=model, filename='outputs/model.pkl')
run.complete()

######

from azureml.train.estimator import Estimator
from azureml.core import Experiment

# Create an estimator
estimator = Estimator(source_directory='experiment_folder',
                      entry_script='training_script.py',
                      compute_target='local',
                      conda_packages=['scikit-learn'])

# Create and run an experiment
experiment = Experiment(workspace=ws, name='training_experiment')
run = experiment.submit(config=estimator)

#######

from azureml.train.sklearn import SKLearn
from azureml.core import Experiment

# Create an estimator
estimator = SKLearn(source_directory='experiment_folder',
                    entry_script='training_script.py',
                    compute_target='local')

# Create and run an experiment
experiment = Experiment(workspace=ws, name='training_experiment')
run = experiment.submit(config=estimator)
"batch_size": choice(16, 32, 64, 128) }) # Specify a Policy, check job every 2 iterations policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1) if "training" not in os.listdir(): os.mkdir("./training") # workspaceblobstore is the default blob storage #src.run_config.source_directory_data_store = "workspaceblobstore" # Create a SKLearn estimator for use with train.py est = SKLearn("./training", script_params=None, compute_target=compute_target, entry_script='train.py', conda_packages=['scikit-learn']) # Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy. hyperdrive_config = HyperDriveConfig( estimator=est, hyperparameter_sampling=ps, policy=None, primary_metric_name='validation_acc', primary_metric_goal=PrimaryMetricGoal.MAXIMIZE, max_total_runs=4, max_concurrent_runs=4) # In[4]:
def main():
    # argument parsing
    parser = argparse.ArgumentParser(
        description='Script to connect to workspace')
    parser.add_argument('--experiment', type=str, default='classification',
                        help='experiment name for workspace')
    parser.add_argument('--config', type=str, default='.aml_config/config.json',
                        help='config path for variables')
    parser.add_argument('--compute_name', type=str, default='train-classify')
    parser.add_argument('--compute_nodes', type=int, default=4,
                        help='number of nodes in compute cluster')
    parser.add_argument('--compute_type', type=str, default='STANDARD_D2_V2',
                        help='type of compute in cluster')
    parser.add_argument('--compute_priority', type=str, default='dedicated',
                        help='compute priority in compute cluster')
    parser.add_argument('--env', type=str, default='local',
                        help='env argument to get variables')
    args = parser.parse_args()

    # set config path
    env = args.env
    config_path = args.config

    # set env variables
    if env == 'local':
        status = set_env_vars(config_path)

        # check for error
        if not status:
            print('Setting env variables failed')
            return -1
        else:
            print('\nEnvironment variables set')

    # perform service principal auth
    svc_pr = get_svc_pr()

    # check for error with service principal auth
    if not svc_pr:
        print('Service principal auth failed')
        return -1
    else:
        print('\nAuthentication succeeded')

    # connect to ws
    ws = get_ws(svc_pr)

    # check for error with connection to ws
    if not ws:
        print('Workspace connection failed')
        return -1
    else:
        print(f'\nFound workspace {ws.name} at location {ws.location}')

    # create classification experiment
    exp = Experiment(workspace=ws, name=args.experiment)

    # create compute target
    compute_target = get_compute_target(ws, args.compute_name, args.compute_type,
                                        args.compute_nodes, args.compute_priority)

    # register dataset to be used in compute
    web_paths = [
        'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz'
    ]
    dataset = Dataset.File.from_files(path=web_paths)
    dataset = dataset.register(workspace=ws,
                               name='classification dataset',
                               description='training and test dataset',
                               create_new_version=True)

    # create directory for training scripts
    train_folder = os.path.join('..', 'classification', 'training')

    # create environment for classification
    env = Environment('classification_env')
    cd = CondaDependencies.create(pip_packages=[
        'azureml-sdk',
        'scikit-learn',
        'azureml-dataprep[pandas,fuse]>=1.1.14'
    ])
    env.python.conda_dependencies = cd
    env.docker.enabled = True

    # setup hyper parameter values to tune
    regularizations = np.linspace(0.05, 0.95, 10)

    # loop over the parameter values
    for reg in regularizations:
        # create sklearn estimator
        train_params = {
            '--data-folder': dataset.as_named_input('data').as_mount(),
            '--regularization': reg
        }

        est = SKLearn(source_directory=train_folder,
                      script_params=train_params,
                      compute_target=compute_target,
                      environment_definition=env,
                      entry_script='train.py')

        # submit run for execution
        _ = exp.submit(config=est)
print(exp)

cluster_name = args.trcompute
print("Printing compute name")
print(cluster_name)

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target')
except ComputeTargetException:
    print('Creating a new compute target...')

# Parameters to be passed to the training script
script_params = {
    '--ws': args.ws,
    '--rg': args.rg,
    '--datastore': args.datastore,
    '--dataset': args.dataset,
}

estimator = SKLearn(source_directory=project_folder,
                    script_params=script_params,
                    compute_target=compute_target,
                    entry_script='test2.py',
                    conda_dependencies_file="env.yml")

print("submitting experiment")
run = exp.submit(estimator)
print("experiment submitted")
run.wait_for_completion(show_output=True)
    framework_version = experiment_settings["framework"]["sklearn"][
        "framework_version"]
    enable_optimized_mode = experiment_settings["framework"]["sklearn"][
        "_enable_optimized_mode"]

    estimator = SKLearn(
        source_directory=experiment_settings["source_directory"],
        compute_target=compute_target,
        entry_script=experiment_settings["entry_script"],
        script_params=experiment_settings["script_parameters"],
        use_docker=experiment_settings["docker"]["use_docker"],
        custom_docker_image=experiment_settings["docker"]["custom_image"],
        image_registry_details=container_registry,
        user_managed=experiment_settings["user_managed"],
        conda_packages=experiment_settings["dependencies"]["conda_packages"],
        pip_packages=experiment_settings["dependencies"]["pip_packages"],
        conda_dependencies_file=experiment_settings["dependencies"][
            "conda_dependencies_file"],
        pip_requirements_file=experiment_settings["dependencies"][
            "pip_requirements_file"],
        environment_variables=experiment_settings["environment_variables"],
        inputs=experiment_settings["data_references"],
        shm_size=experiment_settings["docker"]["shm_size"],
        max_run_duration_seconds=experiment_settings[
            "max_run_duration_seconds"],
        framework_version=framework_version,
        _enable_optimized_mode=enable_optimized_mode)
else:
    estimator = Estimator(
        source_directory=experiment_settings["source_directory"],
        compute_target=compute_target,
def main():
    e = Env()

    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group
    )
    print("get_workspace:")
    print(aml_workspace)

    aml_compute = get_compute(
        aml_workspace,
        e.compute_name,
        e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    environment = get_environment(
        aml_workspace, e.aml_env_name, create_new=e.rebuild_env)
    run_config = RunConfiguration()
    run_config.environment = environment

    if e.datastore_name:
        datastore_name = e.datastore_name
    else:
        datastore_name = aml_workspace.get_default_datastore().name
    run_config.environment.environment_variables["DATASTORE_NAME"] \
        = datastore_name

    dataset_name = e.dataset_name
    file_name = e.file_name
    datastore = Datastore.get(aml_workspace, datastore_name)

    if dataset_name not in aml_workspace.datasets:
        raise Exception("Could not find dataset at \"%s\"." % dataset_name)
    else:
        dataset = Dataset.get_by_name(aml_workspace, name=dataset_name)
        dataset.download(target_path='.', overwrite=True)
        datastore.upload_files([file_name],
                               target_path=dataset_name,
                               overwrite=True)

    raw_data_file = DataReference(datastore=datastore,
                                  data_reference_name="Raw_Data_File",
                                  path_on_datastore=dataset_name + '/' + file_name)

    clean_data_file = PipelineParameter(name="clean_data_file",
                                        default_value="/clean_data.csv")
    clean_data_folder = PipelineData("clean_data_folder",
                                     datastore=datastore)

    prepDataStep = PythonScriptStep(name="Prepare Data",
                                    source_directory=e.sources_directory_train,
                                    script_name=e.data_prep_script_path,
                                    arguments=["--raw_data_file", raw_data_file,
                                               "--clean_data_folder", clean_data_folder,
                                               "--clean_data_file", clean_data_file],
                                    inputs=[raw_data_file],
                                    outputs=[clean_data_folder],
                                    compute_target=aml_compute,
                                    allow_reuse=False)
    print("Step Prepare Data created")

    new_model_file = PipelineParameter(name="new_model_file",
                                       default_value='/' + e.model_name + '.pkl')
    new_model_folder = PipelineData("new_model_folder", datastore=datastore)

    est = SKLearn(source_directory=e.sources_directory_train,
                  entry_script=e.train_script_path,
                  pip_packages=['azureml-sdk',
                                'scikit-learn==0.20.3',
                                'azureml-dataprep[pandas,fuse]>=1.1.14'],
                  compute_target=aml_compute)

    trainingStep = EstimatorStep(
        name="Model Training",
        estimator=est,
        estimator_entry_script_arguments=["--clean_data_folder", clean_data_folder,
                                          "--new_model_folder", new_model_folder,
                                          "--clean_data_file", clean_data_file.default_value,
                                          "--new_model_file", new_model_file.default_value],
        runconfig_pipeline_params=None,
        inputs=[clean_data_folder],
        outputs=[new_model_folder],
        compute_target=aml_compute,
        allow_reuse=False)
    print("Step Train created")

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)

    evaluateStep = PythonScriptStep(
        name="Evaluate Model",
        source_directory=e.sources_directory_train,
        script_name=e.evaluate_script_path,
        arguments=["--model_name", model_name_param],
        compute_target=aml_compute,
        allow_reuse=False)
    print("Step Evaluate created")

    registerStep = PythonScriptStep(
        name="Register Model",
        source_directory=e.sources_directory_train,
        script_name=e.register_script_path,
        arguments=["--new_model_folder", new_model_folder,
                   "--new_model_file", new_model_file,
                   "--model_name", model_name_param],
        inputs=[new_model_folder],
        compute_target=aml_compute,
        allow_reuse=False)
    print("Step Register created")

    if (e.run_evaluation).lower() == 'true':
        print("Include evaluation step before register step.")
        trainingStep.run_after(prepDataStep)
        evaluateStep.run_after(trainingStep)
        registerStep.run_after(evaluateStep)
    else:
        print("Exclude evaluation step and directly run register step.")
        trainingStep.run_after(prepDataStep)
        registerStep.run_after(trainingStep)

    pipeline = Pipeline(workspace=aml_workspace, steps=[registerStep])
    pipeline.validate()
    print("Pipeline is built")

    published_pipeline = pipeline.publish(
        name=e.pipeline_name,
        description="Predict Employee Retention Model training pipeline",
        version=e.build_id
    )
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
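# A hedged sketch of triggering the published pipeline, which could be
# appended at the end of main() above; it is not part of the original script.
# The experiment name 'retention-training' is an assumption for illustration.
from azureml.core import Experiment

pipeline_run = Experiment(aml_workspace, 'retention-training').submit(
    published_pipeline,
    pipeline_parameters={'model_name': e.model_name})
pipeline_run.wait_for_completion(show_output=True)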
estimator = Estimator(source_directory='experiment_folder',
                      entry_script='training_script.py',
                      compute_target='local',
                      conda_packages=['scikit-learn'])

# Create and run an experiment
experiment = Experiment(workspace=ws, name='training_experiment')
run = experiment.submit(config=estimator)

from azureml.train.sklearn import SKLearn
from azureml.core import Experiment

# Create an estimator
estimator = SKLearn(source_directory='experiment_folder',
                    entry_script='training_script.py',
                    compute_target='local')

# Create and run an experiment
experiment = Experiment(workspace=ws, name='training_experiment')
run = experiment.submit(config=estimator)

from azureml.core import Run
import argparse
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
    # There's only one parameter, so grid sampling will try each value - with multiple parameters it would try every combination
    '--regularization': choice(0.001, 0.005, 0.01, 0.05, 0.1, 1.0)
})

# Set evaluation policy to stop poorly performing training runs early
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

# Get the training dataset
diabetes_ds = ws.datasets.get("diabetes_dataset")

# Create an estimator that uses the remote compute
hyper_estimator = SKLearn(
    source_directory=experiment_folder,
    inputs=[diabetes_ds.as_named_input('diabetes')],  # Pass the dataset as an input
    compute_target=gpu_cluster,
    conda_packages=['pandas', 'ipykernel', 'matplotlib'],
    pip_packages=['azureml-sdk', 'argparse', 'pyarrow'],
    entry_script='diabetes_training.py')

# Configure hyperdrive settings
hyperdrive = HyperDriveConfig(estimator=hyper_estimator,
                              hyperparameter_sampling=params,
                              policy=policy,
                              primary_metric_name='AUC',
                              primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                              max_total_runs=6,
                              max_concurrent_runs=4)

# Run the experiment
run = experiment.submit(config=hyperdrive)
    datastore_name='telemetry',
    container_name=args.storage_container,
    account_name=args.storage_account,
    account_key=args.storage_key,
)

input_data = DataReference(
    datastore=telemetry_ds,
    data_reference_name="input_data",
    path_on_datastore=args.storage_path,
)

preprocessing_est = SKLearn(
    source_directory='010-preprocessing',
    compute_target=cpu_cluster,
    entry_script='dataprep.py',
    conda_packages=['pandas'],
    pip_packages=['fastavro'],
)

output = PipelineData("output", datastore=telemetry_ds)

preprocessing_step = EstimatorStep(
    name="Preprocessing_Train",
    estimator=preprocessing_est,
    estimator_entry_script_arguments=[
        "--data_dir", input_data,
        "--output_data_dir", output
    ],
    inputs=[input_data],
    outputs=[output],
    compute_target=cpu_cluster,
    allow_reuse=True,
# Sample a range of parameter values
params = GridParameterSampling(
    {
        # There's only one parameter, so grid sampling will try each value - with multiple parameters it would try every combination
        '--regularization': choice(0.001, 0.005, 0.01, 0.05, 0.1, 1.0)
    }
)

# Get the training dataset
diabetes_ds = ws.datasets.get("diabetes dataset")

# Create an estimator that uses the remote compute
hyper_estimator = SKLearn(source_directory=experiment_folder,
                          inputs=[diabetes_ds.as_named_input('diabetes')],  # Pass the dataset as an input...
                          pip_packages=['azureml-sdk'],  # ...so we need azureml-dataprep (it's in the SDK!)
                          entry_script='diabetes_training.py',
                          compute_target=training_cluster)

# Configure hyperdrive settings
hyperdrive = HyperDriveConfig(estimator=hyper_estimator,
                              hyperparameter_sampling=params,
                              policy=None,
                              primary_metric_name='AUC',
                              primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                              max_total_runs=6,
                              max_concurrent_runs=4)

# Run the experiment
experiment = Experiment(workspace=ws, name='diabates_training_hyperdrive')
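# A minimal sketch of submitting the hyperdrive configuration above and
# registering the best model; these lines are not in the original excerpt,
# and the model name and model path are assumptions for illustration.
from azureml.widgets import RunDetails

run = experiment.submit(config=hyperdrive)
RunDetails(run).show()
run.wait_for_completion(show_output=True)

# Inspect the best child run by the primary metric and register its model.
best_run = run.get_best_run_by_primary_metric()
best_run.register_model(model_name='diabetes_model',              # assumed name
                        model_path='outputs/diabetes_model.pkl',  # assumed path
                        properties={'AUC': best_run.get_metrics()['AUC']})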