def train_step(train_dir, compute_target):
    """Build the pipeline step that runs ``train.py``.

    Args:
        train_dir: Pipeline input holding the training data. Its
            ``datastore`` attribute is reused for the model output.
        compute_target: Compute target passed to both the estimator and
            the step.

    Returns:
        A ``(step, outputs_map)`` tuple: the ``EstimatorStep`` and a dict
        mapping ``'model_dir'`` to the step's ``PipelineData`` output.
    """
    # Hyperparameters exposed as pipeline parameters with their defaults.
    depth_param = PipelineParameter(name='max_depth', default_value=5)
    trees_param = PipelineParameter(name='n_estimators', default_value=500)

    # Directory output written by the training script.
    trained_model = PipelineData(
        name='model_dir',
        pipeline_output_name='model_dir',
        datastore=train_dir.datastore,
        output_mode='mount',
        is_directory=True,
    )

    # The entry script lives next to this file.
    script_root = os.path.dirname(os.path.abspath(__file__))
    sklearn_estimator = SKLearn(
        source_directory=script_root,
        entry_script='train.py',
        compute_target=compute_target,
    )

    script_args = [
        '--train_dir', train_dir,
        '--output_dir', trained_model,
        '--max_depth', depth_param,
        '--n_estimators', trees_param,
    ]
    training = EstimatorStep(
        estimator=sklearn_estimator,
        estimator_entry_script_arguments=script_args,
        inputs=[train_dir],
        compute_target=compute_target,
        outputs=[trained_model],
        allow_reuse=False,
    )

    return training, {'model_dir': trained_model}
def evaluate_step(model_dir, test_dir, compute_target):
    """Build the pipeline step that runs ``evaluate.py``.

    Args:
        model_dir: Pipeline input holding the trained model.
        test_dir: Pipeline input holding the test data. Its ``datastore``
            attribute is reused for the accuracy output.
        compute_target: Compute target passed to both the estimator and
            the step.

    Returns:
        A ``(step, outputs_map)`` tuple: the ``EstimatorStep`` and a dict
        mapping ``'accuracy_file'`` to the step's ``PipelineData`` output.
    """
    # Single-file output written by the evaluation script.
    accuracy_output = PipelineData(
        name='accuracy_file',
        pipeline_output_name='accuracy_file',
        datastore=test_dir.datastore,
        output_mode='mount',
        is_directory=False,
    )

    # The entry script lives next to this file.
    script_root = os.path.dirname(os.path.abspath(__file__))
    sklearn_estimator = SKLearn(
        source_directory=script_root,
        entry_script='evaluate.py',
        compute_target=compute_target,
    )

    script_args = [
        '--test_dir', test_dir,
        '--model_dir', model_dir,
        '--accuracy_file', accuracy_output,
    ]
    evaluation = EstimatorStep(
        estimator=sklearn_estimator,
        estimator_entry_script_arguments=script_args,
        inputs=[model_dir, test_dir],
        outputs=[accuracy_output],
        compute_target=compute_target,
        allow_reuse=True,
    )

    return evaluation, {'accuracy_file': accuracy_output}
Пример #3
0
def submit(experiment_name: str,
           compute_name: str,
           kernal: str,
           penalty: float):
    """Submit ``train.py`` as an SKLearn estimator run and register the model.

    The run is submitted to the named experiment, waited on with streamed
    output, and its ``outputs/model/covid-tweets-analyis.joblib`` artifact
    is registered as the model ``covid-tweets-analyis``.

    Args:
        experiment_name: Name of the experiment the run is logged to.
        compute_name: Key of an existing entry in the workspace's
            ``compute_targets``.
        kernal: Value forwarded to the training script as ``--kernel``
            (the parameter keeps its existing spelling for callers).
        penalty: Value forwarded to the training script as ``--penalty``.

    Raises:
        KeyError: If ``compute_name`` is not among the workspace's compute
            targets (assuming ``compute_targets`` behaves like a dict).
    """
    print("This notebook was created using version 1.0.83 of the Azure ML SDK")
    print("You are using version", azureml.core.VERSION, "of the SDK")

    # Get a reference to the workspace. Workspace.from_config() reads the
    # config.json downloaded from your workspace.
    ws = Workspace.from_config()
    print('Loaded workspace', ws.name)

    # Reference the experiment
    experiment = Experiment(workspace=ws, name=experiment_name)
    print('Logging to experiment', experiment_name)

    # Get a reference to an existing compute target.
    compute_target = ws.compute_targets[compute_name]

    # Arguments forwarded to train.py on the command line.
    script_params = {
        '--output-dir': "outputs",
        '--kernel': kernal,
        '--penalty': penalty,
    }

    # NOTE: scikit-learn added below until default image includes v22.1+
    estimator = SKLearn(source_directory=".",
                        entry_script='train.py',
                        script_params=script_params,
                        compute_target=compute_target,
                        pip_packages=['matplotlib', 'scikit-learn'])

    # Submit the experiment to get a run and wait for completion
    run = experiment.submit(estimator)
    print('Submitted please wait...')
    run.wait_for_completion(show_output=True)

    # Register the trained model. The name and path keep their existing
    # spelling because the training script is expected to write that path.
    model = run.register_model(
        model_name='covid-tweets-analyis',
        model_path='outputs/model/covid-tweets-analyis.joblib')

    print('Run number:', run.number)
    print('Run id:', run.id)
    print("Run details are available at:", run.get_portal_url())
    print("Model: {} v{}".format(model.name, model.version))

    if 'azureml.git.dirty' not in run.properties:
        print('WARNING: To ensure reproducibility you should be using git!')
        return

    # NOTE(review): run properties appear to be stored as strings
    # ('True'/'False') - confirm against the SDK. A plain truthiness test
    # would treat the string 'False' as dirty, so compare the normalised
    # text instead; this also handles real booleans.
    git_dirty = str(run.properties['azureml.git.dirty']).strip().lower()
    if git_dirty == 'true':
        print("WARNING: You have uncommitted changes. To ensure "
              "reproducibility check in your code before you train.")
Пример #4
0
def main(experiment, environment, dataset):
    """Submit ``customer_churn/train.py`` as an SKLearn run and wait for it.

    Args:
        experiment: Name of the experiment to submit to.
        environment: Name passed to ``ComputeTarget`` to look up the
            compute the run executes on.
        dataset: Iterable of registered dataset names. Each one is given
            to the estimator as a named input under its own name.
    """
    ws = Workspace.from_config()
    target_experiment = Experiment(ws, experiment)
    cluster = ComputeTarget(ws, environment)

    # The source folder for the run is two levels above this file.
    solution_root = Path(__file__).parent.parent

    # One named input per dataset, so the training script can look each
    # of them up by name.
    named_inputs = []
    for ds_name in dataset:
        registered = Dataset.get_by_name(ws, ds_name)
        named_inputs.append(registered.as_named_input(ds_name))

    churn_estimator = SKLearn(
        source_directory=solution_root,
        entry_script='customer_churn/train.py',
        conda_dependencies_file='conda_dependencies.yml',
        compute_target=cluster,
        inputs=named_inputs,
    )

    submitted = target_experiment.submit(churn_estimator)
    submitted.wait_for_completion(show_output=True)
# Path of the folder that holds the training scripts.
train_folder = os.path.join(
    os.getcwd(), 'sklearn_classification', 'training')

# Environment shared by every run submitted below.
env = Environment('classification_env')
cd = CondaDependencies.create(
    pip_packages=[
        'azureml-sdk',
        'scikit-learn',
        'azureml-dataprep[pandas,fuse]>=1.1.14',
    ])
env.python.conda_dependencies = cd

# Ten evenly spaced regularization values between 0.05 and 0.95.
regularizations = np.linspace(0.05, 0.95, 10)

# Submit one run per regularization value; runs are not waited on here.
for reg in regularizations:
    # Script arguments: the mounted dataset plus the value under test.
    train_params = {
        '--data-folder': dataset.as_named_input('data').as_mount(),
        '--regularization': reg,
    }

    est = SKLearn(
        source_directory=train_folder,
        script_params=train_params,
        compute_target=compute_target,
        environment_definition=env,
        entry_script='train.py',
    )

    run = exp.submit(config=est)
Пример #6
0
# the estimator helps to submit training jobs.
# here define a single node sklearn job.

from azureml.train.sklearn import SKLearn

# Command-line arguments forwarded to train_iris.py.
script_params = {'--kernel': 'linear', '--penalty': 1.0}

# SKLearn estimator for the job; joblib is pinned for the run.
estimator = SKLearn(
    source_directory=project_folder,
    script_params=script_params,
    compute_target=compute_target,
    entry_script='train_iris.py',
    pip_packages=['joblib==0.13.2'],
)
Пример #7
0
    def _start_estimator_training(self, training_name: str,
                                  estimator_type: str = None,
                                  input_datasets: np.array = None,
                                  input_datasets_to_download: np.array = None,
                                  compute_target: str = 'local',
                                  gpu_compute: bool = False,
                                  script_parameters: dict = None,
                                  show_widget: bool = True, **kwargs):
        '''
        Start a new training run using an Estimator, taking the training name as the folder of the run
        Args:
            training_name (str): The name of a training.  Must be an existing directory; it is used as the source directory.  Can contain subdirectory
            estimator_type (str): one of these values (tensorflow, sklearn, pytorch).  None selects the generic Estimator
            input_datasets (np.array): An array of data set names that will be mounted on the compute in a directory of the dataset name
            input_datasets_to_download (np.array): An array of data set names that will be downloaded to the compute in a directory of the dataset name
            compute_target (str): The compute target (default = 'local') on which the training should be executed
            gpu_compute (bool): Indicates if GPU compute is required for this script or not
            script_parameters (dict): A dictionary of key/value parameters that will be passed as arguments to the training script
            show_widget (bool): Not read by the code in this method
            **kwargs: Not read by the code in this method
        Raises:
            ValueError: If estimator_type is not None, 'tensorflow', 'sklearn' or 'pytorch'
            FileNotFoundError: If training_name is not an existing directory
        '''
        # Reject unknown estimator types up front. Previously they fell
        # through every branch and failed at submit with an unbound
        # 'estimator' variable.
        supported_types = (None, 'tensorflow', 'sklearn', 'pytorch')
        if estimator_type not in supported_types:
            raise ValueError(
                'Unsupported estimator_type {!r}; expected one of {}'.format(
                    estimator_type, supported_types))

        # Check if directory exists
        if not os.path.isdir(training_name):
            raise FileNotFoundError(training_name)

        # Check compute target
        if compute_target != 'local':
            self.__check_compute_target(compute_target, gpu_compute)

        # Add datasets: mounted ones first, then downloaded ones. Each is
        # named after the dataset and placed at a path of the same name.
        datasets = []
        if input_datasets is not None:
            for ds in input_datasets:
                datasets.append(self.__workspace.datasets[ds].as_named_input(ds).as_mount(path_on_compute=ds))
        if input_datasets_to_download is not None:
            for ds in input_datasets_to_download:
                datasets.append(self.__workspace.datasets[ds].as_named_input(ds).as_download(path_on_compute=ds))

        # Keyword arguments shared by every estimator class.
        constructor_parameters = {
            'source_directory': training_name,
            'script_params': script_parameters,
            'inputs': datasets,
            'compute_target': compute_target,
            'entry_script': 'train.py',
            'pip_requirements_file': 'requirements.txt',
            'use_gpu': gpu_compute,
            'use_docker': True}

        print('Creating estimator of type', estimator_type)

        if estimator_type is None:
            # Using default Estimator
            from azureml.train.estimator import Estimator
            estimator = Estimator(**constructor_parameters)
        elif estimator_type == 'tensorflow':
            from azureml.train.dnn import TensorFlow
            version_par = 'framework_version'
            if version_par not in constructor_parameters:
                print('Defaulting to version 2.0 for TensorFlow')
                constructor_parameters[version_par] = '2.0'
            estimator = TensorFlow(**constructor_parameters)
        elif estimator_type == 'sklearn':
            from azureml.train.sklearn import SKLearn
            estimator = SKLearn(**constructor_parameters)
        else:
            # Only 'pytorch' remains after the validation above.
            from azureml.train.dnn import PyTorch
            estimator = PyTorch(**constructor_parameters)

        # Submit training
        self.__current_run = self.__experiment.submit(estimator)
Пример #8
0
from azureml.train.sklearn import SKLearn
from azureml.core import Experiment
from azureml.widgets import RunDetails

# Arguments forwarded to diabetes_training.py: the regularization rate
# and the data reference used to fetch files from the datastore.
script_params = {
    '--regularization': 0.1,
    '--data-folder': data_ref,
}

# Estimator that runs the script on the local compute.
estimator = SKLearn(
    source_directory=experiment_folder,
    entry_script='diabetes_training.py',
    script_params=script_params,
    compute_target='local',
)

# Experiment the run is logged under.
experiment_name = 'diabetes-training'
experiment = Experiment(workspace=ws, name=experiment_name)

# Submit, show the details widget, then wait for the run to finish.
run = experiment.submit(config=estimator)
RunDetails(run).show()
run.wait_for_completion()


Пример #9
0
    def func1(self, tenant_id, service_principal_id, service_principal_password):
        """Submit the ``sklearn-mnist`` training run with service-principal auth.

        Loads the workspace from ``config.json``, reuses or creates the
        compute cluster, uploads the local ``Data`` folder to the default
        datastore, and submits ``scripts/train.py`` as an SKLearn estimator
        run. The run is submitted but not waited on.

        Cluster settings are read from the ``AML_COMPUTE_CLUSTER_NAME``,
        ``AML_COMPUTE_CLUSTER_MIN_NODES``, ``AML_COMPUTE_CLUSTER_MAX_NODES``
        and ``AML_COMPUTE_CLUSTER_SKU`` environment variables.

        Args:
            tenant_id: First argument to ``ServicePrincipalAuthentication``.
            service_principal_id: Second argument to
                ``ServicePrincipalAuthentication``.
            service_principal_password: Third argument to
                ``ServicePrincipalAuthentication``.

        Raises:
            ValueError: If a node-count environment variable is set to a
                value that is not an integer.
        """
        import os

        from azureml.core import Experiment
        from azureml.core.compute import AmlCompute
        from azureml.core.compute import ComputeTarget
        from azureml.core.conda_dependencies import CondaDependencies
        from azureml.core.dataset import Dataset
        from azureml.core.environment import Environment
        from azureml.train.sklearn import SKLearn

        # check core SDK version number
        print("Azure ML SDK Version: ", azureml.core.VERSION)
        logging.info("Azure ML SDK Version: %s", azureml.core.VERSION)

        svc_pr = ServicePrincipalAuthentication(
            tenant_id,
            service_principal_id,
            service_principal_password)

        # load workspace configuration from the config.json file
        ws = Workspace.from_config(auth=svc_pr)
        print(ws.name, ws.location, ws.resource_group, sep='\t')

        experiment_name = 'sklearn-mnist'
        exp = Experiment(workspace=ws, name=experiment_name)

        # choose a name for your cluster
        compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "cpucluster")
        # Environment values are strings; convert so the node counts have
        # the same type whether they come from the environment or the
        # integer defaults.
        compute_min_nodes = int(
            os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
        compute_max_nodes = int(
            os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 4))

        # This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
        vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")

        if compute_name in ws.compute_targets:
            # An existing target is used as-is, whatever its type.
            compute_target = ws.compute_targets[compute_name]
            if isinstance(compute_target, AmlCompute):
                print('found compute target. just use it. ' + compute_name)
        else:
            print('creating a new compute target...')
            provisioning_config = AmlCompute.provisioning_configuration(
                vm_size=vm_size,
                min_nodes=compute_min_nodes,
                max_nodes=compute_max_nodes)

            # create the cluster
            compute_target = ComputeTarget.create(
                ws, compute_name, provisioning_config)

            # can poll for a minimum number of nodes and for a specific timeout.
            # if no min node count is provided it will use the scale settings for the cluster
            compute_target.wait_for_completion(
                show_output=True, min_node_count=None, timeout_in_minutes=20)

            # For a more detailed view of current AmlCompute status, use get_status()
            print(compute_target.get_status().serialize())

        # Upload the local Data folder to the default datastore.
        data_folder = os.path.join(os.getcwd(), 'Data')
        datastore = ws.get_default_datastore()
        datastore.upload(src_dir=data_folder, target_path='mnist',
                         overwrite=True, show_progress=True)

        logging.info("Uploaded")

        # NOTE(review): the upload above targets 'mnist', but these paths
        # point at the datastore root - confirm whether they should be
        # 'mnist/<file>'.
        datastore_paths = [
            (datastore, "test-images.gz"),
            (datastore, "test-labels.gz"),
            (datastore, "train-images.gz"),
            (datastore, "train-labels.gz"),
        ]

        dataset = Dataset.File.from_files(path=datastore_paths)

        env = Environment('my_env')
        cd = CondaDependencies.create(
            pip_packages=['azureml-sdk', 'scikit-learn',
                          'azureml-dataprep[pandas,fuse]>=1.1.14'])
        env.python.conda_dependencies = cd

        script_params = {
            '--data-folder': dataset.as_named_input('mnist').as_mount(),
            '--regularization': 0.5
        }

        script_folder = os.path.join(os.getcwd(), "scripts")

        est = SKLearn(source_directory=script_folder,
                      script_params=script_params,
                      compute_target=compute_target,
                      environment_definition=env,
                      entry_script='train.py')

        # Submit only; completion is not awaited here.
        exp.submit(config=est)
Пример #10
0
#multi tenant with my account 
from azureml.core.authentication import InteractiveLoginAuthentication
# Interactive login against an explicit tenant (multi-tenant account).
int_auth = InteractiveLoginAuthentication(tenant_id='your_tenant_id')
ws = Workspace.from_config(auth=int_auth)
print(ws.name)

# Registered dataset used as the training input.
dataset = Dataset.get_by_name(workspace=ws, name='demo_wines_live')

# Compute the run executes on.
comp = ComputeTarget(ws, name='compute-instance-demo')

# SKLearn estimator; the dataset is passed as the named input 'train'
# so the script can read it, and two extra pip packages are installed.
est = SKLearn(
    source_directory='./scripts',
    entry_script='train.py',
    compute_target=comp,
    inputs=[dataset.as_named_input('train')],
    pip_packages=['azureml-sdk', 'pyarrow>=0.12.0'],
)

# Submit and stream the output until the run completes.
exp = Experiment(workspace=ws, name='submitted_wine')
run = exp.submit(est)
run.wait_for_completion(show_output=True)

#%%
%%writefile ./scripts/train.py
from azureml.core import Workspace, Datastore, Dataset
from azureml.core.run import Run
from sklearn import datasets 
import pandas as pd
import numpy as np 
Пример #11
0
#%%
# Copy the helper module into the script folder.
import shutil
shutil.copy('utils.py', script_folder)

#%%
# Build the estimator.
from azureml.train.sklearn import SKLearn

script_params = {
    '--data-folder': ds.path('mnist').as_mount(),
    '--regularization': 0.5,
}

est = SKLearn(
    source_directory=script_folder,
    script_params=script_params,
    compute_target=compute_target,
    entry_script='train.py',
)

#%%
# Submit the job; the bare expression displays the run in a notebook.
run = exp.submit(config=est)
run

#%%
# Monitor the run with the details widget.
from azureml.widgets import RunDetails
RunDetails(run).show()
# Random sampling over the two script arguments being tuned.
from azureml.train.hyperdrive.parameter_expressions import choice
ps = RandomParameterSampling({
    "--C": uniform(0.1, 1),
    "--max_iter": choice(50, 100, 150, 200),
})

# Bandit early-termination policy.
policy = BanditPolicy(slack_factor=0.1,
                      evaluation_interval=1,
                      delay_evaluation=5)

# Create ./training unless the working directory already has that entry.
if "training" not in os.listdir():
    os.mkdir("./training")

# SKLearn estimator that runs train.py from the current directory.
est = SKLearn(source_directory='.',
              entry_script='train.py',
              compute_target=cpu_cluster)

# HyperDrive configuration combining the estimator, sampler and policy.
hyperdrive_config = HyperDriveConfig(
    hyperparameter_sampling=ps,
    primary_metric_name='Accuracy',
    max_total_runs=5,
    max_concurrent_runs=2,
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    policy=policy,
    estimator=est,
)


# In[63]:


# Submit your hyperdrive run to the experiment and show run details with the widget.
Пример #13
0
# +
# Parameter sampler: Bayesian sampling over four script arguments.
ps = BayesianParameterSampling({
    "--model": choice('linear', 'mlpregressor', 'gradientboosting'),
    "--lrf": choice(1.0, 0.1, 0.25, 0.5, 2.0),
    "--iterations": choice(100, 200),
    "--complexity": choice(1.0, 0.25, 0.5, 2.0),
})

# SKLearn estimator that runs the training script.
est = SKLearn(
    source_directory=script_path,
    entry_script=script_file,
    compute_target=compute_target,
)

# HyperDrive configuration built from the estimator and sampler; no
# early-termination policy is set.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=None,
    primary_metric_name="r2_score",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=50,
    max_concurrent_runs=5,
)
# -

# ### Setup experiment and submit run

experiment_name = 'AzureMLCapstoneExperiment_HyperDrive'
Пример #14
0
joblib.dump(value=model, filename='outputs/model.pkl')

run.complete()

######
from azureml.train.estimator import Estimator
from azureml.core import Experiment

# Generic estimator: runs training_script.py locally with scikit-learn
# installed as a conda package.
estimator = Estimator(
    source_directory='experiment_folder',
    entry_script='training_script.py',
    compute_target='local',
    conda_packages=['scikit-learn'],
)

# Submit the estimator to the 'training_experiment' experiment.
experiment = Experiment(workspace=ws, name='training_experiment')
run = experiment.submit(config=estimator)
#######
from azureml.train.sklearn import SKLearn
from azureml.core import Experiment

# Create an SKLearn estimator that runs training_script.py locally.
# (A comma was missing after entry_script, which made this a SyntaxError.)
estimator = SKLearn(source_directory='experiment_folder',
                    entry_script='training_script.py',
                    compute_target='local'
                    )

# Create and run an experiment
experiment = Experiment(workspace = ws, name = 'training_experiment')
run = experiment.submit(config=estimator)
Пример #15
0
    "batch_size": choice(16, 32, 64, 128)
})

# Bandit policy, evaluated every 2 iterations.
# NOTE(review): this policy is built but the config below passes
# policy=None - confirm which one is intended.
policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

# Create ./training unless the working directory already has that entry.
if "training" not in os.listdir():
    os.mkdir("./training")

# workspaceblobstore is the default blob storage
#src.run_config.source_directory_data_store = "workspaceblobstore"

# SKLearn estimator that runs train.py from ./training.
est = SKLearn(
    "./training",
    script_params=None,
    compute_target=compute_target,
    entry_script='train.py',
    conda_packages=['scikit-learn'],
)

# HyperDrive configuration built from the estimator and the sampler.
hyperdrive_config = HyperDriveConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=None,
    primary_metric_name='validation_acc',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=4,
    max_concurrent_runs=4,
)

# In[4]:
Пример #16
0
def main():
    """Connect to the workspace and submit a regularization sweep.

    Parses the command line, optionally sets environment variables from
    the config file, authenticates, registers the MNIST web files as a
    dataset, and submits one SKLearn run per regularization value. The
    runs are not waited on.

    Returns:
        ``-1`` when setting environment variables, authentication, or the
        workspace connection fails; ``None`` otherwise.
    """
    # Each entry is (flag, keyword options for add_argument).
    cli_options = (
        ('--experiment', dict(type=str,
                              default='classification',
                              help='experiment name for workspace')),
        ('--config', dict(type=str,
                          default='.aml_config/config.json',
                          help='config path for variables')),
        ('--compute_name', dict(type=str,
                                default='train-classify')),
        ('--compute_nodes', dict(type=int,
                                 default=4,
                                 help='number of nodes in compute cluster')),
        ('--compute_type', dict(type=str,
                                default='STANDARD_D2_V2',
                                help='type of compute in cluster')),
        ('--compute_priority', dict(type=str,
                                    default='dedicated',
                                    help='compute priority in compute cluster')),
        ('--env', dict(type=str,
                       default='local',
                       help='env argument to get variables')),
    )
    parser = argparse.ArgumentParser(
        description='Script to connect to workspace')
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    # Environment variables are only loaded from the config file when
    # running with --env local.
    if args.env == 'local':
        if not set_env_vars(args.config):
            print('Setting env variables failed')
            return -1
        print('\nEnvironment variables set')

    # Service principal authentication.
    svc_pr = get_svc_pr()
    if not svc_pr:
        print('Service principal auth failed')
        return -1
    print('\nAuthentication succeded')

    # Workspace connection.
    ws = get_ws(svc_pr)
    if not ws:
        print('Workspace connection failed')
        return -1
    print(f'\nFound workspace {ws.name} at location {ws.location}')

    exp = Experiment(workspace=ws, name=args.experiment)

    compute_target = get_compute_target(ws, args.compute_name,
                                        args.compute_type, args.compute_nodes,
                                        args.compute_priority)

    # Register the four MNIST archives as a new dataset version.
    web_paths = [
        'http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
        'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz'
    ]
    dataset = Dataset.File.from_files(path=web_paths)
    dataset = dataset.register(
        workspace=ws,
        name='classification dataset',
        description='training and test dataset',
        create_new_version=True,
    )

    # Path of the folder that holds the training scripts.
    train_folder = os.path.join('..', 'classification', 'training')

    # Docker-enabled environment shared by every run.
    run_env = Environment('classification_env')
    run_env.python.conda_dependencies = CondaDependencies.create(
        pip_packages=[
            'azureml-sdk',
            'scikit-learn',
            'azureml-dataprep[pandas,fuse]>=1.1.14',
        ])
    run_env.docker.enabled = True

    # One run per value: ten evenly spaced points from 0.05 to 0.95.
    for reg in np.linspace(0.05, 0.95, 10):
        train_params = {
            '--data-folder': dataset.as_named_input('data').as_mount(),
            '--regularization': reg,
        }
        est = SKLearn(
            source_directory=train_folder,
            script_params=train_params,
            compute_target=compute_target,
            environment_definition=run_env,
            entry_script='train.py',
        )
        exp.submit(config=est)
print(exp)

cluster_name = args.trcompute
print("Printing computer name")
print(cluster_name)

# Look up the compute target by name.
# NOTE(review): the except branch only prints a message and creates
# nothing, so compute_target stays undefined when the lookup fails and
# the SKLearn call below would raise NameError - confirm intent.
try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target')
except ComputeTargetException:
    print('Creating a new compute target...')

# Command-line arguments forwarded to the training script.
script_params = {
    '--ws': args.ws,
    '--rg': args.rg,
    '--datastore': args.datastore,
    '--dataset': args.dataset,
}

estimator = SKLearn(
    source_directory=project_folder,
    script_params=script_params,
    compute_target=compute_target,
    entry_script='test2.py',
    conda_dependencies_file="env.yml",
)

# Submit and stream the output until the run completes.
print("submitting experiment")
run = exp.submit(estimator)
print("experiment submitted")
run.wait_for_completion(show_output=True)
Пример #18
0
    framework_version = experiment_settings["framework"]["sklearn"][
        "framework_version"]
    enable_optimized_mode = experiment_settings["framework"]["sklearn"][
        "_enable_optimized_mode"]

    estimator = SKLearn(
        source_directory=experiment_settings["source_directory"],
        compute_target=compute_target,
        entry_script=experiment_settings["entry_script"],
        script_params=experiment_settings["script_parameters"],
        use_docker=experiment_settings["docker"]["use_docker"],
        custom_docker_image=experiment_settings["docker"]["custom_image"],
        image_registry_details=container_registry,
        user_managed=experiment_settings["user_managed"],
        conda_packages=experiment_settings["dependencies"]["conda_packages"],
        pip_packages=experiment_settings["dependencies"]["pip_packages"],
        conda_dependencies_file=experiment_settings["dependencies"]
        ["conda_dependencies_file"],
        pip_requirements_file=experiment_settings["dependencies"]
        ["pip_requirements_file"],
        environment_variables=experiment_settings["environment_variables"],
        inputs=experiment_settings["data_references"],
        shm_size=experiment_settings["docker"]["shm_size"],
        max_run_duration_seconds=experiment_settings[
            "max_run_duration_seconds"],
        framework_version=framework_version,
        _enable_optimized_mode=enable_optimized_mode)

else:
    estimator = Estimator(
        source_directory=experiment_settings["source_directory"],
        compute_target=compute_target,
def main():
    """Build, validate and publish the training pipeline.

    The pipeline has a data-preparation step, an SKLearn training step and
    a model-registration step, with an optional evaluation step between
    training and registration. All settings are read from ``Env()``.

    Raises:
        Exception: If the configured dataset is not registered in the
            workspace.
    """
    e = Env()
    aml_workspace = Workspace.get(
        name=e.workspace_name,
        subscription_id=e.subscription_id,
        resource_group=e.resource_group
    )
    print("get_workspace:")
    print(aml_workspace)

    aml_compute = get_compute(
        aml_workspace,
        e.compute_name,
        e.vm_size)
    if aml_compute is not None:
        print("aml_compute:")
        print(aml_compute)

    environment = get_environment(
        aml_workspace, e.aml_env_name, create_new=e.rebuild_env)
    run_config = RunConfiguration()
    run_config.environment = environment

    # Fall back to the workspace default datastore when none is configured.
    datastore_name = (e.datastore_name
                      or aml_workspace.get_default_datastore().name)
    run_config.environment.environment_variables["DATASTORE_NAME"] \
        = datastore_name

    dataset_name = e.dataset_name
    file_name = e.file_name
    datastore = Datastore.get(aml_workspace, datastore_name)

    if dataset_name not in aml_workspace.datasets:
        raise Exception("Could not find dataset at \"%s\"." % dataset_name)

    # Download the dataset into the working directory, then upload the
    # configured file to the datastore under the dataset's name.
    dataset = Dataset.get_by_name(aml_workspace, name=dataset_name)
    dataset.download(target_path='.', overwrite=True)
    datastore.upload_files([file_name],
                           target_path=dataset_name,
                           overwrite=True)

    raw_data_file = DataReference(datastore=datastore,
                                  data_reference_name="Raw_Data_File",
                                  path_on_datastore=dataset_name + '/'
                                  + file_name)

    clean_data_file = PipelineParameter(name="clean_data_file",
                                        default_value="/clean_data.csv")
    clean_data_folder = PipelineData("clean_data_folder",
                                     datastore=datastore)

    prepDataStep = PythonScriptStep(name="Prepare Data",
                                    source_directory=e.sources_directory_train,
                                    script_name=e.data_prep_script_path,
                                    arguments=["--raw_data_file",
                                               raw_data_file,
                                               "--clean_data_folder",
                                               clean_data_folder,
                                               "--clean_data_file",
                                               clean_data_file],
                                    inputs=[raw_data_file],
                                    outputs=[clean_data_folder],
                                    compute_target=aml_compute,
                                    allow_reuse=False)

    print("Step Prepare Data created")

    # NOTE(review): the parameter name below ends with a space. It looks
    # like a typo, but it is part of the published pipeline's interface,
    # so it is left unchanged - confirm before renaming.
    new_model_file = PipelineParameter(name="new_model_file ",
                                       default_value='/' + e.model_name
                                       + '.pkl')
    new_model_folder = PipelineData("new_model_folder", datastore=datastore)
    est = SKLearn(source_directory=e.sources_directory_train,
                  entry_script=e.train_script_path,
                  pip_packages=['azureml-sdk', 'scikit-learn==0.20.3',
                                'azureml-dataprep[pandas,fuse]>=1.1.14'],
                  compute_target=aml_compute)

    # The training step receives the parameters' default values, not the
    # PipelineParameter objects themselves.
    trainingStep = EstimatorStep(
        name="Model Training",
        estimator=est,
        estimator_entry_script_arguments=["--clean_data_folder",
                                          clean_data_folder,
                                          "--new_model_folder",
                                          new_model_folder,
                                          "--clean_data_file",
                                          clean_data_file.default_value,
                                          "--new_model_file",
                                          new_model_file.default_value],
        runconfig_pipeline_params=None,
        inputs=[clean_data_folder],
        outputs=[new_model_folder],
        compute_target=aml_compute,
        allow_reuse=False)

    print("Step Train created")

    model_name_param = PipelineParameter(name="model_name",
                                         default_value=e.model_name)

    evaluateStep = PythonScriptStep(
        name="Evaluate Model",
        source_directory=e.sources_directory_train,
        script_name=e.evaluate_script_path,
        arguments=["--model_name", model_name_param],
        compute_target=aml_compute,
        allow_reuse=False)

    print("Step Evaluate created")

    registerStep = PythonScriptStep(
        name="Register Model",
        source_directory=e.sources_directory_train,
        script_name=e.register_script_path,
        arguments=["--new_model_folder", new_model_folder,
                   "--new_model_file", new_model_file,
                   "--model_name", model_name_param],
        inputs=[new_model_folder],
        compute_target=aml_compute,
        allow_reuse=False)

    print("Step Register created")

    # Training always follows data preparation; evaluation is optional.
    trainingStep.run_after(prepDataStep)
    if e.run_evaluation.lower() == 'true':
        print("Include evaluation step before register step.")
        evaluateStep.run_after(trainingStep)
        registerStep.run_after(evaluateStep)
    else:
        print("Exclude evaluation step and directly run register step.")
        registerStep.run_after(trainingStep)

    # Only the final step is listed; the earlier steps are attached to it
    # through the run_after calls above.
    pipeline = Pipeline(workspace=aml_workspace, steps=[registerStep])
    pipeline.validate()
    print("Pipeline is built")

    # The former bare `pipeline._set_experiment_name` statement was
    # removed: it referenced the method without calling it and had no
    # effect.
    published_pipeline = pipeline.publish(
        name=e.pipeline_name,
        description="Predict Employee Retention Model training pipeline",
        version=e.build_id
    )
    print(f'Published pipeline: {published_pipeline.name}')
    print(f'for build {published_pipeline.version}')
Пример #20
0
# Generic estimator: runs training_script.py locally with scikit-learn
# installed as a conda package.
estimator = Estimator(
    source_directory='experiment_folder',
    entry_script='training_script.py',
    compute_target='local',
    conda_packages=['scikit-learn'],
)

# Submit the estimator to the 'training_experiment' experiment.
experiment = Experiment(workspace=ws, name='training_experiment')
run = experiment.submit(config=estimator)


from azureml.train.sklearn import SKLearn
from azureml.core import Experiment

# Create a scikit-learn estimator that runs training_script.py from the
# experiment folder on local compute.
# (Fix: the original omitted the comma after the entry_script argument,
# which made the whole call a SyntaxError.)
estimator = SKLearn(source_directory='experiment_folder',
                    entry_script='training_script.py',
                    compute_target='local'
                    )

# Create the experiment in the workspace and submit the estimator to it.
experiment = Experiment(workspace = ws, name = 'training_experiment')
run = experiment.submit(config=estimator)


from azureml.core import Run
import argparse
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
Пример #21
0
    # There's only one parameter, so grid sampling will try each value - with multiple parameters it would try every combination
    '--regularization':
    choice(0.001, 0.005, 0.01, 0.05, 0.1, 1.0)
})

# Early-termination policy for the sweep: a bandit policy evaluated every
# 2 intervals with a slack factor of 0.1.
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2)

# Look up the registered training dataset in the workspace.
diabetes_ds = ws.datasets.get("diabetes_dataset")

# Estimator for each hyperdrive child run; it targets the gpu_cluster
# compute and receives the dataset as a named input ('diabetes').
hyper_estimator = SKLearn(
    source_directory=experiment_folder,
    entry_script='diabetes_training.py',
    compute_target=gpu_cluster,
    inputs=[diabetes_ds.as_named_input('diabetes')],
    conda_packages=['pandas', 'ipykernel', 'matplotlib'],
    pip_packages=['azureml-sdk', 'argparse', 'pyarrow'],
)

# Hyperdrive configuration: maximize the 'AUC' metric over the sampled
# parameters, with at most 6 runs in total and 4 running concurrently.
hyperdrive = HyperDriveConfig(
    estimator=hyper_estimator,
    hyperparameter_sampling=params,
    policy=policy,
    primary_metric_name='AUC',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=6,
    max_concurrent_runs=4,
)

# Submit the hyperdrive configuration to the experiment.
run = experiment.submit(config=hyperdrive)
    datastore_name='telemetry',
    container_name=args.storage_container,
    account_name=args.storage_account,
    account_key=args.storage_key,
)

# Reference to the input data: a path (taken from the command-line args)
# on the telemetry datastore, exposed under the name "input_data".
input_data = DataReference(datastore=telemetry_ds,
                           data_reference_name="input_data",
                           path_on_datastore=args.storage_path)

# Estimator that runs dataprep.py from the 010-preprocessing directory on
# the CPU cluster, requesting pandas (conda) and fastavro (pip).
preprocessing_est = SKLearn(source_directory='010-preprocessing',
                            entry_script='dataprep.py',
                            compute_target=cpu_cluster,
                            conda_packages=['pandas'],
                            pip_packages=['fastavro'])

# Pipeline data object named "output", stored on the telemetry datastore.
output = PipelineData("output", datastore=telemetry_ds)
preprocessing_step = EstimatorStep(
    name="Preprocessing_Train",
    estimator=preprocessing_est,
    estimator_entry_script_arguments=[
        "--data_dir", input_data, "--output_data_dir", output
    ],
    inputs=[input_data],
    outputs=[output],
    compute_target=cpu_cluster,
    allow_reuse=True,
Пример #23
0
# Hyperparameter search space. Grid sampling tries every listed value; with
# a single parameter that is one run per value (with several parameters it
# would be every combination).
params = GridParameterSampling({
    '--regularization': choice(0.001, 0.005, 0.01, 0.05, 0.1, 1.0),
})


# Look up the registered training dataset in the workspace.
diabetes_ds = ws.datasets.get("diabetes dataset")

# Estimator for each child run, targeting the training cluster.
# The dataset is passed as a named input, which needs azureml-dataprep;
# that package ships with azureml-sdk, hence the pip requirement.
hyper_estimator = SKLearn(
    source_directory=experiment_folder,
    entry_script='diabetes_training.py',
    compute_target=training_cluster,
    inputs=[diabetes_ds.as_named_input('diabetes')],
    pip_packages=['azureml-sdk'],
)


# Hyperdrive configuration: no early-termination policy, maximize 'AUC',
# at most 6 runs in total with up to 4 running concurrently.
hyperdrive = HyperDriveConfig(
    estimator=hyper_estimator,
    hyperparameter_sampling=params,
    policy=None,
    primary_metric_name='AUC',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=6,
    max_concurrent_runs=4,
)

# Experiment in the workspace that the hyperdrive run is submitted to.
experiment = Experiment(name='diabates_training_hyperdrive', workspace=ws)