Example no. 1
from azureml.core import Experiment
from azureml.train.dnn import PyTorch


# Assumes ws (an authenticated Workspace) and setup_azure_gpu() are defined elsewhere.
def run_azure_pytorch():

    compute_target = setup_azure_gpu()

    experiment_name = 'pytorch'

    exp = Experiment(workspace=ws, name=experiment_name)

    ds = ws.get_default_datastore()

    print(ds.datastore_type, ds.account_name, ds.container_name)

    # ds.upload(src_dir='./data', target_path='mnist', overwrite=True, show_progress=True)

    script_params = {'--data_dir': ds}

    pt_est = PyTorch(source_directory='./train-scripts',
                     script_params=script_params,
                     compute_target=compute_target,
                     entry_script='train-pytorch.py',
                     use_gpu=True)

    run = exp.submit(pt_est)
    run.wait_for_completion(show_output=True)
    print(run.get_metrics())
    print(run.get_file_names())

    # register model
    model = run.register_model(model_name=experiment_name,
                               model_path='outputs/pytorch_model.pt')
    print(model.name, model.id, model.version, sep='\t')
    compute_target.delete()
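ws and setup_azure_gpu() come from elsewhere in the original file. A minimal sketch of the compute helper, assuming an AmlCompute GPU cluster, might look like this:

from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

def setup_azure_gpu(cluster_name='gpu-cluster'):
    # Reuse the cluster if it already exists, otherwise provision a small GPU cluster.
    try:
        return ComputeTarget(workspace=ws, name=cluster_name)
    except ComputeTargetException:
        config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', max_nodes=1)
        target = ComputeTarget.create(ws, cluster_name, config)
        target.wait_for_completion(show_output=True)
        return target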
Example no. 2
def GetEstimator(environmentInfo, inpData):
    ''' Defines the estimator for an AML experiment.
        Method signature defined by AP.Data.

    Args:
        environmentInfo:
            workspace: The workspace with the correct svc that the run will be submitted to. This gives
                access to the default datastore, key vault and container registry.
            datastore: Datastore where the data is located
            compute: Compute cluster the run should target
        inpData:
            dataDir: dataset directory
            dataset_name: name of the dataset
            training_config: path of the training configuration file
            toolsDir: directory with auxiliary tools
            scriptConfig: path of additional configuration for this method
            sourceDir: directory with the experiment code
    '''

    conda_packages = None
    pip_packages = None

    # authenticated workspace. You can get access to the default key vault from here
    workspace = environmentInfo.workspace

    # https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.keyvault(class)?view=azure-ml-py
    keyvault = workspace.get_default_keyvault()

    ds = environmentInfo.datastore

    script_params = {
        "--data_dir": inpData.dataDir,
        "--training_config":
        json.dumps(read_from_json(inpData.training_config))
    }

    print(f'using script_params {script_params}')
    estimatorConfig = read_from_json(inpData.scriptConfig)

    conda_packages = estimatorConfig["conda_packages"]
    pip_packages = estimatorConfig["pip_packages"]
    print(
        f'got {conda_packages} and {pip_packages} from config for conda and pip packages'
    )

    # Defaults if the JSON is not passed in or the values are not set.
    if conda_packages is None:
        conda_packages = ["numpy", "pillow"]
    if pip_packages is None:
        pip_packages = [
            "facenet-pytorch", "torch===1.4.0", "torchvision===0.5.0"
        ]

    return PyTorch(source_directory=inpData.sourceDir,
                   script_params=script_params,
                   compute_target=environmentInfo.compute,
                   entry_script='train_model_pytorch.py',
                   use_gpu=True,
                   source_directory_data_store=environmentInfo.datastore,
                   conda_packages=conda_packages,
                   pip_packages=pip_packages)
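read_from_json is not defined in this snippet; a minimal sketch of the helper assumed above:

import json

def read_from_json(path):
    # Load a JSON configuration file into a dict (helper assumed by GetEstimator).
    with open(path) as f:
        return json.load(f)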
Example no. 3
def train_step(train_dir, valid_dir, compute_target):
    '''
    This step will fine-tune a RESNET-18 model on our dataset using PyTorch. 
    It will use the corresponding input image directories as training and validation data.

    :param train_dir: The reference to the directory containing the training data
    :type train_dir: DataReference
    :param valid_dir: The reference to the directory containing the validation data
    :type valid_dir: DataReference
    :param compute_target: The compute target to run the step on
    :type compute_target: ComputeTarget
    
    :return: The training step, step outputs dictionary (keys: model_dir)
    :rtype: EstimatorStep, dict
    '''

    num_epochs = PipelineParameter(name='num_epochs', default_value=25)
    batch_size = PipelineParameter(name='batch_size', default_value=16)
    learning_rate = PipelineParameter(name='learning_rate', default_value=0.001)
    momentum = PipelineParameter(name='momentum', default_value=0.9)

    model_dir = PipelineData(
        name='model_dir', 
        pipeline_output_name='model_dir',
        datastore=train_dir.datastore,
        output_mode='mount',
        is_directory=True)

    outputs = [model_dir]
    outputs_map = { 'model_dir': model_dir }

    estimator = PyTorch(
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        entry_script='train.py',
        framework_version='1.3',
        compute_target=compute_target,
        use_gpu=True)

    step = EstimatorStep(
        name="Train Model",
        estimator=estimator,
        estimator_entry_script_arguments=[
            '--train_dir', train_dir, 
            '--valid_dir', valid_dir, 
            '--output_dir', model_dir, 
            '--num_epochs', num_epochs, 
            '--batch_size', batch_size,
            '--learning_rate', learning_rate, 
            '--momentum', momentum
        ],
        inputs=[train_dir, valid_dir],
        compute_target=compute_target,
        outputs=outputs,
        allow_reuse=False)

    return step, outputs_map
Example no. 4
def main(args, ws):
    compute = ws.compute_targets[args.cluster]
    print(compute.get_status().serialize())

    experiment = Experiment(ws, name='pytorch-distributed-horovod')
    estimator = PyTorch(source_directory=os.path.dirname(os.path.abspath(__file__)),
                        compute_target=compute,
                        entry_script='train.py',
                        node_count=args.nodes,
                        distributed_training=Mpi(),
                        use_gpu=True)
    run = experiment.submit(estimator)
    print(run)
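main() expects an args object and an authenticated workspace; an illustrative entry point (argument names inferred from the usage above):

if __name__ == '__main__':
    import argparse
    from azureml.core import Workspace

    parser = argparse.ArgumentParser()
    parser.add_argument('--cluster', required=True, help='name of an existing AmlCompute cluster')
    parser.add_argument('--nodes', type=int, default=2, help='number of nodes for the Horovod/MPI run')
    main(parser.parse_args(), Workspace.from_config())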
Example no. 5
def pytorch_version_from_conda_dependencies(conda_dependencies: CondaDependencies) -> Optional[str]:
    """
    Given a CondaDependencies object, look for a spec of the form "pytorch=...", and return
    whichever supported version is compatible with the value, or None if there isn't one.
    """
    supported_versions = PyTorch.get_supported_versions()
    for spec in conda_dependencies.conda_packages:
        components = spec.split("=")
        if len(components) == 2 and components[0] == "pytorch":
            version = components[1]
            for supported in supported_versions:
                if version.startswith(supported) or supported.startswith(version):
                    return supported
    return None
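Example usage (the package spec and the printed value are illustrative):

from azureml.core.conda_dependencies import CondaDependencies

deps = CondaDependencies.create(conda_packages=["pytorch=1.6.0", "torchvision"])
# Prints a supported version string such as "1.6" if one matches, otherwise None.
print(pytorch_version_from_conda_dependencies(deps))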
Example no. 6
def evaluate_step(model_dir, test_dir, compute_target):
    '''
    This step evaluates the trained model on the testing data and outputs the accuracy.

    :param model_dir: The reference to the directory containing the trained model
    :type model_dir: DataReference
    :param test_dir: The reference to the directory containing the testing data
    :type test_dir: DataReference
    :param compute_target: The compute target to run the step on
    :type compute_target: ComputeTarget
    
    :return: The evaluate step, step outputs dictionary (keys: accuracy_file)
    :rtype: EstimatorStep, dict
    '''

    accuracy_file = PipelineData(
        name='accuracy_file', 
        pipeline_output_name='accuracy_file',
        datastore=test_dir.datastore,
        output_mode='mount',
        is_directory=False)

    outputs = [accuracy_file]
    outputs_map = { 'accuracy_file': accuracy_file }
    
    estimator = PyTorch(
        source_directory=os.path.dirname(os.path.abspath(__file__)),
        entry_script='evaluate.py',
        framework_version='1.3',
        compute_target=compute_target,
        use_gpu=True)

    step = EstimatorStep(
        name="Evaluate Model",
        estimator=estimator,
        estimator_entry_script_arguments=[
            '--test_dir', test_dir, 
            '--model_dir', model_dir, 
            '--accuracy_file', accuracy_file
        ],
        inputs=[model_dir, test_dir],
        outputs=outputs,
        compute_target=compute_target,
        allow_reuse=True)

    return step, outputs_map
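A sketch of how the two steps (train_step from Example no. 3 and evaluate_step above) might be assembled into a pipeline; the workspace, data references and experiment name are assumptions:

from azureml.core import Experiment, Workspace
from azureml.pipeline.core import Pipeline

ws = Workspace.from_config()
train, train_outputs = train_step(train_dir, valid_dir, compute_target)
evaluate, evaluate_outputs = evaluate_step(train_outputs['model_dir'], test_dir, compute_target)

pipeline = Pipeline(workspace=ws, steps=[train, evaluate])
run = Experiment(ws, 'resnet18-finetune').submit(pipeline)
run.wait_for_completion(show_output=True)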
Example no. 7
# %%

project_folder = './pytorch-mnist'
os.makedirs(project_folder, exist_ok=True)

# %%
import shutil

shutil.copy('mnist.py', project_folder)

# %%
from azureml.train.dnn import PyTorch

estimator = PyTorch(source_directory=project_folder,
                    script_params={'--output-dir': './outputs'},
                    compute_target=compute_target,
                    entry_script='mnist.py',
                    use_gpu=False)

estimator.conda_dependencies.remove_conda_package('pytorch=0.4.0')
estimator.conda_dependencies.add_conda_package('pytorch-nightly')
estimator.conda_dependencies.add_channel('pytorch')

# %%
run = exp.submit(estimator)
run.wait_for_completion(show_output=True)

# %%
run.get_file_names()
model_path = os.path.join('outputs', 'mnist.onnx')
run.download_file(model_path, output_file_path=model_path)
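A quick sanity check of the downloaded model, assuming the onnx package is installed locally:

# %%
import onnx

onnx_model = onnx.load(model_path)
onnx.checker.check_model(onnx_model)
print('ONNX model loaded and checked')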
Example no. 8
    def my_azure_app(cfg: DictConfig) -> None:
        print(cfg.pretty())
        args_dict = OmegaConf.to_container(cfg, resolve=False)

        yaml_file_nm = args_dict["yaml_file"].split("/")[-1].split(".")[0]
        conf_file = os.path.join(
            args_dict["root_path"],
            yaml_file_nm + "_" + str(datetime.datetime.now()) + ".json",
        )
        print(conf_file)

        with open(conf_file, "w") as out:
            out.write(json.dumps(args_dict))

        # First, list the supported VM families for Azure Machine Learning Compute
        # ws = Workspace.get('experiments')
        cluster_name = "gpucluster"
        experiment_name = args_dict["experiment_name"] + "_azure"
        disable_gpu = args_dict["disable_gpu"]
        script_folder = "."  # todo. this is overriden by hydra
        script_folder = (hydra.utils.get_original_cwd()
                         )  # todo. this is overriden by hydra
        data_path = os.path.join(args_dict["root_path"],
                                 args_dict["data_subdir"])

        sub_id = os.getenv("AZ_SUBS_ID")

        assert sub_id is not None
        # Edit a run configuration property on the fly.
        run_local = RunConfiguration()
        run_local.environment.python.user_managed_dependencies = True

        ws = Workspace.get(
            name="experiments",
            subscription_id=sub_id,
            resource_group="default_resource_group",
        )

        # print(AmlCompute.supported_vmsizes(workspace=ws))

        # Create a new runconfig object
        _ = RunConfiguration()

        # Signal that you want to use AmlCompute to execute the script
        # run_temp_compute.target = "amlcompute"

        # AmlCompute is created in the same region as your workspace
        # Set the VM size for AmlCompute from the list of supported_vmsizes

        try:
            compute_target = ComputeTarget(workspace=ws, name=cluster_name)
            print("Found existing compute target")
        except ComputeTargetException:
            print("Creating a new compute target...")
            compute_config = AmlCompute.provisioning_configuration(
                vm_size=args_dict["vm_size"], max_nodes=1)

            compute_target = ComputeTarget.create(ws, cluster_name,
                                                  compute_config)
            compute_target.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=10)

        s = ws.get_default_datastore()

        # A reference to the root_path in Azure after uploading
        _ = s.upload(
            src_dir=data_path,
            target_path=data_path,
            overwrite=False,
            show_progress=True,
        )

        # Everything in the path except the file name
        # script_target_path = "/".join(args_dict['yaml_file'].split("/")[:-1])
        script_target_path = "/".join(conf_file.split("/")[:-1])
        print(script_target_path)
        # script_fname = args.config_file.split("/")[-1]
        script_fname = conf_file.split("/")[-1]
        print(script_fname)
        print("---" * 100)

        azure_script_path = s.upload_files(
            files=[conf_file],
            target_path=script_target_path,
            overwrite=True,
            show_progress=True,
        )

        print(azure_script_path)

        azure_script_abs_path = DataReference(datastore=s,
                                              data_reference_name="input_data",
                                              path_on_datastore=conf_file)

        azure_root_path = DataReference(
            datastore=s,
            data_reference_name="root_data",
            path_on_datastore=args_dict["root_path"],
        )

        exp = Experiment(workspace=ws, name=experiment_name)

        # src = ScriptRunConfig(source_directory = script_folder,
        # script = 'run.py', arguments=['--config_file', 'local/pairs.json'],
        # run_config = run_temp_compute)

        # Using pytorch estimator - proper way to submit pytorch jobs
        script_params = {
            "--config_file": azure_script_abs_path,
            "--root_path": azure_root_path,
            "--experiment_name": experiment_name,
        }

        print("GPU Disabled: {}".format(disable_gpu))

        estimator = PyTorch(
            source_directory=script_folder,
            script_params=script_params,
            compute_target=compute_target,
            entry_script="run.py",
            use_gpu=not disable_gpu,
            pip_packages=["pillow==5.4.1"],
        )

        # you can name this as run
        _ = exp.submit(estimator)
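my_azure_app takes a Hydra DictConfig, so it is presumably invoked through a @hydra.main entry point; a rough sketch (decorator arguments depend on the Hydra version and config layout):

import hydra
from omegaconf import DictConfig

@hydra.main(config_path='.', config_name='config')
def entry_point(cfg: DictConfig) -> None:
    # Delegate to the submission logic shown above.
    my_azure_app(cfg)

if __name__ == '__main__':
    entry_point()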
Example no. 9
import azureml.core
from azureml.core import Workspace
from azureml.core.run import Run
from azureml.core.experiment import Experiment
from azureml.train.dnn import PyTorch

subscription_id = "" # The ID of the Azure Subscription
resource_group = "AdvanceAnalytics.Aml.Experiments" # Name of a logical resource group
workspace_name = "aa-ml-aml-workspace" # The name of the workspace to look for or to create
workspace_region = 'eastus' # Location of the workspace
computetarget_vm= 'Standard_NC6' # Size of the VM to use
experiment_name = 'azureml-gpubenchmark'
train_script = 'train_and_track.py'

ws = Workspace.create(
    name = workspace_name,
    subscription_id = subscription_id,
    resource_group = resource_group, 
    location = workspace_region,
    exist_ok = True)

src = PyTorch(source_directory=r'.\fastai',
              compute_target='amlcompute',
              vm_size=computetarget_vm,
              entry_script=train_script,
              use_gpu=True,
              pip_packages=['fastai', 'azureml-sdk'])
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(src)

run.wait_for_completion(show_output = True)




Example no. 10
    def _start_estimator_training(self, training_name: str, estimator_type: str = None, input_datasets: np.array = None, input_datasets_to_download: np.array = None, compute_target:str='local', gpu_compute: bool = False, script_parameters: dict = None, show_widget: bool = True, **kwargs):
        ''' 
        Will start a new training using an Estimator, taking the training name as the folder of the run
        Args:
            training_name (str): The name of a training.  This will be used as the source directory of the run.  Can contain subdirectories
            estimator_type (str): one of these values (tensorflow, sklearn, pytorch).  None uses the generic Estimator
            input_datasets (np.array): An array of data set names that will be mounted on the compute in a directory of the dataset name
            input_datasets_to_download (np.array): An array of data set names that will be downloaded to the compute in a directory of the dataset name
            compute_target (str): The compute target (default = 'local') on which the training should be executed
            gpu_compute (bool): Indicates if GPU compute is required for this script or not
            script_parameters (dict): A dictionary of key/value parameters that will be passed as arguments to the training script
            show_widget (bool): Will display the live tracking of the submitted Run
        '''
        from azureml.train.estimator import Estimator

        # Check if directory exists
        if not(os.path.exists(training_name) and os.path.isdir(training_name)):
            raise FileNotFoundError(training_name)

        # Check compute target
        if compute_target != 'local':
            self.__check_compute_target(compute_target, gpu_compute)
            

        # Add datasets
        datasets = list()
        if(input_datasets is not None):
            for ds in input_datasets:
                datasets.append(self.__workspace.datasets[ds].as_named_input(ds).as_mount(path_on_compute=ds))
        if(input_datasets_to_download is not None):
            for ds in input_datasets_to_download:
                datasets.append(self.__workspace.datasets[ds].as_named_input(ds).as_download(path_on_compute=ds))

        # as mount - as download
        constructor_parameters = {
            'source_directory':training_name,
            'script_params':script_parameters,
            'inputs':datasets,
            'compute_target':compute_target,
            'entry_script':'train.py',
            'pip_requirements_file':'requirements.txt', 
            'use_gpu':gpu_compute,
            'use_docker':True}
        
        print('Creating estimator of type', estimator_type)

        if(estimator_type is None):
            # Using default Estimator
            estimator = Estimator(**constructor_parameters)
        elif(estimator_type == 'tensorflow'):
            from azureml.train.dnn import TensorFlow
            version_par = 'framework_version'
            if(not version_par in constructor_parameters.keys()):
                print('Defaulting to version 2.0 for TensorFlow')
                constructor_parameters[version_par] = '2.0'
            estimator = TensorFlow(**constructor_parameters)
        elif(estimator_type == 'sklearn'):
            from azureml.train.sklearn import SKLearn
            estimator = SKLearn(**constructor_parameters)
        elif(estimator_type == 'pytorch'):
            from azureml.train.dnn import PyTorch
            estimator = PyTorch(**constructor_parameters)
        else:
            raise ValueError('Unsupported estimator_type: {}'.format(estimator_type))

        # Submit training
        self.__current_run = self.__experiment.submit(estimator)
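An illustrative call; the trainer instance and every argument value below are assumptions, shown only to make the parameter shapes concrete:

# 'trainer' stands for an instance of whatever class defines _start_estimator_training.
trainer._start_estimator_training('trainings/image-classifier',
                                  estimator_type='pytorch',
                                  input_datasets=['training-images'],
                                  compute_target='gpu-cluster',
                                  gpu_compute=True,
                                  script_parameters={'--epochs': 10, '--batch_size': 32})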
Example no. 11
    arguments=["--model_name_or_path", model_name_param, "--max_seq_length", max_seq_len_param],
    outputs=[prepared_dataset],
    source_directory=prep_project_folder,
    compute_target=compute_target,
    runconfig=run_config,
    allow_reuse=True,
)

estimator = PyTorch(
    source_directory=train_project_folder,
    compute_target=compute_target,
    entry_script=train_script_name,
    use_gpu=True,
    pip_packages=[
        "azureml-sdk",
        "nlp==0.2.0",
        "pytorch-lightning==0.8.0rc4",
        "transformers==2.11.0",
        "pandas",
        "scipy",
        "scikit-learn",
    ],
    framework_version="1.5",
)

train_step = EstimatorStep(
    name="Training Step",
    estimator=estimator,
    estimator_entry_script_arguments=[
        "--model_name_or_path",
        model_name_param,
        "--task",
Example no. 12
if args.reset:
    script_params['--reset'] = ''

if args.sink:
    script_params['--sink'] = ''

shared_memory_size = '8g'
if args.shared_memory_size:
    shared_memory_size = args.shared_memory_size

cluster = ComputeTarget(workspace=ws, name=args.cluster_name)
run_config.target = cluster

project_dir = './pytorch'
experiment_name = 'gc_' + name

experiment = Experiment(ws, name=experiment_name)

src = PyTorch(source_directory=project_dir,
              script_params=script_params,
              compute_target=cluster,
              entry_script='main.py',
              use_gpu=True,
              shm_size=shared_memory_size,
              pip_packages=['numpy==1.17.0', 'Pillow==6.1.0', 'scipy==1.3.0'])

run = experiment.submit(src)
if args.show_output:
    run.wait_for_completion(args.show_output)
Example no. 13
    "num_epochs":
    choice(1, 2),
    "batch_size":
    choice(10, 20, 50, 100, 200, 300, 500, 1000),
    "hidden_size":
    choice(300, 400)
})

# Define Run Configuration
estimator = PyTorch(
    entry_script='train.py',
    source_directory=os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  '../', 'modeling'),
    compute_target=compute_target_hyper,
    distributed_training=MpiConfiguration(),
    framework_version='1.4',
    use_gpu=True,
    pip_packages=[
        'numpy==1.15.4', 'pandas==0.23.4', 'scikit-learn==0.20.1',
        'scipy==1.0.0', 'matplotlib==3.0.2', 'utils==0.9.0',
        'onnxruntime==1.2.0', 'onnx==1.6.0'
    ])

# Define the pipeline step
hypertuning = HyperDriveStep(
    name='hypertrain',
    hyperdrive_config=HyperDriveConfig(
        estimator=estimator,
        hyperparameter_sampling=param_sampling,
        policy=None,
        primary_metric_name="accuracy",
Example no. 14
## Training Step ##
# train.py does the training based on the processed data #
# Output location for the produced model
model = PipelineData(name="model", datastore=ds, output_path_on_compute="model")

# Estimator script params
estimator_script_params = [
    "--data-folder", training_data_location,
    "--output-folder", model
]

# Create the PyTorch Estimator
trainEstimator = PyTorch(
    source_directory = script_folder,
    compute_target = cluster,
    entry_script = "steps/train.py", 
    use_gpu = True,
    framework_version='1.3'
)

# Create a pipeline step with the PyTorch Estimator
trainOnGpuStep = EstimatorStep(
    name='Train Estimator Step',
    estimator=trainEstimator,
    inputs=[training_data_location],
    outputs=[model],
    compute_target=cluster,
    estimator_entry_script_arguments = estimator_script_params
) 

## Register Model Step ##
Example no. 15
        "--exp_name": workdir.split('/')[-1],
    }

    def make_container_registry(address, username, password):
        cr = ContainerRegistry()
        cr.address = address
        cr.username = username
        cr.password = password
        return cr


    estimator = PyTorch(source_directory='./',
                        script_params=script_params,
                        compute_target=ct,
                        use_gpu=True,
                        shm_size='256G',
                        # image_registry_details= my_registry,
                        entry_script=entry_script,
                        custom_docker_image=custom_docker_image,
                        user_managed=True,
                        )


    if myargs.itp > 0:
        cmk8sconfig = K8sComputeConfiguration()

        cmk8s = dict()
        cmk8s['gpu_count'] = myargs.card

        cmk8sconfig.configuration = cmk8s
        estimator.run_config.cmk8scompute = cmk8sconfig
Example no. 16
    # define script parameters
    script_params_3 = {
        '--models': models,
        '--data_folder_train': dataset_train.as_named_input('train').as_mount(),
        '--data_folder_test': dataset_test.as_named_input('test').as_mount(),
        '--local': 'no'
    }

    estimator = PyTorch(
        entry_script='train.py',
        script_params=script_params_3,
        source_directory=os.path.dirname(os.path.realpath(__file__)),
        compute_target=workspace.compute_targets["alwaysoncluster"],
        distributed_training=MpiConfiguration(),
        framework_version='1.4',
        use_gpu=True,
        pip_packages=[
            'numpy==1.15.4', 'pandas==0.23.4', 'scikit-learn==0.20.1',
            'scipy==1.0.0', 'matplotlib==3.0.2', 'utils==0.9.0',
            'onnxruntime==1.2.0', 'onnx==1.6.0'
        ])

    experiment = Experiment(workspace=workspace, name="deeplearning")
    run = experiment.submit(estimator)

    if hyperdrive is True:
        # Define multi-run configuration
        hyperdrive_run_config = HyperDriveConfig(
            estimator=estimator,
            hyperparameter_sampling=param_sampling,
Example no. 17
    compute_target, compute_target_created = get_compute_target(
        workspace, "lowpriority")
    dataset = Dataset.get_by_name(workspace=workspace, name=args.dataset_name)
    data_directory = dataset.as_mount()
    experiment = Experiment(workspace, name=args.experiment_name)
    script_params = {
        "--action": "final_layer",
        "--epochs": args.epochs,
        "--learning-rate": args.learning_rate,
        "--gamma": args.gamma,
        "--momentum": args.momentum,
        "--step-size": args.step_size,
        "--environment": "azure",
        "--model-dir": "./outputs",
        "--data-dir": data_directory,
    }
    estimator = PyTorch(
        source_directory="hymenoptera",
        script_params=script_params,
        compute_target=compute_target,
        entry_script="train.py",
        use_gpu=True,
        pip_packages=["azureml-dataprep[pandas,fuse]", "azureml-mlflow"],
    )
    run = experiment.submit(estimator)
    run.wait_for_completion(show_output=True)

    if compute_target_created:
        print("Deleting compute target")
        compute_target.delete()
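get_compute_target is not shown here; from its use it appears to return the target plus a flag indicating whether the cluster had to be created, so the caller knows to delete it afterwards. A rough sketch under that assumption (VM size, priority and node count are illustrative):

from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

def get_compute_target(workspace, name, vm_size='STANDARD_NC6'):
    # Returns (compute_target, created); created is True only when a new cluster was provisioned.
    try:
        return ComputeTarget(workspace=workspace, name=name), False
    except ComputeTargetException:
        config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                       vm_priority='lowpriority',
                                                       max_nodes=1)
        target = ComputeTarget.create(workspace, name, config)
        target.wait_for_completion(show_output=True)
        return target, True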
Example no. 18
ws = Workspace.from_config()
print("Workspace details:")
print(ws.name, ws.location, ws.resource_group, sep='\t')
script_folder = os.getcwd()

# Create PyTorch experiment
compute_name = "gpu-nc6-1"

if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print('Found compute target: ' + compute_name)
    else:
        print('Compute target exists but is not an AmlCompute cluster')
else:
    raise RuntimeError('Compute target ' + compute_name + ' was not found in the workspace')

# Create experiment
experiment_name = 'my_experiment'
exp = Experiment(workspace=ws, name=experiment_name)

script_params = {}

pt_est = PyTorch(source_directory=script_folder,
                 script_params=script_params,
                 compute_target=compute_target,
                 entry_script='pytorch_net.py',
                 use_gpu=True)

# Submit PyTorch experiment
run = exp.submit(pt_est)
run.wait_for_completion(show_output=True)
Example no. 19
experiment_name = "pytorch-mnist"
exp = Experiment(workspace=ws, name=experiment_name)

compute = get_or_create_compute(ws)

prepared_data = MNISTPrepareData(root="./data")

ds = ws.get_default_datastore()
# ds.upload(src_dir=prepared_data.processed_folder, target_path='mnist_pytorch', overwrite=True, show_progress=False)

script_params = {'--data-folder': ds.as_mount()}

script_folder = './scripts'

est = PyTorch(source_directory=script_folder,
              script_params=script_params,
              compute_target=compute,
              entry_script='train.py')

run = exp.submit(config=est)
status = run.get_status()

while status != "Completed":
    if status in ["Failed", "Canceled"]:
        print('Run failed or cancelled')
        break
    else:
        print('Still running, Sleeping for a min and checking again...')
        time.sleep(60)
        status = run.get_status()

print(run.get_metrics())
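Once the run reaches Completed, the trained model could be registered in the workspace; the model name and path below are assumptions:

if run.get_status() == "Completed":
    model = run.register_model(model_name='pytorch-mnist',
                               model_path='outputs/model.pt')
    print(model.name, model.version, sep='\t')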
Example no. 20
def main(req: func.HttpRequest) -> (func.HttpResponse):
    logging.info('Python HTTP trigger function processed a request.')

    # For now this can be a POST where we have <base url>/api/HttpTrigger?start=<any string>
    image_url = req.params.get('start')
    logging.info(type(image_url))

    # Use service principal secrets to create authentication vehicle and 
    # define workspace object
    try:    
        svc_pr = ServicePrincipalAuthentication(
            tenant_id=os.getenv('TENANT_ID', ''),
            service_principal_id=os.getenv('APP_ID', ''),
            service_principal_password=os.getenv('PRINCIPAL_PASSWORD', ''))

        ws = Workspace(subscription_id=os.getenv('AZURE_SUB', ''),
                    resource_group=os.getenv('RESOURCE_GROUP', ''),
                    workspace_name=os.getenv('WORKSPACE_NAME',''),
                    auth=svc_pr)
        print("Found workspace {} at location {} using Azure CLI \
            authentication".format(ws.name, ws.location))
    # Usually because authentication didn't work
    except ProjectSystemException as err:
        print('Authentication did not work.')
        return json.dumps('ProjectSystemException')
    # Need to create the workspace
    except Exception as err:
        ws = Workspace.create(name=os.getenv('WORKSPACE_NAME', ''),
                    subscription_id=os.getenv('AZURE_SUB', ''), 
                    resource_group=os.getenv('RESOURCE_GROUP', ''),
                    create_resource_group=True,
                    location='westus', # Or other supported Azure region   
                    auth=svc_pr)
        print("Created workspace {} at location {}".format(ws.name, ws.location))

       

    # choose a name for your cluster - under 16 characters
    cluster_name = "gpuforpytorch"

    try:
        compute_target = ComputeTarget(workspace=ws, name=cluster_name)
        print('Found existing compute target.')
    except ComputeTargetException:
        print('Creating a new compute target...')
        # AML Compute config - if max_nodes are set, it becomes persistent storage that scales
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6',
                                                            min_nodes=0,
                                                            max_nodes=2)
        # create the cluster
        compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
        compute_target.wait_for_completion(show_output=True)

    # use get_status() to get a detailed status for the current cluster. 
    # print(compute_target.get_status().serialize())

    # # Create a project directory and copy the training script to it
    project_folder = os.path.join(os.getcwd(), 'HttpTrigger', 'project')
    # os.makedirs(project_folder, exist_ok=True)
    # shutil.copy(os.path.join(os.getcwd(), 'HttpTrigger', 'pytorch_train.py'), project_folder)

    # Create an experiment
    experiment_name = 'fish-no-fish'
    experiment = Experiment(ws, name=experiment_name)

    # Use an AML Data Store for training data
    ds = Datastore.register_azure_blob_container(workspace=ws, 
        datastore_name='funcdefaultdatastore', 
        container_name=os.getenv('STORAGE_CONTAINER_NAME_TRAINDATA', ''),
        account_name=os.getenv('STORAGE_ACCOUNT_NAME', ''), 
        account_key=os.getenv('STORAGE_ACCOUNT_KEY', ''),
        create_if_not_exists=True)

    # Use an AML Data Store to save models back up to
    ds_models = Datastore.register_azure_blob_container(workspace=ws, 
        datastore_name='modelsdatastorage', 
        container_name=os.getenv('STORAGE_CONTAINER_NAME_MODELS', ''),
        account_name=os.getenv('STORAGE_ACCOUNT_NAME', ''), 
        account_key=os.getenv('STORAGE_ACCOUNT_KEY', ''),
        create_if_not_exists=True)

    # Set up for training ("trans" flag means - use transfer learning and 
    # this should download a model on compute)
    # Using /tmp to store model and info due to the fact that
    # creating new folders and files on the Azure Function host
    # will trigger the function to restart.
    script_params = {
        '--data_dir': ds.as_mount(),
        '--num_epochs': 30,
        '--learning_rate': 0.01,
        '--output_dir': '/tmp/outputs',
        '--trans': 'True'
    }

    # Instantiate PyTorch estimator with upload of final model to
    # a specified blob storage container (this can be anything)
    estimator = PyTorch(source_directory=project_folder, 
                        script_params=script_params,
                        compute_target=compute_target,
                        entry_script='pytorch_train.py',
                        use_gpu=True,
                        inputs=[ds_models.as_upload(path_on_compute='./outputs/model_finetuned.pth')])

    run = experiment.submit(estimator)
    print(run.get_details())
    
    # # The following would certainly be blocking, but that's ok for debugging
    # while run.get_status() not in ['Completed', 'Failed']: # For example purposes only, not exhaustive
    #    print('Run {} not in terminal state'.format(run.id))
    #    time.sleep(10)

    return json.dumps(run.get_status())
Example no. 21
def main(req: func.HttpRequest) -> (func.HttpResponse):
    logging.info('Python HTTP trigger function processed a request.')

    # For now this can be a POST where we have <base url>/api/HttpTrigger?start=<any string>
    image_url = req.params.get('start')
    logging.info(type(image_url))

    # Write a config.json (fill in template values with system vars)
    config_temp = {
        'subscription_id': os.getenv('AZURE_SUB', ''),
        'resource_group': os.getenv('RESOURCE_GROUP', ''),
        'workspace_name': os.getenv('WORKSPACE_NAME', '')
    }
    with open(os.path.join(os.getcwd(), 'HttpTrigger', 'config.json'),
              'w') as f:
        json.dump(config_temp, f)

    # Get the workspace from config.json
    try:
        ws = Workspace.from_config(
            os.path.join(os.getcwd(), 'HttpTrigger', 'config.json'))
    # Authentication didn't work
    except ProjectSystemException as err:
        return json.dumps('ProjectSystemException')
    # Need to create the workspace
    except Exception as err:
        ws = Workspace.create(
            name=os.getenv('WORKSPACE_NAME', ''),
            subscription_id=os.getenv('AZURE_SUB', ''),
            resource_group=os.getenv('RESOURCE_GROUP', ''),
            create_resource_group=True,
            location='eastus2'  # Or other supported Azure region   
        )

    # choose a name for your cluster
    cluster_name = "gpuclusterplease"

    try:
        compute_target = ComputeTarget(workspace=ws, name=cluster_name)
        print('Found existing compute target.')
    except ComputeTargetException:
        print('Creating a new compute target...')
        # AML Compute config - if max_nodes are set, it becomes persistent storage that scales
        compute_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_NC6', max_nodes=4)
        # create the cluster
        compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
        compute_target.wait_for_completion(show_output=True)

    # use get_status() to get a detailed status for the current cluster.
    print(compute_target.get_status().serialize())

    # Create a project directory and copy the training script to it
    project_folder = os.path.join(os.getcwd(), 'HttpTrigger', 'project')
    os.makedirs(project_folder, exist_ok=True)
    shutil.copy(os.path.join(os.getcwd(), 'HttpTrigger', 'pytorch_train.py'),
                project_folder)

    # Create an experiment
    experiment_name = 'fish-no-fish'
    experiment = Experiment(ws, name=experiment_name)

    # Use an AML Data Store for training data
    ds = Datastore.register_azure_blob_container(
        workspace=ws,
        datastore_name='funcdefaultdatastore',
        container_name=os.getenv('STORAGE_CONTAINER_NAME_TRAINDATA', ''),
        account_name=os.getenv('STORAGE_ACCOUNT_NAME', ''),
        account_key=os.getenv('STORAGE_ACCOUNT_KEY', ''),
        create_if_not_exists=True)

    # Use an AML Data Store to save models back up to
    ds_models = Datastore.register_azure_blob_container(
        workspace=ws,
        datastore_name='modelsdatastorage',
        container_name=os.getenv('STORAGE_CONTAINER_NAME_MODELS', ''),
        account_name=os.getenv('STORAGE_ACCOUNT_NAME', ''),
        account_key=os.getenv('STORAGE_ACCOUNT_KEY', ''),
        create_if_not_exists=True)

    # Set up for training ("trans" flag means - use transfer learning and
    # this should download a model on compute)
    script_params = {
        '--data_dir': ds.as_mount(),
        '--num_epochs': 30,
        '--learning_rate': 0.01,
        '--output_dir': './outputs',
        '--trans': 'True'
    }

    # Instantiate PyTorch estimator with upload of final model to
    # a specified blob storage container (this can be anything)
    estimator = PyTorch(
        source_directory=project_folder,
        script_params=script_params,
        compute_target=compute_target,
        entry_script='pytorch_train.py',
        use_gpu=True,
        inputs=[
            ds_models.as_upload(
                path_on_compute='./outputs/model_finetuned.pth')
        ])

    run = experiment.submit(estimator)
    run.wait_for_completion(show_output=True)

    return json.dumps('Job complete')
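For reference, the trigger described in the comment at the top of the function could be exercised like this (the function URL is a placeholder):

import requests

# POST to the deployed Azure Function; 'start' can be any string, per the comment above.
resp = requests.post('https://<function-app>.azurewebsites.net/api/HttpTrigger', params={'start': 'go'})
print(resp.status_code, resp.text)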
Example no. 22
experiment = Experiment(ws, name=experiment_name)

# Create a PyTorch estimator

# The Azure ML SDK's PyTorch estimator enables you to easily submit PyTorch training jobs for both
# single-node and distributed runs; see the PyTorch estimator documentation for more information.
# The following code will define a single-node PyTorch job.

from azureml.train.dnn import PyTorch

script_params = {
    '--num_epochs': 30,
    '--output_dir': './outputs'
}

estimator = PyTorch(source_directory=project_folder, 
                    script_params=script_params,
                    compute_target=compute_target,
                    entry_script='pytorch_train.py',
                    use_gpu=True)

# Now that we've seen how to do a simple PyTorch training run using the SDK, let's see if we can
# further improve the accuracy of our model. We can optimize our model's hyperparameters using
# Azure Machine Learning's hyperparameter tuning capabilities.

# Start a hyperparameter sweep

# First, we will define the hyperparameter space to sweep over. Since our training script uses a
# learning rate schedule to decay the learning rate every several epochs, let's tune the initial
# learning rate and the momentum parameters. In this example we will use random sampling to try
# different configuration sets of hyperparameters to maximize our primary metric, the best
# validation accuracy (best_val_acc).

# Then, we specify the early termination policy used to terminate poorly performing runs early.
# Here we use the BanditPolicy, which will terminate any run that doesn't fall within the slack
# factor of our primary evaluation metric. In this tutorial, we will apply this policy every epoch
# (since we report our best_val_acc metric every epoch and evaluation_interval=1). Notice we will
# delay the first policy evaluation until after the first 10 epochs (delay_evaluation=10). See the
# BanditPolicy documentation for more information on this and the other available policies.

from azureml.train.hyperdrive import RandomParameterSampling, HyperDriveRunConfig, BanditPolicy, PrimaryMetricGoal, uniform

param_sampling = RandomParameterSampling( {
        'learning_rate': uniform(0.0005, 0.005),
        'momentum': uniform(0.9, 0.99)
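The sampling dictionary is cut off above; based on the comments, the early-termination policy and HyperDrive configuration would continue roughly as sketched below (slack factor and run counts are illustrative, not the original values):

early_termination_policy = BanditPolicy(slack_factor=0.15,
                                        evaluation_interval=1,
                                        delay_evaluation=10)

hyperdrive_config = HyperDriveRunConfig(estimator=estimator,
                                        hyperparameter_sampling=param_sampling,
                                        policy=early_termination_policy,
                                        primary_metric_name='best_val_acc',
                                        primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                        max_total_runs=8,
                                        max_concurrent_runs=4)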
Example no. 23
if args.do_train:
    logging.warning(f'[INFO] Running train for {args.project_name}')
    for task in tasks:
        exp = Experiment(workspace=ws,
                         name=f'{args.project_name}_train_{task}')
        config = tasks.get(task)
        if config.get('type') == 'classification':
            script_params = {
                '--task': int(task),
                '--use_cuda': '',
                '--register_model': ''
            }
            est = PyTorch(source_directory=script_folder,
                          compute_target=compute_target,
                          script_params=script_params,
                          entry_script='src/classification.py',
                          pip_packages=pip_packages,
                          use_gpu=True)

            ### Hyperparameter settings
            if language == 'en':
                model_type = choice('roberta', 'bert', 'albert')
            elif language == 'de':
                model_type = choice('distilbert', 'bert', 'roberta')
            elif language == 'it' or language == 'es':
                model_type = choice('bert')
            elif language == 'fr':
                model_type = choice('camembert', 'bert')
            param_sampling = RandomParameterSampling({
                '--n_epochs':
                choice(3, 5, 10),
Example no. 24
    enable_optimized_mode = experiment_settings["framework"]["pytorch"][
        "_enable_optimized_mode"]

    estimator = PyTorch(
        source_directory=experiment_settings["source_directory"],
        compute_target=compute_target,
        entry_script=experiment_settings["entry_script"],
        script_params=experiment_settings["script_parameters"],
        node_count=experiment_settings["distributed_training"]["node_count"],
        distributed_training=distrib_training_backend,
        use_docker=experiment_settings["docker"]["use_docker"],
        custom_docker_image=experiment_settings["docker"]["custom_image"],
        image_registry_details=container_registry,
        user_managed=experiment_settings["user_managed"],
        conda_packages=experiment_settings["dependencies"]["conda_packages"],
        pip_packages=experiment_settings["dependencies"]["pip_packages"],
        conda_dependencies_file=experiment_settings["dependencies"]["conda_dependencies_file"],
        pip_requirements_file=experiment_settings["dependencies"]["pip_requirements_file"],
        environment_variables=experiment_settings["environment_variables"],
        inputs=experiment_settings["data_references"],
        source_directory_data_store=experiment_settings["source_directory_datastore"],
        shm_size=experiment_settings["docker"]["shm_size"],
        max_run_duration_seconds=experiment_settings["max_run_duration_seconds"],
        framework_version=framework_version,
        _enable_optimized_mode=enable_optimized_mode)

elif experiment_settings["framework"]["name"] == "tensorflow":
    framework_version = experiment_settings["framework"]["tensorflow"][
Example no. 25
preprocessing_step = EstimatorStep(
    name="Preprocessing_Train",
    estimator=preprocessing_est,
    estimator_entry_script_arguments=[
        "--data_dir", input_data, "--output_data_dir", output
    ],
    inputs=[input_data],
    outputs=[output],
    compute_target=cpu_cluster,
    allow_reuse=True,
)

pytorch_est = PyTorch(
    source_directory='020-ann',
    compute_target=cpu_cluster,
    entry_script='pytorch_train.py',
    use_gpu=False,
    framework_version='1.1',
    conda_packages=['pandas'],
)

pytorch_step = EstimatorStep(
    name="PyTorch_Train",
    estimator=pytorch_est,
    estimator_entry_script_arguments=["--data_dir", output],
    inputs=[output],
    compute_target=cpu_cluster,
    allow_reuse=True,
)

pipeline = Pipeline(workspace=ws, steps=[preprocessing_step, pytorch_step])
run = Experiment(ws, args.experiment).submit(pipeline)
Example no. 26
def create_estimator_from_configs(
        azure_config: AzureConfig, source_config: SourceConfig,
        estimator_inputs: List[DatasetConsumptionConfig]) -> PyTorch:
    """
    Create and return a PyTorch estimator from the provided configuration information.
    :param azure_config: Azure configuration, used to store various values for the job to be submitted
    :param source_config: source configuration, for other needed values
    :param estimator_inputs: value for the "inputs" field of the estimator.
    :return: the configured PyTorch estimator
    """
    # AzureML seems to sometimes expect the entry script path in Linux format, hence convert to posix path
    entry_script_relative_path = Path(source_config.entry_script).relative_to(
        source_config.root_folder).as_posix()
    logging.info(
        f"Entry script {entry_script_relative_path} ({source_config.entry_script} relative to "
        f"source directory {source_config.root_folder})")
    environment_variables = {
        "AZUREML_OUTPUT_UPLOAD_TIMEOUT_SEC":
        str(source_config.upload_timeout_seconds),
        "MKL_SERVICE_FORCE_INTEL":
        "1",
        **(source_config.environment_variables or {})
    }
    # Merge the project-specific dependencies with the packages that InnerEye itself needs. This should not be
    # necessary if the innereye package is installed. It is necessary when working with an outer project and
    # InnerEye as a git submodule and submitting jobs from the local machine.
    # In case of version conflicts, the package version in the outer project is given priority.
    conda_dependencies = merge_conda_dependencies(
        source_config.conda_dependencies_files)  # type: ignore
    if azure_config.pip_extra_index_url:
        # When an extra-index-url is supplied, swap the order in which packages are searched for.
        # This is necessary if we need to consume packages from extra-index that clash with names of packages on
        # pypi
        conda_dependencies.set_pip_option(
            f"--index-url {azure_config.pip_extra_index_url}")
        conda_dependencies.set_pip_option(
            "--extra-index-url https://pypi.org/simple")
    # create Estimator environment
    framework_version = pytorch_version_from_conda_dependencies(
        conda_dependencies)
    logging.info(f"PyTorch framework version: {framework_version}")
    max_run_duration = None
    if azure_config.max_run_duration:
        max_run_duration = run_duration_string_to_seconds(
            azure_config.max_run_duration)
    workspace = azure_config.get_workspace()
    estimator = PyTorch(
        source_directory=source_config.root_folder,
        entry_script=entry_script_relative_path,
        script_params=source_config.script_params,
        compute_target=azure_config.cluster,
        # Use blob storage for storing the source, rather than the FileShares section of the storage account.
        source_directory_data_store=workspace.datastores.get(
            WORKSPACE_DEFAULT_BLOB_STORE_NAME),
        inputs=estimator_inputs,
        environment_variables=environment_variables,
        shm_size=azure_config.docker_shm_size,
        use_docker=True,
        use_gpu=True,
        framework_version=framework_version,
        max_run_duration_seconds=max_run_duration)
    estimator.run_config.environment.python.conda_dependencies = conda_dependencies
    # We'd like to log the estimator config, but conversion to string fails when the Estimator has some inputs.
    # logging.info(azure_util.estimator_to_string(estimator))
    if azure_config.hyperdrive:
        estimator = source_config.hyperdrive_config_func(
            estimator)  # type: ignore
    return estimator
Example no. 27
    except ComputeTargetException:
        print('Creating a new compute target...')
        compute_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_NC6', max_nodes=1)

        compute_target_gpu = ComputeTarget.create(ws, cluster_name,
                                                  compute_config)
        compute_target_gpu.wait_for_completion(show_output=True,
                                               min_node_count=None,
                                               timeout_in_minutes=0)

    script_params = {}
    estimator = PyTorch(source_directory='./',
                        script_params=script_params,
                        compute_target=compute_target_gpu,
                        entry_script='train.py',
                        use_gpu=True,
                        pip_packages=[],
                        framework_version='1.2')

    est_step = EstimatorStep(name="Train_Step",
                             estimator=estimator,
                             estimator_entry_script_arguments=auth_params,
                             runconfig_pipeline_params=None,
                             inputs=[],
                             outputs=[],
                             compute_target=compute_target_gpu)
    est_step.run_after(process_step)

    #     step 3
Example no. 28
                            inputs=[input_dir],
                            outputs=[processed_dir],
                            compute_target=cluster_name,
                            runconfig=run_config,
                            source_directory=PREPROCESS_DIR
                        )

#%% [markdown]
# ## Pipeline second step: training
#
# For the second step, we start by defining the PyTorch estimator that will be used to train the stochastic variational deep kernel learning model using GPyTorch.

#%%
estimator = PyTorch(source_directory=TRAIN_DIR,
                    conda_packages=['pandas', 'numpy', 'scikit-learn'],
                    pip_packages=['gpytorch'],
                    compute_target=cluster,
                    entry_script='svdkl_entry.py',
                    use_gpu=True)

#%% [markdown]
# Here, we configure HyperDrive by defining the hyperparameter space and choosing area under the curve (AUC) as the metric to optimize for.

#%%
ps = RandomParameterSampling({
    '--batch-size': choice(4096, 8192),
    '--epochs': choice(500),
    '--neural-net-lr': loguniform(-4, -2),
    '--likelihood-lr': loguniform(-4, -2),
    '--grid-size': choice(32, 64),
    '--grid-bounds': choice(-1, 0),
    '--latent-dim': choice(2),
Example no. 29
    '--momentum': 0.9,
    '--num-dataload-workers': 6,
    # Don't unfreeze the model, since performance degrades based on the number of images in the test set
    '--epochs-before-unfreeze-all': '0',
}

conda_packages = ['pytorch', 'scikit-learn']
pip_packages = ['pydocumentdb', 'torchvision']

#%%

estimator = PyTorch(source_directory='./aml-image-models',
                    compute_target=ct,
                    entry_script='train_network.py',
                    script_params=script_params,
                    node_count=1,
                    process_count_per_node=1,
                    conda_packages=conda_packages,
                    pip_packages=pip_packages,
                    use_gpu=True)

#%%

# Create Experiment object - this will be used to submit the Hyperdrive run and store all the given parameters
experiment_hd = Experiment(workspace=ws, name='hyperdrive')

#%% [markdown]

###### Create Random Parameter Sampler

#%%