from azureml.core import Dataset, Workspace
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.exceptions import WorkspaceException


def setup(num):
    # Build a per-run workspace name, e.g. "<prefix>-<location>-01".
    workspace_name = '%s-%s-%02d' % (workspace_prefix, location, num)

    # Load the workspace if it exists; otherwise create it.
    try:
        ws = Workspace.get(
            name=workspace_name,
            subscription_id=subscription_id,
            resource_group=resource_group)
        print('Found existing workspace %s' % workspace_name)
    except WorkspaceException:
        print('Creating new workspace %s...' % workspace_name)
        ws = Workspace.create(
            name=workspace_name,
            subscription_id=subscription_id,
            resource_group=resource_group,
            location=location)

    # Load the compute target if it exists; otherwise provision it.
    try:
        compute_target = AmlCompute(ws, compute_name)
        print('Found existing compute %s' % compute_name)
        compute_target.update(min_nodes=min_nodes, max_nodes=max_nodes)
    except ComputeTargetException:
        print('Creating new compute target %s...' % compute_name)
        compute_config = AmlCompute.provisioning_configuration(
            vm_size=vm_size, min_nodes=min_nodes, max_nodes=max_nodes)
        compute_target = ComputeTarget.create(ws, compute_name, compute_config)
        compute_target.wait_for_completion(show_output=True,
                                           timeout_in_minutes=20)

    # Upload the contents of the local "testdata" directory to the root of the
    # default datastore; the dataset below expects a "testdata.txt" file there.
    ds = ws.get_default_datastore()
    ds.upload("testdata")

    dataset_name = 'sample_dataset'
    if dataset_name not in ws.datasets:
        data = Dataset.File.from_files(path=[(ds, 'testdata.txt')])
        data.register(
            workspace=ws,
            name=dataset_name,
            description='Sample data for load test')
        print('Dataset successfully registered')
    else:
        print('Dataset already exists')
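# Usage sketch (not from the original source): setup() reads module-level
# configuration that is defined elsewhere. The values below are hypothetical
# placeholders showing what must exist before calling it.
workspace_prefix = 'loadtest'          # assumed workspace name prefix
location = 'westus2'                   # assumed Azure region
subscription_id = '<subscription-id>'  # placeholder
resource_group = '<resource-group>'    # placeholder
compute_name = 'cpu-cluster'           # assumed compute target name
vm_size = 'STANDARD_D2_V2'             # assumed VM SKU
min_nodes, max_nodes = 0, 4            # assumed autoscale bounds

if __name__ == '__main__':
    setup(1)  # creates or loads 'loadtest-westus2-01' and registers the dataset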
import logging
import os
from datetime import datetime
from tempfile import NamedTemporaryFile
from time import sleep

from azureml.core.compute import AmlCompute
from azureml.core.compute_target import ComputeTargetException
from gevent import joinall
from pssh.clients import ParallelSSHClient, SSHClient
from pssh.config import HostConfig
from pssh.utils import enable_host_logger
from termcolor import colored, cprint


class ClusterConnector:
    def __init__(
        self,
        workspace,
        cluster_name,
        ssh_key,
        vm_type,
        admin_username="******",
    ):
        """Thin wrapper class around azureml.core.compute.AmlCompute.

        Provides parallel SSH objects and helpers for running commands and
        copying files on the master node or on all nodes.

        Usage:
        >>> cc = ClusterConnector(workspace, "MyCluster", sshkey, "Standard_ND40rs_v2")
        >>> cc.initialise(min_nodes=0, max_nodes=4, idle_timeout_secs=30)
        >>> cluster = cc.cluster
        >>> [print(node['name']) for node in cc.cluster.list_nodes()]
        """
        self.cluster_name = cluster_name
        self.workspace = workspace
        self.ssh_key = ssh_key
        self.vm_type = vm_type
        self.admin_username = admin_username

        # Route pssh host output to a per-workspace logfile instead of stdout.
        enable_host_logger()
        hlog = logging.getLogger("pssh.host_logger")
        tstr = datetime.now().isoformat(timespec="minutes")
        for handler in list(hlog.handlers):
            if isinstance(handler, logging.StreamHandler):
                hlog.removeHandler(handler)
        os.makedirs("clusterlogs", exist_ok=True)
        self.logfile = "clusterlogs/{}_{}.log".format(self.workspace.name, tstr)
        hlog.addHandler(logging.FileHandler(self.logfile))

        self.cluster = None
        self._master_scp = None
        self._master_ssh = None
        self._all_ssh = None

    def initialise(self, min_nodes=0, max_nodes=0, idle_timeout_secs=1800):
        """Initialise the underlying AmlCompute cluster instance."""
        self._create_or_update_cluster(min_nodes, max_nodes, idle_timeout_secs)

    def _check_logs_emessage(self, host, port):
        return "Remote command failed on {}:{}. For details see {}".format(
            host, port, self.logfile)

    def terminate(self):
        print('Attempting to terminate cluster "{}"'.format(
            colored(self.cluster_name, "green")))
        try:
            self.cluster.update(min_nodes=0,
                                max_nodes=0,
                                idle_seconds_before_scaledown=10)
            self.cluster.wait_for_completion()
        except ComputeTargetException as err:
            raise RuntimeError(
                "Failed to terminate cluster nodes ({})".format(err))
        if len(self.cluster.list_nodes()):
            raise RuntimeError(
                "Failed to terminate cluster nodes (nodes still running)")

    @property
    def cluster_nodes(self):
        self.cluster.refresh_state()
        return sorted(self.cluster.list_nodes(), key=lambda n: n["port"])

    def _create_or_update_cluster(self, min_nodes, max_nodes,
                                  idle_timeout_secs):
        # Update the cluster if it exists; otherwise provision a new one.
        try:
            self.cluster = AmlCompute(workspace=self.workspace,
                                      name=self.cluster_name)
            print('Updating existing cluster "{}"'.format(
                colored(self.cluster_name, "green")))
            self.cluster.update(
                min_nodes=min_nodes,
                max_nodes=max_nodes,
                idle_seconds_before_scaledown=idle_timeout_secs,
            )
        except ComputeTargetException:
            print('Creating new cluster "{}"'.format(
                colored(self.cluster_name, "green")))
            cluster_config = AmlCompute.provisioning_configuration(
                vm_size=self.vm_type,
                min_nodes=min_nodes,
                max_nodes=max_nodes,
                idle_seconds_before_scaledown=idle_timeout_secs,
                admin_username=self.admin_username,
                admin_user_ssh_key=self.ssh_key,
                remote_login_port_public_access="Enabled",
            )
            self.cluster = AmlCompute.create(self.workspace,
                                             self.cluster_name,
                                             cluster_config)
        self.cluster.wait_for_completion()
        if len(self.cluster_nodes) < min_nodes:
            # Scaling can lag behind the update call; allow one retry.
            sleep(30)
            if len(self.cluster_nodes) < min_nodes:
                raise RuntimeError("Failed to provision sufficient nodes")

    def _copy_nodefile_to_nodes(self):
        if len(self.cluster_nodes) == 1:
            cprint("Single node cluster -- skipping IB config", "yellow")
            return
        print("Collecting cluster IB info")
        # Extract each node's InfiniBand (ib0) IPv4 address.
        outputs = self._all_ssh.run_command(
            r'ifconfig ib0 | grep -oe "inet[^6][adr: ]*[0-9.]*" | cut -d" " -f2',
            shell="bash -c",
        )
        self._all_ssh.join(outputs)
        ibaddrs = []
        for output in outputs:
            host = output.host
            port = output.client.port
            if output.exit_code != 0:
                print(list(output.stdout))
                print(list(output.stderr))
                raise RuntimeError("Failed to get IB ip for {}:{}".format(
                    host, port))
            try:
                ibaddr = list(output.stdout)[0].split()[0]
            except IndexError:
                raise RuntimeError("Failed to get IB ip for {}:{} - "
                                   "No ib interface found!".format(host, port))
            print("Mapping {}:{} -> {}".format(host, port, ibaddr))
            if port == self._master_scp.port:
                cprint("IB Master: {}".format(ibaddr), "green")
                # Keep the master node's address first in the nodefile.
                ibaddrs = [ibaddr] + ibaddrs
            else:
                ibaddrs.append(ibaddr)
        with NamedTemporaryFile(delete=False, mode="wt") as nfh:
            self.nodefile = nfh.name
            for addr in ibaddrs:
                nfh.write("{}\n".format(addr))
        self.ibaddrs = ibaddrs
        self.copy_to_all_nodes(self.nodefile, "./nodefile")

    def _create_cluster_ssh_conns(self):
        hostips = [n["publicIpAddress"] for n in self.cluster_nodes]
        hostconfigs = [HostConfig(port=n["port"]) for n in self.cluster_nodes]
        self._all_ssh = ParallelSSHClient(hostips,
                                          host_config=hostconfigs,
                                          user=self.admin_username)
        self._master_ssh = ParallelSSHClient(hostips[:1],
                                             host_config=hostconfigs[:1],
                                             user=self.admin_username)
        self._master_scp = SSHClient(hostips[0],
                                     port=hostconfigs[0].port,
                                     user=self.admin_username)

    def copy_to_all_nodes(self, source, dest):
        copy_jobs = self._all_ssh.copy_file(source, dest)
        joinall(copy_jobs, raise_error=True)

    def copy_to_master_node(self, source, dest):
        self._master_scp.copy_file(source, dest)

    def copy_from_master_node(self, source, dest):
        self._master_scp.copy_remote_file(source, dest)

    def run_on_all_nodes(self, command):
        outputs = self._all_ssh.run_command(command, shell="bash -c")
        self._all_ssh.join(outputs, consume_output=True)
        for output in outputs:
            if int(output.exit_code) != 0:
                host = output.host
                port = output.client.port
                raise RuntimeError(self._check_logs_emessage(host, port))

    def run_on_master_node(self, command):
        outputs = self._master_ssh.run_command(command, shell="bash -c")
        self._master_ssh.join(outputs)
        for output in outputs:
            if int(output.exit_code) != 0:
                host = output.host
                port = output.client.port
                raise RuntimeError(self._check_logs_emessage(host, port))

    def attempt_termination(self):
        try:
            self.terminate()
        except RuntimeError as err:
            print(colored("ERROR: {}\n\n", "red", attrs=["bold"]).format(err))
            self.warn_unterminated()

    def warn_unterminated(self):
        print(
            colored("WARNING: {}", "red", attrs=["bold"]).format(
                colored(
                    "Cluster {} is still running - terminate manually to avoid "
                    "additional compute costs".format(
                        colored(self.cluster_name, "green")),
                    "red",
                )))
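# Usage sketch (not part of the original class): the intended lifecycle is
# provision, open SSH connections, run commands, then tear down. `workspace`
# and `sshkey` are assumed to exist already; names here are illustrative.
def run_hostname_everywhere(workspace, sshkey):
    cc = ClusterConnector(workspace, "MyCluster", sshkey, "Standard_ND40rs_v2")
    cc.initialise(min_nodes=2, max_nodes=2, idle_timeout_secs=300)
    try:
        # The run_*/copy_* helpers require the SSH connections built here.
        cc._create_cluster_ssh_conns()
        cc.run_on_all_nodes("hostname")  # per-host output lands in cc.logfile
    finally:
        cc.attempt_termination()  # prints a warning instead of raising on failure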
# Fragment: this code continues inside a `try:` block (not shown) that has
# just loaded an existing AmlCompute cluster into `cluster`; `aml_settings`
# holds the desired cluster configuration.

    # Check settings and redeploy if required settings have changed
    print("Found existing cluster")
    if (cluster.vm_size.lower() != aml_settings["vm_size"].lower()
            or cluster.vm_priority.lower() != aml_settings["vm_priority"].lower()):
        cluster.delete()
        cluster.wait_for_completion(show_output=True)
        raise ComputeTargetException(
            "Cluster is of incorrect size or has incorrect priority. "
            "Deleting cluster and provisioning a new one.")

    # Update AMLCompute
    # if cluster.provisioning_configuration.min_nodes != aml_settings["min_nodes"] \
    #         or cluster.provisioning_configuration.max_nodes != aml_settings["max_nodes"] \
    #         or cluster.provisioning_configuration.idle_seconds_before_scaledown \
    #         != aml_settings["idle_seconds_before_scaledown"]:
    print("Updating settings of Cluster")
    cluster.update(
        min_nodes=aml_settings["min_nodes"],
        max_nodes=aml_settings["max_nodes"],
        idle_seconds_before_scaledown=aml_settings["idle_seconds_before_scaledown"])

    # Wait until the operation has completed
    cluster.wait_for_completion(show_output=True)
    print("Successfully updated Cluster definition")
except ComputeTargetException:
    print("Loading failed")
    print("Creating new AML Compute resource")
    compute_config = AmlCompute.provisioning_configuration(
        vm_size=aml_settings["vm_size"],
        vm_priority=aml_settings["vm_priority"],
        min_nodes=aml_settings["min_nodes"],
        max_nodes=aml_settings["max_nodes"],
        idle_seconds_before_scaledown=aml_settings["idle_seconds_before_scaledown"])
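# Sketch of the `aml_settings` dict the fragment above reads from, inferred
# from the keys it accesses; the values shown are illustrative assumptions.
aml_settings = {
    "vm_size": "STANDARD_D2_V2",   # compared case-insensitively with cluster.vm_size
    "vm_priority": "dedicated",    # "dedicated" or "lowpriority"
    "min_nodes": 0,
    "max_nodes": 4,
    "idle_seconds_before_scaledown": 300,
}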
import json
import os

from adal.adal_error import AdalError
from azureml.core import Workspace
from azureml.core.authentication import ServicePrincipalAuthentication
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.exceptions import (AuthenticationException,
                                ComputeTargetException,
                                ProjectSystemException)
from msrest.exceptions import AuthenticationError


def main():
    # Loading input values
    print("::debug::Loading input values")
    parameters_file = os.environ.get("INPUT_PARAMETERSFILE",
                                     default="workspace.json")
    azure_credentials = os.environ.get("INPUT_AZURECREDENTIALS", default="{}")
    azure_credentials = json.loads(azure_credentials)

    # Loading parameters file
    print("::debug::Loading parameters file")
    parameters_file_path = os.path.join(".aml", parameters_file)
    try:
        with open(parameters_file_path) as f:
            parameters = json.load(f)
    except FileNotFoundError:
        print(
            f"::error::Could not find parameter file in {parameters_file_path}. "
            f"Please provide a parameter file in your repository "
            f"(e.g. .aml/workspace.json)."
        )
        return

    # Loading Workspace
    sp_auth = ServicePrincipalAuthentication(
        tenant_id=azure_credentials.get("tenantId", ""),
        service_principal_id=azure_credentials.get("clientId", ""),
        service_principal_password=azure_credentials.get("clientSecret", ""))
    try:
        print("::debug::Loading existing Workspace")
        ws = Workspace.get(
            name=parameters.get("name", None),
            subscription_id=azure_credentials.get("subscriptionId", ""),
            resource_group=parameters.get("resourceGroup", None),
            auth=sp_auth)
        print("::debug::Successfully loaded existing Workspace")
    except AuthenticationException as exception:
        print(
            f"::error::Could not retrieve user token. Please paste the output of "
            f"`az ad sp create-for-rbac --name <your-sp-name> --role contributor "
            f"--scopes /subscriptions/<your-subscriptionId>/resourceGroups/<your-rg> "
            f"--sdk-auth` as the value of secret variable AZURE_CREDENTIALS: {exception}"
        )
        return
    except AuthenticationError as exception:
        print(f"::error::Microsoft REST Authentication Error: {exception}")
        return
    except AdalError as exception:
        print(
            f"::error::Active Directory Authentication Library Error: {exception}")
        return
    except ProjectSystemException as exception:
        print(f"::error::Workspace authorization failed: {exception}")
        return

    # TODO: Create compute if not existing.
    try:
        # Loading AMLCompute
        print("::debug::Loading existing AML Compute")
        cluster = AmlCompute(workspace=ws, name=parameters["name"])

        # Check settings and redeploy if required settings have changed
        print("::debug::Found existing cluster")
        if (cluster.vm_size.lower() != parameters["vm_size"].lower()
                or cluster.vm_priority.lower() != parameters["vm_priority"].lower()):
            cluster.delete()
            cluster.wait_for_completion(show_output=True)
            raise ComputeTargetException(
                "Cluster is of incorrect size or has incorrect priority. "
                "Deleting cluster and provisioning a new one.")

        # Update AMLCompute
        # if cluster.provisioning_configuration.min_nodes != aml_settings["min_nodes"] \
        #         or cluster.provisioning_configuration.max_nodes != aml_settings["max_nodes"] \
        #         or cluster.provisioning_configuration.idle_seconds_before_scaledown \
        #         != aml_settings["idle_seconds_before_scaledown"]:
        print("::debug::Updating settings of Cluster")
        cluster.update(
            min_nodes=parameters["min_nodes"],
            max_nodes=parameters["max_nodes"],
            idle_seconds_before_scaledown=parameters["idle_seconds_before_scaledown"])

        # Wait until the operation has completed
        cluster.wait_for_completion(show_output=True)
        print("::debug::Successfully updated Cluster definition")
    except ComputeTargetException:
        print("::debug::Loading failed")
        print("::debug::Creating new AML Compute resource")
        compute_config = AmlCompute.provisioning_configuration(
            vm_size=parameters["vm_size"],
            vm_priority=parameters["vm_priority"],
            min_nodes=parameters["min_nodes"],
            max_nodes=parameters["max_nodes"],
            idle_seconds_before_scaledown=parameters["idle_seconds_before_scaledown"],
            tags=parameters["tags"],
            description=parameters["description"])

        # Deploy to VNET if provided
        if (parameters["vnet_resource_group_name"] and parameters["vnet_name"]
                and parameters["subnet_name"]):
            compute_config.vnet_resourcegroup_name = parameters["vnet_resource_group_name"]
            compute_config.vnet_name = parameters["vnet_name"]
            compute_config.subnet_name = parameters["subnet_name"]

        # Set Credentials if provided
        if parameters["admin_username"] and parameters["admin_user_password"]:
            compute_config.admin_username = parameters["admin_username"]
            compute_config.admin_user_password = parameters["admin_user_password"]
        elif parameters["admin_username"] and parameters["admin_user_ssh_key"]:
            compute_config.admin_username = parameters["admin_username"]
            compute_config.admin_user_ssh_key = parameters["admin_user_ssh_key"]

        # Create Compute Target
        cluster = ComputeTarget.create(
            workspace=ws,
            name=parameters["name"],
            provisioning_configuration=compute_config)

        # Wait until the cluster is attached
        cluster.wait_for_completion(show_output=True)

    # Checking status of AMLCompute Cluster
    print("::debug::Checking status of AMLCompute Cluster")
    if cluster.provisioning_state == "Failed":
        cluster.delete()
        raise Exception(
            "::debug::Deployment of AMLCompute Cluster failed with the following "
            "status: {} and logs: \n{}".format(
                cluster.provisioning_state, cluster.provisioning_errors))

    print(parameters)
    print("::debug::Successfully finished Azure Machine Learning Compute Action")
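# Entry-point sketch (an assumption; the original action presumably wires this
# up elsewhere). The INPUT_* environment variables mirror what main() reads.
if __name__ == "__main__":
    # os.environ["INPUT_PARAMETERSFILE"] = "compute.json"       # optional override
    # os.environ["INPUT_AZURECREDENTIALS"] = '{"tenantId": "..."}'  # normally a repo secret
    main()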