def create(deployment_path: str, **kwargs):
    """Create a MARO grass cluster from a YAML deployment template.

    The template is loaded with ``yaml.safe_load`` and dispatched on its
    "mode" key to the matching grass executor.

    Args:
        deployment_path (str): path of the deployment template file.
        **kwargs: extra arguments; accepted but not used here.

    Returns:
        None.

    Raises:
        BadRequestError: if the mode is not "grass/azure", "grass/on-premises" or "grass/local".
        InvalidDeploymentTemplateError: if a KeyError is raised while reading the template
            or while the executor creates the cluster.
        FileOperationError: if a FileNotFoundError is raised inside the guarded section.
    """
    # Late import.
    import yaml

    from maro.cli.grass.executors.grass_azure_executor import GrassAzureExecutor
    from maro.cli.grass.executors.grass_local_executor import GrassLocalExecutor
    from maro.cli.grass.executors.grass_on_premises_executor import GrassOnPremisesExecutor
    from maro.utils.exception.cli_exception import BadRequestError, FileOperationError, InvalidDeploymentTemplateError

    try:
        with open(deployment_path, "r") as fr:
            create_deployment = yaml.safe_load(fr)

        mode = create_deployment["mode"]
        if mode == "grass/azure":
            GrassAzureExecutor.create(create_deployment=create_deployment)
        elif mode == "grass/on-premises":
            GrassOnPremisesExecutor.create(create_deployment=create_deployment)
        elif mode == "grass/local":
            executor = GrassLocalExecutor(
                cluster_name=create_deployment["name"],
                cluster_details=create_deployment,
            )
            executor.create()
        else:
            raise BadRequestError(f"Unsupported operation in mode '{mode}'.")
    except KeyError as e:
        # Chain explicitly so the traceback shows the KeyError as the direct cause.
        raise InvalidDeploymentTemplateError(f"Missing key '{e.args[0]}'.") from e
    except FileNotFoundError as e:
        raise FileOperationError("Invalid template file path.") from e
def stop_node(self, replicas: int, node_size: str):
    """Stop idle MARO Node VMs of one size, in parallel.

    A node is stoppable when its "node_size" equals ``node_size``, its status is
    ``NodeStatus.RUNNING`` and ``self._count_running_containers`` returns 0 for it.

    Args:
        replicas (int): number of MARO Node in specific node_size to stop.
        node_size (str): size of the MARO Node VM,
            see https://docs.microsoft.com/en-us/azure/virtual-machines/sizes for reference.

    Returns:
        None.

    Raises:
        BadRequestError: if fewer than ``replicas`` nodes are stoppable.
    """

    def _is_stoppable(details) -> bool:
        # Checks run in the same order as the conditions they replace: size, status, containers.
        if details["node_size"] != node_size:
            return False
        if details["state"]["status"] != NodeStatus.RUNNING:
            return False
        return self._count_running_containers(details) == 0

    # Collect the candidates from the master's node list.
    candidates = [details for details in self.master_api_client.list_nodes() if _is_stoppable(details)]

    # Refuse when the request exceeds what can be stopped.
    stoppable_count = len(candidates)
    if replicas > stoppable_count:
        raise BadRequestError(
            f"No more '{node_size}' nodes can be stopped, only {stoppable_count} are stoppable"
        )

    # starmap expects one argument list per call.
    starmap_args = [[details] for details in candidates[:replicas]]
    with ThreadPool(GlobalParams.PARALLELS) as pool:
        pool.starmap(self._stop_node, starmap_args)
def start_schedule(self, deployment_path: str):
    """Start a schedule described by a YAML deployment file.

    The schedule details are stored in Redis under "<cluster_name>:job_details",
    then one pending job per entry of "job_names" is pushed.

    Args:
        deployment_path (str): path of the schedule deployment file.

    Returns:
        None.

    Raises:
        BadRequestError: if ``resource_op`` reports the requested resource cannot be allocated.
    """
    # Read the raw deployment, remember its name, then let the executor complete it.
    with open(deployment_path, "r") as deployment_file:
        raw_deployment = yaml.safe_load(deployment_file)
    schedule_name = raw_deployment["name"]
    schedule_details = self._completed_local_job_deployment(raw_deployment)

    # Make sure the master's resource can cover the total request.
    enough_resource, _ = resource_op(
        self.cluster_details["master"]["resource"],
        schedule_details["total_request_resource"],
        ResourceOperation.ALLOCATION,
    )
    if not enough_resource:
        raise BadRequestError(f"No enough resource to start schedule {schedule_name} in {self.cluster_name}.")

    # Persist the schedule details in Redis.
    redis_key = f"{self.cluster_name}:job_details"
    self._redis_connection.hset(redis_key, schedule_name, json.dumps(schedule_details))

    # Turn the schedule details into per-job details: same content, no "job_names",
    # and "name" replaced by each job's name in turn.
    job_names = schedule_details["job_names"]
    job_template = copy.deepcopy(schedule_details)
    del job_template["job_names"]
    for job_name in job_names:
        job_template["name"] = job_name
        self._push_pending_job(job_template)
def start_node(self, replicas: int, node_size: str):
    """Start stopped MARO Node VMs of one size, in parallel.

    A node is startable when its "node_size" equals ``node_size`` and its status is
    ``NodeStatus.STOPPED``.

    Args:
        replicas (int): number of MARO Node in specific node_size to start.
        node_size (str): size of the MARO Node VM,
            see https://docs.microsoft.com/en-us/azure/virtual-machines/sizes for reference.

    Returns:
        None.

    Raises:
        BadRequestError: if fewer than ``replicas`` nodes are startable.
    """
    # Names of the stopped nodes of the requested size.
    stopped_node_names = [
        details["name"]
        for details in self.master_api_client.list_nodes()
        if details["node_size"] == node_size and details["state"]["status"] == NodeStatus.STOPPED
    ]

    # Refuse when the request exceeds what can be started.
    startable_count = len(stopped_node_names)
    if replicas > startable_count:
        raise BadRequestError(
            f"No enough '{node_size}' nodes can be started, only {startable_count} is able to start"
        )

    # starmap expects one argument list per call.
    starmap_args = [[node_name] for node_name in stopped_node_names[:replicas]]
    with ThreadPool(GlobalParams.PARALLELS) as pool:
        pool.starmap(self._start_node, starmap_args)
def scale_node(self, replicas: int, node_size: str):
    """Scale up/down MARO Node using predefined Node Image.

    Compares the current number of nodes of ``node_size`` with ``replicas`` and
    creates or deletes the difference.

    Args:
        replicas (int): desired number of MARO Node in specific node_size.
        node_size (str): size of the MARO Node VM,
            see https://docs.microsoft.com/en-us/azure/virtual-machines/sizes for reference.

    Returns:
        None.

    Raises:
        BadRequestError: if ``node_size`` is not a key of ``self._get_node_size_to_spec()``.
    """
    # Count the existing nodes of the requested size.
    current_count = sum(
        1 for details in self.master_api_client.list_nodes() if details["node_size"] == node_size
    )

    # Validate the requested size against the known specs.
    if node_size not in self._get_node_size_to_spec():
        raise BadRequestError(f"Invalid node_size '{node_size}'")

    # Scale in the needed direction.
    if current_count > replicas:
        self._delete_nodes(num=current_count - replicas, node_size=node_size)
    elif current_count < replicas:
        self._create_nodes(num=replicas - current_count, node_size=node_size)
    else:
        logger.warning_yellow("Replica is match, no create or delete")
def scale_node(self, replicas: int, node_size: str) -> None:
    """Scale up/down MARO Node.

    Builds a node pool when none exists for ``node_size``, scales the existing
    pool when its "count" differs from ``replicas``, and only warns otherwise.

    Args:
        replicas (int): desired number of MARO Node in specific node_size.
        node_size (str): size of the MARO Node VM,
            see https://docs.microsoft.com/en-us/azure/virtual-machines/sizes for reference.

    Returns:
        None.

    Raises:
        BadRequestError: if ``node_size`` is not a key of ``self._get_node_size_to_spec()``.
    """
    # Current per-size info is fetched first, then the size is validated.
    pool_info = self._get_node_size_to_info()
    if node_size not in self._get_node_size_to_spec():
        raise BadRequestError(f"Invalid node_size '{node_size}'")

    # No info for this size yet: build a new pool.
    if node_size not in pool_info:
        self._build_node_pool(replicas=replicas, node_size=node_size)
        return

    # Already at the desired count: nothing to do.
    if pool_info[node_size]["count"] == replicas:
        logger.warning_yellow("Replica is match, no create or delete")
        return

    self._scale_node_pool(replicas=replicas, node_size=node_size, node_size_to_info=pool_info)
def node_leave(cluster_name: str, node_name: str, **kwargs):
    """Make a node leave a grass/on-premises cluster.

    Args:
        cluster_name (str): name of the cluster.
        node_name (str): name of the node that leaves the cluster.
        **kwargs: extra arguments; accepted but not used here.

    Returns:
        None.

    Raises:
        BadRequestError: if the cluster's mode is not "grass/on-premises".
    """
    cluster_details = load_cluster_details(cluster_name)
    if cluster_details["mode"] != "grass/on-premises":
        # The message names the leave operation (it used to say "join") and
        # reports the offending mode, as node_join does.
        raise BadRequestError(f"Node leave cluster interrupted: Invalid mode: {cluster_details['mode']}")

    executor = GrassOnPremisesExecutor(cluster_name)
    executor.node_leave_cluster(node_name)
def delete(cluster_name: str, **kwargs):
    """Delete a cluster running in "k8s/aks" mode.

    Args:
        cluster_name (str): name of the cluster.
        **kwargs: extra arguments; accepted but not used here.

    Raises:
        BadRequestError: if the cluster's mode is not "k8s/aks".
    """
    mode = load_cluster_details(cluster_name=cluster_name)["mode"]
    if mode != "k8s/aks":
        raise BadRequestError(f"Unsupported command in mode '{mode}'.")

    K8sAksExecutor(cluster_name=cluster_name).delete()
def stop_node(cluster_name: str, replicas: int, node_size: str, **kwargs):
    """Stop nodes of a cluster running in "grass/azure" mode.

    Args:
        cluster_name (str): name of the cluster.
        replicas (int): forwarded to the executor's ``stop_node``.
        node_size (str): forwarded to the executor's ``stop_node``.
        **kwargs: extra arguments; accepted but not used here.

    Raises:
        BadRequestError: if the cluster's mode is not "grass/azure".
    """
    mode = load_cluster_details(cluster_name=cluster_name)["mode"]
    if mode != "grass/azure":
        raise BadRequestError(f"Unsupported command in mode '{mode}'.")

    azure_executor = GrassAzureExecutor(cluster_name=cluster_name)
    azure_executor.stop_node(replicas=replicas, node_size=node_size)
def remove_data(cluster_name: str, remote_path: str, **kwargs):
    """Remove data of a cluster running in "k8s/aks" mode.

    Args:
        cluster_name (str): name of the cluster.
        remote_path (str): forwarded to the executor's ``remove_data``.
        **kwargs: extra arguments; accepted but not used here.

    Raises:
        BadRequestError: if the cluster's mode is not "k8s/aks".
    """
    mode = load_cluster_details(cluster_name=cluster_name)["mode"]
    if mode != "k8s/aks":
        raise BadRequestError(f"Unsupported command in mode '{mode}'.")

    aks_executor = K8sAksExecutor(cluster_name=cluster_name)
    aks_executor.remove_data(remote_path=remote_path)
def status(cluster_name: str, resource_name: str, **kwargs):
    """Query the status of a resource in a grass cluster.

    Both "grass/azure" and "grass/on-premises" modes are served by ``GrassAzureExecutor``.

    Args:
        cluster_name (str): name of the cluster.
        resource_name (str): forwarded to the executor's ``status``.
        **kwargs: extra arguments; accepted but not used here.

    Raises:
        BadRequestError: if the cluster's mode is neither "grass/azure" nor "grass/on-premises".
    """
    mode = load_cluster_details(cluster_name=cluster_name)["mode"]
    if mode not in ("grass/azure", "grass/on-premises"):
        raise BadRequestError(f"Unsupported command in mode '{mode}'.")

    grass_executor = GrassAzureExecutor(cluster_name=cluster_name)
    grass_executor.status(resource_name=resource_name)
def get_job_logs(cluster_name: str, job_name: str, **kwargs):
    """Get the logs of a job in a grass cluster.

    Both "grass/azure" and "grass/on-premises" modes are served by ``GrassAzureExecutor``.

    Args:
        cluster_name (str): name of the cluster.
        job_name (str): forwarded to the executor's ``get_job_logs``.
        **kwargs: extra arguments; accepted but not used here.

    Raises:
        BadRequestError: if the cluster's mode is neither "grass/azure" nor "grass/on-premises".
    """
    # Load details
    mode = load_cluster_details(cluster_name=cluster_name)["mode"]
    if mode not in ("grass/azure", "grass/on-premises"):
        raise BadRequestError(f"Unsupported command in mode '{mode}'.")

    grass_executor = GrassAzureExecutor(cluster_name=cluster_name)
    grass_executor.get_job_logs(job_name=job_name)
def pull_data(cluster_name: str, local_path: str, remote_path: str, **kwargs):
    """Pull data from a grass cluster.

    Both "grass/azure" and "grass/on-premises" modes are served by ``GrassAzureExecutor``.

    Args:
        cluster_name (str): name of the cluster.
        local_path (str): forwarded to the executor's ``pull_data``.
        remote_path (str): forwarded to the executor's ``pull_data``.
        **kwargs: extra arguments; accepted but not used here.

    Raises:
        BadRequestError: if the cluster's mode is neither "grass/azure" nor "grass/on-premises".
    """
    mode = load_cluster_details(cluster_name=cluster_name)["mode"]
    if mode not in ("grass/azure", "grass/on-premises"):
        raise BadRequestError(f"Unsupported command in mode '{mode}'.")

    grass_executor = GrassAzureExecutor(cluster_name=cluster_name)
    grass_executor.pull_data(local_path=local_path, remote_path=remote_path)
def start_job(cluster_name: str, deployment_path: str, **kwargs):
    """Start a job in a grass cluster.

    Both "grass/azure" and "grass/on-premises" modes are served by ``GrassAzureExecutor``.

    Args:
        cluster_name (str): name of the cluster.
        deployment_path (str): forwarded to the executor's ``start_job``.
        **kwargs: extra arguments; accepted but not used here.

    Raises:
        BadRequestError: if the cluster's mode is neither "grass/azure" nor "grass/on-premises".
    """
    # Load details
    mode = load_cluster_details(cluster_name=cluster_name)["mode"]
    if mode not in ("grass/azure", "grass/on-premises"):
        raise BadRequestError(f"Unsupported command in mode '{mode}'.")

    grass_executor = GrassAzureExecutor(cluster_name=cluster_name)
    grass_executor.start_job(deployment_path=deployment_path)
def push_image(self, image_name: str, image_path: str, remote_context_path: str, remote_image_name: str) -> None:
    """Push docker image from local to the MARO Cluster.

    ``image_name`` takes precedence over ``image_path``. The image file is staged
    under the cluster's local "image_files" folder, and the upload is skipped when
    the master already reports the same md5 checksum for that file name.

    Args:
        image_name (str): name of the image.
        image_path (str): path of the image file.
        remote_context_path (str): path of the remote context (for remote build).
            Not used by this implementation.
        remote_image_name (str): name of the image (for remote build).
            Not used by this implementation.

    Returns:
        None.

    Raises:
        BadRequestError: if neither ``image_name`` nor ``image_path`` is given.
    """
    # Push image TODO: design a new paradigm for remote build
    if not (image_name or image_path):
        raise BadRequestError("Invalid arguments")

    local_image_dir = f"{GlobalPaths.ABS_MARO_CLUSTERS}/{self.cluster_name}/image_files"
    if image_name:
        # Push image from local docker client.
        new_file_name = NameCreator.get_valid_file_name(image_name)
        abs_image_path = f"{local_image_dir}/{new_file_name}"
        DockerController.save_image(image_name=image_name, abs_export_path=abs_image_path)
    else:
        # Push image from local image file.
        file_name = os.path.basename(image_path)
        new_file_name = NameCreator.get_valid_file_name(file_name)
        abs_image_path = f"{local_image_dir}/{new_file_name}"
        FileSynchronizer.copy_and_rename(
            source_path=image_path,
            target_dir=local_image_dir,
            new_name=new_file_name,
        )

    # Use md5_checksum to skip existed image file.
    remote_image_file_details = self.master_api_client.get_image_file(image_file_name=new_file_name)
    local_md5_checksum = self._get_md5_checksum(path=abs_image_path)
    if (
        "md5_checksum" in remote_image_file_details
        and remote_image_file_details["md5_checksum"] == local_md5_checksum
    ):
        logger.info_green(f"The image file '{new_file_name}' already exists")
        return

    FileSynchronizer.copy_files_to_node(
        local_path=abs_image_path,
        remote_dir=f"{GlobalPaths.MARO_SHARED}/clusters/{self.cluster_name}/image_files",
        node_username=self.master_username,
        node_hostname=self.master_public_ip_address,
        node_ssh_port=self.master_ssh_port,
    )
    self.master_api_client.create_image_file(
        image_file_details={
            "name": new_file_name,
            "md5_checksum": local_md5_checksum,
        }
    )
    # image_name is empty when the image came from a file; report the path instead.
    logger.info_green(f"Image {image_name or image_path} is loaded")
def get_job_logs(cluster_name: str, job_name: str, **kwargs):
    """Get the logs of a job in a cluster running in "k8s/aks" mode.

    Args:
        cluster_name (str): name of the cluster.
        job_name (str): forwarded to the executor's ``get_job_logs``.
        **kwargs: extra arguments; accepted but not used here.

    Raises:
        BadRequestError: if the cluster's mode is not "k8s/aks".
    """
    # Load details
    mode = load_cluster_details(cluster_name=cluster_name)["mode"]
    if mode != "k8s/aks":
        raise BadRequestError(f"Unsupported command in mode '{mode}'.")

    aks_executor = K8sAksExecutor(cluster_name=cluster_name)
    aks_executor.get_job_logs(job_name=job_name)
def start_job(cluster_name: str, deployment_path: str, **kwargs):
    """Start a job in a cluster running in "k8s/aks" mode.

    Args:
        cluster_name (str): name of the cluster.
        deployment_path (str): forwarded to the executor's ``start_job``.
        **kwargs: extra arguments; accepted but not used here.

    Raises:
        BadRequestError: if the cluster's mode is not "k8s/aks".
    """
    # Load details
    mode = load_cluster_details(cluster_name=cluster_name)["mode"]
    if mode != "k8s/aks":
        raise BadRequestError(f"Unsupported command in mode '{mode}'.")

    aks_executor = K8sAksExecutor(cluster_name=cluster_name)
    aks_executor.start_job(deployment_path=deployment_path)
def stop_schedule(cluster_name: str, schedule_name: str, **kwargs):
    """Stop a schedule in a cluster running in "k8s/aks" mode.

    Args:
        cluster_name (str): name of the cluster.
        schedule_name (str): forwarded to the executor's ``stop_schedule``.
        **kwargs: extra arguments; accepted but not used here.

    Raises:
        BadRequestError: if the cluster's mode is not "k8s/aks".
    """
    # Load details
    mode = load_cluster_details(cluster_name=cluster_name)["mode"]
    if mode != "k8s/aks":
        raise BadRequestError(f"Unsupported command in mode '{mode}'.")

    aks_executor = K8sAksExecutor(cluster_name=cluster_name)
    aks_executor.stop_schedule(schedule_name=schedule_name)
def clean(cluster_name: str, **kwargs):
    """Clean a cluster running in "grass/azure" mode.

    Args:
        cluster_name (str): name of the cluster.
        **kwargs: extra arguments; accepted but not used here.

    Raises:
        BadRequestError: if the cluster's mode is not "grass/azure".
    """
    # Load details
    mode = load_cluster_details(cluster_name=cluster_name)["mode"]
    if mode != "grass/azure":
        raise BadRequestError(f"Unsupported command in mode '{mode}'.")

    GrassAzureExecutor(cluster_name=cluster_name).clean()
def stop_schedule(cluster_name: str, schedule_name: str, **kwargs):
    """Stop a schedule in a grass cluster.

    Both "grass/azure" and "grass/on-premises" modes are served by ``GrassAzureExecutor``.

    Args:
        cluster_name (str): name of the cluster.
        schedule_name (str): forwarded to the executor's ``stop_schedule``.
        **kwargs: extra arguments; accepted but not used here.

    Raises:
        BadRequestError: if the cluster's mode is neither "grass/azure" nor "grass/on-premises".
    """
    # Load details
    mode = load_cluster_details(cluster_name=cluster_name)["mode"]
    if mode not in ("grass/azure", "grass/on-premises"):
        raise BadRequestError(f"Unsupported command in mode '{mode}'.")

    grass_executor = GrassAzureExecutor(cluster_name=cluster_name)
    grass_executor.stop_schedule(schedule_name=schedule_name)
def scale_node(cluster_name: str, replicas: int, node_size: str, **kwargs):
    """Scale the nodes of a cluster running in "k8s/aks" mode.

    Args:
        cluster_name (str): name of the cluster.
        replicas (int): forwarded to the executor's ``scale_node``.
        node_size (str): forwarded to the executor's ``scale_node``.
        **kwargs: extra arguments; accepted but not used here.

    Raises:
        BadRequestError: if the cluster's mode is not "k8s/aks".
    """
    mode = load_cluster_details(cluster_name=cluster_name)["mode"]
    if mode != "k8s/aks":
        raise BadRequestError(f"Unsupported command in mode '{mode}'.")

    aks_executor = K8sAksExecutor(cluster_name=cluster_name)
    aks_executor.scale_node(replicas=replicas, node_size=node_size)
def create(self):
    """Create the cluster described by ``self.cluster_details``.

    Registers the cluster in the resource Redis, allocates the master's resource
    from the available resource, starts the agents, records the cluster's
    available resource in Redis and saves the cluster details locally.

    Returns:
        None.

    Raises:
        BadRequestError: if the cluster folder already exists, or if ``resource_op``
            reports the requested resource cannot be allocated.
    """
    logger.info("Creating cluster")

    # A folder for this cluster name means the cluster was already created.
    cluster_dir = f"{GlobalPaths.ABS_MARO_CLUSTERS}/{self.cluster_name}"
    if os.path.isdir(cluster_dir):
        raise BadRequestError(f"Cluster '{self.cluster_name}' is exist.")

    # Register this cluster in the resource Redis.
    self._resource_redis.add_cluster()

    # Try to allocate the requested resource out of what is available.
    requested_resource = self.cluster_details["master"]["resource"]
    enough_resource, remaining_resource = resource_op(
        self._resource_redis.get_available_resource(),
        requested_resource,
        ResourceOperation.ALLOCATION,
    )
    if not enough_resource:
        # Undo the registration before reporting the failure.
        self._resource_redis.sub_cluster()
        raise BadRequestError("No enough resource for this cluster.")
    self._resource_redis.set_available_resource(remaining_resource)

    # Start agents.
    self._agents_start()

    # Record the cluster's own available resource.
    runtime_key = f"{self.cluster_name}:runtime_detail"
    self._redis_connection.hset(runtime_key, "available_resource", json.dumps(requested_resource))

    # Save cluster config locally.
    DetailsWriter.save_cluster_details(cluster_name=self.cluster_name, cluster_details=self.cluster_details)

    logger.info(f"{self.cluster_name} is created.")
def delete(cluster_name: str, **kwargs):
    """Delete a grass cluster with the executor matching its mode.

    Args:
        cluster_name (str): name of the cluster.
        **kwargs: extra arguments; accepted but not used here.

    Raises:
        BadRequestError: if the cluster's mode is neither "grass/azure" nor "grass/on-premises".
    """
    mode = load_cluster_details(cluster_name=cluster_name)["mode"]

    # Pick the executor class first, then run the shared delete call once.
    if mode == "grass/azure":
        executor_cls = GrassAzureExecutor
    elif mode == "grass/on-premises":
        executor_cls = GrassOnPremisesExecutor
    else:
        raise BadRequestError(f"Unsupported command in mode '{mode}'.")

    executor_cls(cluster_name=cluster_name).delete()
def push_image(cluster_name: str, image_name: str, image_path: str, remote_context_path: str, remote_image_name: str, **kwargs):
    """Push an image to a grass cluster.

    Both "grass/azure" and "grass/on-premises" modes are served by ``GrassAzureExecutor``.

    Args:
        cluster_name (str): name of the cluster.
        image_name (str): forwarded to the executor's ``push_image``.
        image_path (str): forwarded to the executor's ``push_image``.
        remote_context_path (str): forwarded to the executor's ``push_image``.
        remote_image_name (str): forwarded to the executor's ``push_image``.
        **kwargs: extra arguments; accepted but not used here.

    Raises:
        BadRequestError: if the cluster's mode is neither "grass/azure" nor "grass/on-premises".
    """
    mode = load_cluster_details(cluster_name=cluster_name)["mode"]
    if mode not in ("grass/azure", "grass/on-premises"):
        raise BadRequestError(f"Unsupported command in mode '{mode}'.")

    grass_executor = GrassAzureExecutor(cluster_name=cluster_name)
    grass_executor.push_image(
        image_name=image_name,
        image_path=image_path,
        remote_context_path=remote_context_path,
        remote_image_name=remote_image_name,
    )
def delete(cluster_name: str, **kwargs):
    """Delete a cluster running in "k8s/aks" mode.

    Args:
        cluster_name (str): name of the cluster.
        **kwargs: extra arguments; accepted but not used here.

    Raises:
        BadRequestError: if the cluster's mode is not "k8s/aks".
    """
    # Late import.
    from maro.cli.k8s.executors.k8s_aks_executor import K8sAksExecutor
    from maro.cli.utils.details_reader import DetailsReader
    from maro.utils.exception.cli_exception import BadRequestError

    mode = DetailsReader.load_cluster_details(cluster_name=cluster_name)["mode"]
    if mode != "k8s/aks":
        raise BadRequestError(f"Unsupported operation in mode '{mode}'.")

    K8sAksExecutor(cluster_name=cluster_name).delete()
def create(create_deployment: dict):
    """Create MARO Cluster with create_deployment.

    Initializes the master and the user, then asks the master API server to
    create the master and the cluster. On any failure the local cluster details
    folder is removed and the original exception is re-raised.

    Args:
        create_deployment (dict): create_deployment of grass/on-premises.
            See lib/deployments/internal for reference.

    Returns:
        None.

    Raises:
        BadRequestError: if a folder for the cluster name already exists.
        Exception: whatever the creation steps raise, re-raised after cleanup.
    """
    logger.info("Creating cluster")

    # Get standardized cluster_details
    cluster_details = GrassOnPremisesExecutor._standardize_cluster_details(create_deployment=create_deployment)
    cluster_name = cluster_details["name"]
    cluster_dir = f"{GlobalPaths.ABS_MARO_CLUSTERS}/{cluster_name}"
    if os.path.isdir(cluster_dir):
        raise BadRequestError(f"Cluster '{cluster_name}' is exist")

    # Start creating
    try:
        GrassOnPremisesExecutor._init_master(cluster_details=cluster_details)
        GrassOnPremisesExecutor._create_user(cluster_details=cluster_details)

        # Remote create master, cluster after initialization
        user_details = cluster_details["user"]
        master_api_client = MasterApiClientV1(
            master_hostname=cluster_details["master"]["public_ip_address"],
            master_api_server_port=cluster_details["master"]["api_server"]["port"],
            user_id=user_details["id"],
            master_to_dev_encryption_private_key=user_details["master_to_dev_encryption_private_key"],
            dev_to_master_encryption_public_key=user_details["dev_to_master_encryption_public_key"],
            dev_to_master_signing_private_key=user_details["dev_to_master_signing_private_key"],
        )
        master_api_client.create_master(master_details=cluster_details["master"])
        master_api_client.create_cluster(cluster_details=cluster_details)
    except Exception:
        # If failed, remove details folder, then re-raise. ignore_errors=True keeps a
        # missing or half-created folder from replacing the real failure with a
        # cleanup error.
        shutil.rmtree(path=cluster_dir, ignore_errors=True)
        logger.error_red(f"Failed to create cluster '{cluster_name}'")
        raise

    logger.info_green(f"Cluster {cluster_name} has been created.")
def clean(cluster_name: str, **kwargs):
    """Clean a cluster running in "grass/azure" mode.

    Args:
        cluster_name (str): name of the cluster.
        **kwargs: extra arguments; accepted but not used here.

    Raises:
        BadRequestError: if the cluster's mode is not "grass/azure".
    """
    # Late import.
    from maro.cli.grass.executors.grass_azure_executor import GrassAzureExecutor
    from maro.cli.utils.details_reader import DetailsReader
    from maro.utils.exception.cli_exception import BadRequestError

    mode = DetailsReader.load_cluster_details(cluster_name=cluster_name)["mode"]
    if mode != "grass/azure":
        raise BadRequestError(f"Unsupported operation in mode '{mode}'.")

    GrassAzureExecutor(cluster_name=cluster_name).clean()
def node_join(node_join_path: str, **kwargs):
    """Make a node join a grass/on-premises cluster.

    The join info is loaded from a YAML file with ``yaml.safe_load``; its "mode"
    must be "grass/on-premises" and its "cluster" names the target cluster.

    Args:
        node_join_path (str): path of the node join info file.
        **kwargs: extra arguments; accepted but not used here.

    Returns:
        None.

    Raises:
        BadRequestError: if the join info's mode is not "grass/on-premises".
        FileOperationError: if a FileNotFoundError is raised while opening the
            file or while the executor runs.
    """
    try:
        # The with-statement closes the file on exit; no explicit close() is needed.
        with open(node_join_path, "r") as fr:
            node_join_info = yaml.safe_load(fr)

        if node_join_info["mode"] != "grass/on-premises":
            raise BadRequestError(
                f"Node join cluster interrupted: Invalid mode: {node_join_info['mode']}"
            )

        executor = GrassOnPremisesExecutor(node_join_info["cluster"])
        executor.node_join_cluster(node_join_info)
    except FileNotFoundError as e:
        # Chain explicitly so the traceback shows which path lookup failed.
        raise FileOperationError("Invalid template file path.") from e
def create(deployment_path: str, **kwargs):
    """Create a "k8s/aks" cluster from a YAML deployment template.

    Args:
        deployment_path (str): path of the deployment template file.
        **kwargs: extra arguments; accepted but not used here.

    Returns:
        None.

    Raises:
        BadRequestError: if the template's mode is not "k8s/aks".
        InvalidDeploymentTemplateError: if a KeyError is raised while reading the
            template or while the executor creates the cluster.
        FileOperationError: if a FileNotFoundError is raised inside the guarded section.
    """
    try:
        with open(deployment_path, "r") as fr:
            create_deployment = yaml.safe_load(fr)

        mode = create_deployment["mode"]
        if mode != "k8s/aks":
            raise BadRequestError(f"Unsupported command in mode '{mode}'.")

        K8sAksExecutor.build_cluster_details(create_deployment=create_deployment)
        executor = K8sAksExecutor(cluster_name=create_deployment["name"])
        executor.create()
    except KeyError as e:
        # Chain explicitly so the traceback shows the KeyError as the direct cause.
        raise InvalidDeploymentTemplateError(f"Missing key '{e.args[0]}'.") from e
    except FileNotFoundError as e:
        raise FileOperationError("Invalid template file path.") from e
def start_schedule(cluster_name: str, deployment_path: str, **kwargs):
    """Start a schedule in a cluster running in "k8s/aks" mode.

    Args:
        cluster_name (str): name of the cluster.
        deployment_path (str): forwarded to the executor's ``start_schedule``.
        **kwargs: extra arguments; accepted but not used here.

    Raises:
        BadRequestError: if the cluster's mode is not "k8s/aks".
    """
    # Late import.
    from maro.cli.k8s.executors.k8s_aks_executor import K8sAksExecutor
    from maro.cli.utils.details_reader import DetailsReader
    from maro.utils.exception.cli_exception import BadRequestError

    # Load details
    mode = DetailsReader.load_cluster_details(cluster_name=cluster_name)["mode"]
    if mode != "k8s/aks":
        raise BadRequestError(f"Unsupported operation in mode '{mode}'.")

    aks_executor = K8sAksExecutor(cluster_name=cluster_name)
    aks_executor.start_schedule(deployment_path=deployment_path)