def affinitize_task_to_master(spark_client, cluster_id, task):
    """Give ``task`` an affinity hint for the cluster's master node.

    The cluster is looked up through ``spark_client``; its master compute
    node is then fetched from Batch (the cluster id is used as the pool id)
    and that node's affinity id is stored on the task.

    :param spark_client: client exposing ``get_cluster`` and ``batch_client``
    :param cluster_id: id of the cluster, also used as the Batch pool id
    :param task: task object; its ``affinity_info`` attribute is overwritten
    :return: the same ``task`` object that was passed in
    :raises AztkError: if the cluster does not have a master node id yet
    """
    master_id = spark_client.get_cluster(cluster_id).master_node_id
    if master_id is None:
        raise AztkError("Master has not yet been selected. Please wait until the cluster is finished provisioning.")

    node_operations = spark_client.batch_client.compute_node
    master = node_operations.get(pool_id=cluster_id, node_id=master_id)

    affinity = batch_models.AffinityInformation(affinity_id=master.affinity_id)
    task.affinity_info = affinity
    return task
def affinitize_task_to_master(spark_client, cluster_id, task):
    """Give ``task`` an affinity hint for the cluster's master node.

    Fetches the cluster via ``spark_client``, retrieves its master compute
    node from Batch (using the cluster id as the pool id) and copies that
    node's affinity id onto the task.

    :param spark_client: client exposing ``get_cluster`` and ``batch_client``
    :param cluster_id: id of the cluster, also used as the Batch pool id
    :param task: task object; its ``affinity_info`` attribute is overwritten
    :return: the same ``task`` object that was passed in
    """
    master_id = spark_client.get_cluster(cluster_id).master_node_id

    node_operations = spark_client.batch_client.compute_node
    master = node_operations.get(pool_id=cluster_id, node_id=master_id)

    affinity = batch_models.AffinityInformation(affinity_id=master.affinity_id)
    task.affinity_info = affinity
    return task
def affinitize_task_to_master(batch_client, cluster_id, task):
    """Give ``task`` an affinity hint for the master node of a cluster.

    The pool is fetched from Batch, ``get_master_node_id`` resolves the
    master node id from it, the master compute node is retrieved, and that
    node's affinity id is stored on the task.

    :param batch_client: Batch service client (``pool`` and ``compute_node``
        operations are used)
    :param cluster_id: id of the cluster, used as the Batch pool id
    :param task: task object; its ``affinity_info`` attribute is overwritten
    :return: the same ``task`` object that was passed in
    """
    # Look the pool up by the id the caller passed in. The node lookup below
    # already treats cluster_id as the pool id; reading the pool from a
    # module-level config instead could resolve the master of another pool.
    pool = batch_client.pool.get(cluster_id)
    master_node_id = get_master_node_id(pool)

    master_node = batch_client.compute_node.get(
        pool_id=cluster_id, node_id=master_node_id)
    task.affinity_info = batch_models.AffinityInformation(
        affinity_id=master_node.affinity_id)
    return task
def designate_master_docker_swarm_node(batch_client, pool_id, nodes, job_id):
    """Designate a master docker swarm node by selecting a node in the
    pool to be the swarm manager. This is accomplished via IP selection in
    the pool of nodes and running the swarm init command via an
    affinitized task. This is for Docker 1.12+.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param str pool_id: The id of the pool.
    :param list nodes: list of `batchserviceclient.models.ComputeNode`
    :param str job_id: The id of the job to create.
    :rtype: tuple
    :return: ((master ipaddress, master node id), swarm token)
    :raises ValueError: if `nodes` is empty
    :raises RuntimeError: if the swarm-manager task wrote nothing to stdout
    """
    def _ipv4_sort_key(node):
        # Compare addresses octet by octet as integers; a plain string sort
        # would rank '10.0.0.10' below '10.0.0.4'.
        return tuple(int(octet) for octet in node.ip_address.split('.'))

    # designate the lowest ip address node as the master
    ordered_nodes = sorted(nodes, key=_ipv4_sort_key)
    if not ordered_nodes:
        raise ValueError(
            'at least one compute node is required to designate a master')
    master = ordered_nodes[0]
    master_node_ip_address = master.ip_address
    master_node_id = master.id
    master_node_affinity_id = master.affinity_id
    master_node = (master_node_ip_address, master_node_id)
    print('master node is: {}'.format(master_node))

    # create job
    job = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id))
    batch_client.job.add(job)

    # add docker swarm manage as an affinitized task to run on the master node
    # NOTE: task affinity is weak. if the node has no available scheduling
    # slots, the task may be executed on a different node. for this example,
    # it is not an issue since this node should be available for scheduling.
    task_commands = [
        'docker swarm init --advertise-addr {}'.format(master_node_ip_address),
        'docker swarm join-token -q worker',
    ]
    print('initializing docker swarm cluster via Azure Batch task...')
    task = batchmodels.TaskAddParameter(
        id='swarm-manager',
        affinity_info=batchmodels.AffinityInformation(
            affinity_id=master_node_affinity_id),
        command_line=common.helpers.wrap_commands_in_shell(
            'linux', task_commands),
        run_elevated=True,
    )
    batch_client.task.add(job_id=job.id, task=task)

    # wait for task to complete
    common.helpers.wait_for_tasks_to_complete(
        batch_client, job_id, datetime.timedelta(minutes=5))

    # retrieve the swarm token: the last command run is
    # `docker swarm join-token -q worker`, so the token is taken from the
    # last line of the task's stdout
    stdout = common.helpers.read_task_file_as_string(
        batch_client, job.id, task.id,
        common.helpers._STANDARD_OUT_FILE_NAME)
    stdout_lines = stdout.splitlines()
    if not stdout_lines:
        raise RuntimeError(
            'swarm-manager task produced no output; '
            'unable to read the swarm join token')
    token = stdout_lines[-1].strip()
    print('swarm token: {}'.format(token))

    return master_node, token
def designate_master_docker_swarm_node(batch_client, pool_id, nodes, job_id):
    """Designate a master docker swarm node by selecting a node in the
    pool to be the swarm manager. This is accomplished via IP selection in
    the pool of nodes and running the swarm manage command via an
    affinitized task.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param str pool_id: The id of the pool the job runs on.
    :param list nodes: list of `batchserviceclient.models.ComputeNode`
    :param str job_id: The id of the job to create.
    :rtype: tuple
    :return: (ipaddress, master node id)
    :raises ValueError: if `nodes` is empty
    """
    def _ipv4_sort_key(node):
        # Compare addresses octet by octet as integers; a plain string sort
        # would rank '10.0.0.10' below '10.0.0.4' and pick the wrong master.
        return tuple(int(octet) for octet in node.ip_address.split('.'))

    # designate the lowest ip address node as the master
    nodes = sorted(nodes, key=_ipv4_sort_key)
    if not nodes:
        raise ValueError(
            'at least one compute node is required to designate a master')
    master_node_ip_address = nodes[0].ip_address
    master_node_id = nodes[0].id
    master_node_affinity_id = nodes[0].affinity_id
    master_node = (master_node_ip_address, master_node_id)

    # create a node list based on the number of nodes and master
    # NOTE: the range starts at the master's last octet, so it is only right
    # when the master really has the numerically lowest address. It also
    # assumes contiguous addresses in 10.0.0.x -- TODO confirm for the pool's
    # network configuration.
    lastoctet = int(master_node_ip_address.split('.')[-1])
    nodelist = '10.0.0.[{}:{}]'.format(lastoctet, lastoctet + len(nodes) - 1)
    print('master node is: {} nodelist is: {}'.format(master_node, nodelist))

    # create job
    job = batchmodels.JobAddParameter(
        id=job_id,
        pool_info=batchmodels.PoolInformation(pool_id=pool_id))
    batch_client.job.add(job)

    # add docker swarm manage as an affinitized task to run on the master node
    task_commands = [
        ('docker run -d -p 3375:3375 -t swarm manage -H tcp://0.0.0.0:3375 '
         '"nodes://{}:2375"').format(nodelist)
    ]
    print('creating docker swarm cluster via Azure Batch task...')
    task = batchmodels.TaskAddParameter(
        id="swarm-master",
        # Affinity needs the node's affinity id, not its node id.
        affinity_info=batchmodels.AffinityInformation(
            affinity_id=master_node_affinity_id),
        command_line=common.helpers.wrap_commands_in_shell(
            'linux', task_commands),
        run_elevated=True,
    )
    batch_client.task.add(job_id=job.id, task=task)

    # wait for task to complete
    common.helpers.wait_for_tasks_to_complete(
        batch_client, job_id, datetime.timedelta(minutes=5))
    print('docker swarm cluster created.')
    return master_node
def add_nodes_to_swarm(batch_client, pool_id, nodes, job_id, master_node,
                       swarm_token):
    """Join every non-master compute node to the docker swarm.

    One affinitized `swarm-join-NNN` task is added to the job per node whose
    id differs from the master's, then the call waits for the job's tasks
    to complete.

    :param batch_client: The batch client to use.
    :type batch_client: `batchserviceclient.BatchServiceClient`
    :param str pool_id: The id of the pool (not used by this function).
    :param list nodes: list of `batchserviceclient.models.ComputeNode`
    :param str job_id: The id of the job the tasks are added to.
    :param tuple master_node: master node info; index 0 is used as the
        address to join, index 1 as the master's node id
    :param str swarm_token: swarm token
    """
    join_command = 'docker swarm join --token {} {}:2377'.format(
        swarm_token, master_node[0])
    task_commands = [join_command]
    print('joining docker swarm for each compute node via Azure Batch task...')

    # manager node is already part of the swarm, so it is filtered out;
    # the generator keeps the master id lookup lazy, one check per node
    workers = (node for node in nodes if node.id != master_node[1])
    for index, worker in enumerate(workers):
        affinity = batchmodels.AffinityInformation(
            affinity_id=worker.affinity_id)
        join_task = batchmodels.TaskAddParameter(
            id='swarm-join-{0:03d}'.format(index),
            affinity_info=affinity,
            command_line=common.helpers.wrap_commands_in_shell(
                'linux', task_commands),
            run_elevated=True,
        )
        batch_client.task.add(job_id=job_id, task=join_task)

    # block until the join tasks have finished
    timeout = datetime.timedelta(minutes=5)
    common.helpers.wait_for_tasks_to_complete(batch_client, job_id, timeout)
    print('docker swarm cluster created.')
def submit_application(spark_client, cluster_id, application, wait: bool = False):
    """
    Submit a spark app

    Uploads the application file and its dependent jars, python files and
    other files to a blob container named after the application, builds the
    submit command, and adds a Batch task (affinitized to the cluster's
    master node) to the job whose id equals ``cluster_id``.

    :param spark_client: client exposing ``blob_client``, ``batch_client``
        and ``get_cluster``
    :param cluster_id: id of the cluster; also used as the Batch job id and
        pool id
    :param application: application description (name, application file,
        args, jars, py_files, files and driver/executor settings are read)
    :param wait: when true, block until the submitted task completes
    """
    def _upload(path):
        # Every artifact goes to the same container, named after the app.
        return helpers.upload_file_to_container(
            container_name=application.name,
            file_path=path,
            blob_client=spark_client.blob_client,
            use_full_path=False)

    # Upload application file, then dependent jars, python files and
    # other files (upload order is preserved in resource_files).
    app_resource_file = _upload(application.application)
    jar_resource_files = [_upload(jar) for jar in application.jars]
    py_resource_files = [_upload(py_file) for py_file in application.py_files]
    other_resource_files = [_upload(path) for path in application.files]

    resource_files = (
        [app_resource_file]
        + jar_resource_files
        + py_resource_files
        + other_resource_files
    )

    # create command to submit task
    cmd = __app_submit_cmd(
        spark_client=spark_client,
        cluster_id=cluster_id,
        name=application.name,
        app=app_resource_file.file_path,
        app_args=application.application_args,
        main_class=application.main_class,
        jars=[resource.file_path for resource in jar_resource_files],
        py_files=[resource.file_path for resource in py_resource_files],
        files=[resource.file_path for resource in other_resource_files],
        driver_java_options=application.driver_java_options,
        driver_library_path=application.driver_library_path,
        driver_class_path=application.driver_class_path,
        driver_memory=application.driver_memory,
        executor_memory=application.executor_memory,
        driver_cores=application.driver_cores,
        executor_cores=application.executor_cores)

    # Affinitize task to master node. Affinity needs the node's affinity id,
    # not its node id, so fetch the master compute node to read it.
    cluster = spark_client.get_cluster(cluster_id)
    master_node = spark_client.batch_client.compute_node.get(
        pool_id=cluster_id, node_id=cluster.master_node_id)

    # Create task
    task = batch_models.TaskAddParameter(
        id=application.name,
        affinity_info=batch_models.AffinityInformation(
            affinity_id=master_node.affinity_id),
        command_line=helpers.wrap_commands_in_shell(cmd),
        resource_files=resource_files,
        user_identity=batch_models.UserIdentity(
            auto_user=batch_models.AutoUserSpecification(
                scope=batch_models.AutoUserScope.task,
                elevation_level=batch_models.ElevationLevel.admin))
    )

    # Add task to batch job (which has the same name as cluster_id)
    job_id = cluster_id
    spark_client.batch_client.task.add(job_id=job_id, task=task)

    if wait:
        helpers.wait_for_task_to_complete(
            job_id=job_id,
            task_id=task.id,
            batch_client=spark_client.batch_client)