Example #1
def test_with_disabled_options():
    cmd = CommandBuilder("ssh")

    cmd.add_option("--verbose", enable=True)
    cmd.add_option("-p", None)
    cmd.add_option("-L", "8080:localhost:8080", enable=False)
    assert cmd.to_str() == "ssh --verbose"
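
The tests here and in Examples #10-#12 and #15 exercise a small CommandBuilder helper but never show its implementation. Below is a minimal sketch that is consistent with what these tests assert (options render before positional arguments, an option with neither a value nor enable=True is dropped, and enable=False suppresses an option). It is an illustration only, not the project's actual class.

class CommandBuilder:
    """Minimal, illustrative shell-command builder matching the tests in this listing."""

    def __init__(self, executable: str):
        self.executable = executable
        self.options = []    # (name, value) pairs, rendered before arguments
        self.arguments = []  # positional arguments, rendered last

    def add_option(self, name: str, value: str = None, enable: bool = None):
        # Drop the option if it is explicitly disabled, or if it has no value
        # and was not explicitly enabled (e.g. add_option("-p", None)).
        if enable is False or (value is None and not enable):
            return
        self.options.append((name, value))

    def add_argument(self, argument: str):
        if argument:
            self.arguments.append(argument)

    def to_str(self) -> str:
        parts = [self.executable]
        for name, value in self.options:
            parts.append("{0} {1}".format(name, value) if value else name)
        parts.extend(self.arguments)
        return " ".join(parts)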
Example #2
def __init__(self,
             name: str,
             docker_repo: str,
             cmd: str,
             gpu_enabled=False):
    if gpu_enabled:
        self.cmd = CommandBuilder('nvidia-docker run')
    else:
        self.cmd = CommandBuilder('docker run')
    self.cmd.add_option('--net', 'host')
    self.cmd.add_option('--name', name)
    self.cmd.add_argument('-d')
    self.cmd.add_argument(docker_repo)
    self.cmd.add_argument(cmd)
Example #3
def __init__(self,
             name: str,
             docker_repo: str,
             docker_run_options: str,
             cmd: str,
             gpu_enabled=False):
    if gpu_enabled:
        self.cmd = CommandBuilder("nvidia-docker run")
    else:
        self.cmd = CommandBuilder("docker run")
    self.cmd.add_option("--net", "host")
    self.cmd.add_option("--name", name)
    self.cmd.add_argument("-d")
    self.cmd.add_argument(docker_run_options)
    self.cmd.add_argument(docker_repo)
    self.cmd.add_argument(cmd)
Example #4
def generate_task(spark_client, container_id, application, remote=False):
    resource_files = []

    # The application provided is not hosted remotely and therefore must be uploaded
    if not remote:
        app_resource_file = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=application.application,
            blob_client=spark_client.blob_client,
            use_full_path=False,
        )

        # Upload application file
        resource_files.append(app_resource_file)

        application.application = "$AZ_BATCH_TASK_WORKING_DIR/" + os.path.basename(
            application.application)

    # Upload dependent JARS
    jar_resource_file_paths = []
    for jar in application.jars:
        current_jar_resource_file_path = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=jar,
            blob_client=spark_client.blob_client,
            use_full_path=False,
        )
        jar_resource_file_paths.append(current_jar_resource_file_path)
        resource_files.append(current_jar_resource_file_path)

    # Upload dependent python files
    py_files_resource_file_paths = []
    for py_file in application.py_files:
        current_py_files_resource_file_path = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=py_file,
            blob_client=spark_client.blob_client,
            use_full_path=False,
        )
        py_files_resource_file_paths.append(
            current_py_files_resource_file_path)
        resource_files.append(current_py_files_resource_file_path)

    # Upload other dependent files
    files_resource_file_paths = []
    for file in application.files:
        files_resource_file_path = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=file,
            blob_client=spark_client.blob_client,
            use_full_path=False,
        )
        files_resource_file_paths.append(files_resource_file_path)
        resource_files.append(files_resource_file_path)

    # Upload application definition
    application.jars = [os.path.basename(jar) for jar in application.jars]
    application.py_files = [
        os.path.basename(py_files) for py_files in application.py_files
    ]
    application.files = [
        os.path.basename(files) for files in application.files
    ]
    application_definition_file = helpers.upload_text_to_container(
        container_name=container_id,
        application_name=application.name,
        file_path="application.yaml",
        content=yaml.dump(vars(application)),
        blob_client=spark_client.blob_client,
    )
    resource_files.append(application_definition_file)

    # create command to submit task
    task_cmd = CommandBuilder("sudo docker exec")
    task_cmd.add_argument("-i")
    task_cmd.add_option(
        "-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR")
    task_cmd.add_option("-e",
                        "STORAGE_LOGS_CONTAINER={0}".format(container_id))
    task_cmd.add_argument("spark /bin/bash >> output.log 2>&1")
    task_cmd.add_argument(
        r'-c "source ~/.bashrc; '
        r"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; "
        r"cd \$AZ_BATCH_TASK_WORKING_DIR; "
        r'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/submit.py"'
    )

    # Create task
    task = batch_models.TaskAddParameter(
        id=application.name,
        command_line=helpers.wrap_commands_in_shell([task_cmd.to_str()]),
        resource_files=resource_files,
        constraints=batch_models.TaskConstraints(
            max_task_retry_count=application.max_retry_count),
        user_identity=batch_models.UserIdentity(
            auto_user=batch_models.AutoUserSpecification(
                scope=batch_models.AutoUserScope.task,
                elevation_level=batch_models.ElevationLevel.admin)),
    )

    return task
Example #5
def __docker_run_cmd(docker_repo: str = None,
                     gpu_enabled: bool = False,
                     worker_on_master: bool = True,
                     file_mounts = None,
                     plugins = None,
                     mixed_mode = False) -> str:
    """
        Build the docker run command by setting up the environment variables
    """
    if gpu_enabled:
        cmd = CommandBuilder('nvidia-docker run')
    else:
        cmd = CommandBuilder('docker run')
    cmd.add_option('--net', 'host')
    cmd.add_option('--name', constants.DOCKER_SPARK_CONTAINER_NAME)
    cmd.add_option('-v', '/mnt/batch/tasks:/mnt/batch/tasks')

    if file_mounts:
        for mount in file_mounts:
            cmd.add_option('-v', '{0}:{0}'.format(mount.mount_path))

    cmd.add_option('-e', 'DOCKER_WORKING_DIR=/mnt/batch/tasks/startup/wd')
    cmd.add_option('-e', 'AZ_BATCH_ACCOUNT_NAME=$AZ_BATCH_ACCOUNT_NAME')
    cmd.add_option('-e', 'BATCH_ACCOUNT_KEY=$BATCH_ACCOUNT_KEY')
    cmd.add_option('-e', 'BATCH_SERVICE_URL=$BATCH_SERVICE_URL')
    cmd.add_option('-e', 'STORAGE_ACCOUNT_NAME=$STORAGE_ACCOUNT_NAME')
    cmd.add_option('-e', 'STORAGE_ACCOUNT_KEY=$STORAGE_ACCOUNT_KEY')
    cmd.add_option('-e', 'STORAGE_ACCOUNT_SUFFIX=$STORAGE_ACCOUNT_SUFFIX')
    cmd.add_option('-e', 'SP_TENANT_ID=$SP_TENANT_ID')
    cmd.add_option('-e', 'SP_CLIENT_ID=$SP_CLIENT_ID')
    cmd.add_option('-e', 'SP_CREDENTIAL=$SP_CREDENTIAL')
    cmd.add_option('-e', 'SP_BATCH_RESOURCE_ID=$SP_BATCH_RESOURCE_ID')
    cmd.add_option('-e', 'SP_STORAGE_RESOURCE_ID=$SP_STORAGE_RESOURCE_ID')
    cmd.add_option('-e', 'AZ_BATCH_POOL_ID=$AZ_BATCH_POOL_ID')
    cmd.add_option('-e', 'AZ_BATCH_NODE_ID=$AZ_BATCH_NODE_ID')
    cmd.add_option(
        '-e', 'AZ_BATCH_NODE_IS_DEDICATED=$AZ_BATCH_NODE_IS_DEDICATED')
    if worker_on_master is not None:
        cmd.add_option('-e', 'WORKER_ON_MASTER={}'.format(worker_on_master))
    else:
        # default to True if not specified
        cmd.add_option('-e', 'WORKER_ON_MASTER={}'.format(True))
    cmd.add_option('-e', 'MIXED_MODE={}'.format(mixed_mode))
    cmd.add_option('-e', 'SPARK_WEB_UI_PORT=$SPARK_WEB_UI_PORT')
    cmd.add_option('-e', 'SPARK_WORKER_UI_PORT=$SPARK_WORKER_UI_PORT')
    cmd.add_option('-e', 'SPARK_CONTAINER_NAME=$SPARK_CONTAINER_NAME')
    cmd.add_option('-e', 'SPARK_SUBMIT_LOGS_FILE=$SPARK_SUBMIT_LOGS_FILE')
    cmd.add_option('-e', 'SPARK_JOB_UI_PORT=$SPARK_JOB_UI_PORT')
    cmd.add_option('-p', '8080:8080')       # Spark Master UI
    cmd.add_option('-p', '7077:7077')       # Spark Master
    cmd.add_option('-p', '7337:7337')       # Spark Shuffle Service
    cmd.add_option('-p', '4040:4040')       # Job UI
    cmd.add_option('-p', '18080:18080')     # Spark History Server UI
    cmd.add_option('-p', '3022:3022')       # Docker SSH
    if plugins:
        for plugin in plugins:
            for port in plugin.ports:
                cmd.add_option('-p', '{0}:{1}'.format(port.internal, port.internal))       # plugin-declared ports (e.g. Jupyter UI)

    cmd.add_option('-d', docker_repo)
    cmd.add_argument('/bin/bash /mnt/batch/tasks/startup/wd/docker_main.sh')

    return cmd.to_str()
Example #6
class DockerCmd:
    """
    Helper class for building a docker run command
    """
    def __init__(self,
                 name: str,
                 docker_repo: str,
                 docker_run_options: str,
                 cmd: str,
                 gpu_enabled=False):
        if gpu_enabled:
            self.cmd = CommandBuilder("nvidia-docker run")
        else:
            self.cmd = CommandBuilder("docker run")
        self.cmd.add_option("--net", "host")
        self.cmd.add_option("--name", name)
        self.cmd.add_argument("-d")
        self.cmd.add_argument(docker_run_options)
        self.cmd.add_argument(docker_repo)
        self.cmd.add_argument(cmd)

    def add_env(self, env: str, value: str):
        self.cmd.add_option("-e", "{0}={1}".format(env, value))

    def pass_env(self, env: str):
        """
        Forward the value of an environment variable from the host process to the docker container
        """
        self.cmd.add_option("-e", "{0}".format(env))

    def share_folder(self, folder: str):
        self.cmd.add_option("-v", "{0}:{0}".format(folder))

    def open_port(self, port: int):
        self.cmd.add_option("-p", "{0}:{0}".format(port))  # Spark Master UI

    def to_str(self):
        return self.cmd.to_str()
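
A hedged usage sketch for the DockerCmd helper above; the image name, run options, environment values, folder, and port are placeholders chosen for illustration and are not taken from the original code.

docker_cmd = DockerCmd(
    name="spark",
    docker_repo="myregistry/spark:latest",   # hypothetical image
    docker_run_options="--rm",               # hypothetical extra docker run options
    cmd="/bin/bash /mnt/batch/tasks/startup/wd/docker_main.sh",
    gpu_enabled=False)
docker_cmd.add_env("MIXED_MODE", "False")
docker_cmd.pass_env("AZ_BATCH_ACCOUNT_NAME")
docker_cmd.share_folder("/mnt/batch/tasks")
docker_cmd.open_port(8080)
print(docker_cmd.to_str())

Because CommandBuilder renders options before positional arguments (as Example #11 below asserts), the -d flag, the run options, the image, and the container command all land at the end of the generated docker run line.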
Example #7
def __app_cmd():
    docker_exec = CommandBuilder("sudo docker exec")
    docker_exec.add_argument("-i")
    docker_exec.add_option(
        "-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR")
    docker_exec.add_option("-e", "AZ_BATCH_JOB_ID=$AZ_BATCH_JOB_ID")
    docker_exec.add_argument(
        "spark /bin/bash >> output.log 2>&1 -c \"python \$DOCKER_WORKING_DIR/job_submission.py\""
    )
    return docker_exec.to_str()
Example #8
def __app_cmd(scheduling_target=None, resource_files=None):
    resource_file_sas_list = None
    if resource_files:
        resource_file_sas_list = ' '.join([
            '\\\"{}\\\"'.format(task_def.blob_source)
            for task_def in resource_files
        ])

    docker_exec = CommandBuilder("sudo docker exec")
    docker_exec.add_argument("-i")
    docker_exec.add_option(
        "-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR")
    docker_exec.add_option("-e", "AZ_BATCH_JOB_ID=$AZ_BATCH_JOB_ID")
    docker_exec.add_argument(
        r'spark /bin/bash >> output.log 2>&1 -c "'
        r"source ~/.bashrc; "
        r"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; "
        r"cd \$AZ_BATCH_TASK_WORKING_DIR; "
        r'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/scheduling/job_submission.py {0} {1}"'
        .format(scheduling_target if scheduling_target else "",
                resource_file_sas_list if resource_file_sas_list else ""))
    return docker_exec.to_str()
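
For illustration only, a sketch of how __app_cmd above might be called; the SimpleNamespace stand-ins and the "master" scheduling-target value are assumptions, not part of the original code (the real resource files come from the Azure Batch SDK and only need a blob_source attribute here).

from types import SimpleNamespace

fake_resource_files = [
    SimpleNamespace(blob_source="https://account.blob.core.windows.net/container/application.yaml"),  # hypothetical SAS URL
]
print(__app_cmd())                               # no scheduling target, no resource files
print(__app_cmd("master", fake_resource_files))  # assumed scheduling-target value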
Example #9
def __docker_run_cmd(docker_repo: str = None, gpu_enabled: bool = False, file_mounts=None) -> str:
    """
        Build the docker run command by setting up the environment variables
    """
    if gpu_enabled:
        cmd = CommandBuilder('nvidia-docker run')
    else:
        cmd = CommandBuilder('docker run')
    cmd.add_option('--net', 'host')
    cmd.add_option('--name', constants.DOCKER_SPARK_CONTAINER_NAME)
    cmd.add_option('-v', '/mnt/batch/tasks:/batch')

    if file_mounts:
        for mount in file_mounts:
            cmd.add_option('-v', '{0}:{0}'.format(mount.mount_path))

    cmd.add_option('-e', 'DOCKER_WORKING_DIR=/batch/startup/wd')
    cmd.add_option('-e', 'AZ_BATCH_ACCOUNT_NAME=$AZ_BATCH_ACCOUNT_NAME')
    cmd.add_option('-e', 'BATCH_ACCOUNT_KEY=$BATCH_ACCOUNT_KEY')
    cmd.add_option('-e', 'BATCH_ACCOUNT_URL=$BATCH_ACCOUNT_URL')
    cmd.add_option('-e', 'STORAGE_ACCOUNT_NAME=$STORAGE_ACCOUNT_NAME')
    cmd.add_option('-e', 'STORAGE_ACCOUNT_KEY=$STORAGE_ACCOUNT_KEY')
    cmd.add_option('-e', 'STORAGE_ACCOUNT_SUFFIX=$STORAGE_ACCOUNT_SUFFIX')
    cmd.add_option('-e', 'AZ_BATCH_POOL_ID=$AZ_BATCH_POOL_ID')
    cmd.add_option('-e', 'AZ_BATCH_NODE_ID=$AZ_BATCH_NODE_ID')
    cmd.add_option(
        '-e', 'AZ_BATCH_NODE_IS_DEDICATED=$AZ_BATCH_NODE_IS_DEDICATED')
    cmd.add_option('-e', 'SPARK_WEB_UI_PORT=$SPARK_WEB_UI_PORT')
    cmd.add_option('-e', 'SPARK_WORKER_UI_PORT=$SPARK_WORKER_UI_PORT')
    cmd.add_option('-e', 'SPARK_JUPYTER_PORT=$SPARK_JUPYTER_PORT')
    cmd.add_option('-e', 'SPARK_JOB_UI_PORT=$SPARK_JOB_UI_PORT')
    cmd.add_option('-p', '8080:8080')       # Spark Master UI
    cmd.add_option('-p', '7077:7077')       # Spark Master
    cmd.add_option('-p', '4040:4040')       # Job UI
    cmd.add_option('-p', '8888:8888')       # Jupyter UI
    cmd.add_option('-p', '8787:8787')       # Rstudio Server
    cmd.add_option('-p', '18080:18080')     # Spark History Server UI
    cmd.add_option('-p', '3022:3022')       # Docker SSH
    cmd.add_option('-p', '8020:8020')       # Namenode IPC: ClientProtocol
    cmd.add_option('-p', '9000:9000')       # Namenode IPC: ClientProtocol
    cmd.add_option('-p', '50010:50010')     # Datanode http data transfer
    cmd.add_option('-p', '50020:50020')     # Datanode IPC metadata operations
    cmd.add_option('-p', '50070:50070')     # Namenode WebUI
    cmd.add_option('-p', '50075:50075')     # DataNode WebUI
    cmd.add_option('-p', '50090:50090')     # Secondary NameNode http address
    cmd.add_option('-d', docker_repo)
    cmd.add_argument('/bin/bash /batch/startup/wd/docker_main.sh')
    
    return cmd.to_str()
Example #10
def test_only_command():
    cmd = CommandBuilder("ssh")
    assert cmd.to_str() == "ssh"
Example #11
def test_with_arg_and_option():
    cmd = CommandBuilder("ssh")
    cmd.add_argument("[email protected]")
    cmd.add_option("-p", "2020")
    assert cmd.to_str() == "ssh -p 2020 [email protected]"
Example #12
def test_with_multiple_options():
    cmd = CommandBuilder("ssh")
    cmd.add_option("-L", "8080:localhost:8080")
    cmd.add_option("-p", "2020")
    assert cmd.to_str() == "ssh -L 8080:localhost:8080 -p 2020"
Example #13
def __docker_run_cmd(docker_repo: str = None,
                     gpu_enabled: bool = False,
                     worker_on_master: bool = True,
                     file_mounts = None,
                     mixed_mode = False) -> str:
    """
        Build the docker run command by setting up the environment variables
    """
    if gpu_enabled:
        cmd = CommandBuilder('nvidia-docker run')
    else:
        cmd = CommandBuilder('docker run')
    cmd.add_option('--net', 'host')
    cmd.add_option('--name', constants.DOCKER_SPARK_CONTAINER_NAME)
    cmd.add_option('-v', '/mnt/batch/tasks:/mnt/batch/tasks')

    if file_mounts:
        for mount in file_mounts:
            cmd.add_option('-v', '{0}:{0}'.format(mount.mount_path))

    cmd.add_option('-e', 'DOCKER_WORKING_DIR=/mnt/batch/tasks/startup/wd')
    cmd.add_option('-e', 'AZ_BATCH_ACCOUNT_NAME=$AZ_BATCH_ACCOUNT_NAME')
    cmd.add_option('-e', 'BATCH_ACCOUNT_KEY=$BATCH_ACCOUNT_KEY')
    cmd.add_option('-e', 'BATCH_SERVICE_URL=$BATCH_SERVICE_URL')
    cmd.add_option('-e', 'STORAGE_ACCOUNT_NAME=$STORAGE_ACCOUNT_NAME')
    cmd.add_option('-e', 'STORAGE_ACCOUNT_KEY=$STORAGE_ACCOUNT_KEY')
    cmd.add_option('-e', 'STORAGE_ACCOUNT_SUFFIX=$STORAGE_ACCOUNT_SUFFIX')
    cmd.add_option('-e', 'SP_TENANT_ID=$SP_TENANT_ID')
    cmd.add_option('-e', 'SP_CLIENT_ID=$SP_CLIENT_ID')
    cmd.add_option('-e', 'SP_CREDENTIAL=$SP_CREDENTIAL')
    cmd.add_option('-e', 'SP_BATCH_RESOURCE_ID=$SP_BATCH_RESOURCE_ID')
    cmd.add_option('-e', 'SP_STORAGE_RESOURCE_ID=$SP_STORAGE_RESOURCE_ID')
    cmd.add_option('-e', 'AZ_BATCH_POOL_ID=$AZ_BATCH_POOL_ID')
    cmd.add_option('-e', 'AZ_BATCH_NODE_ID=$AZ_BATCH_NODE_ID')
    cmd.add_option(
        '-e', 'AZ_BATCH_NODE_IS_DEDICATED=$AZ_BATCH_NODE_IS_DEDICATED')
    if worker_on_master is not None:
        cmd.add_option('-e', 'WORKER_ON_MASTER={}'.format(worker_on_master))
    else:
        # default to True if not specified
        cmd.add_option('-e', 'WORKER_ON_MASTER={}'.format(True))
    cmd.add_option('-e', 'MIXED_MODE={}'.format(mixed_mode))
    cmd.add_option('-e', 'SPARK_WEB_UI_PORT=$SPARK_WEB_UI_PORT')
    cmd.add_option('-e', 'SPARK_WORKER_UI_PORT=$SPARK_WORKER_UI_PORT')
    cmd.add_option('-e', 'SPARK_CONTAINER_NAME=$SPARK_CONTAINER_NAME')
    cmd.add_option('-e', 'SPARK_SUBMIT_LOGS_FILE=$SPARK_SUBMIT_LOGS_FILE')
    cmd.add_option('-e', 'SPARK_JUPYTER_PORT=$SPARK_JUPYTER_PORT')
    cmd.add_option('-e', 'SPARK_JOB_UI_PORT=$SPARK_JOB_UI_PORT')
    cmd.add_option('-p', '8080:8080')       # Spark Master UI
    cmd.add_option('-p', '7077:7077')       # Spark Master
    cmd.add_option('-p', '7337:7337')       # Spark Shuffle Service
    cmd.add_option('-p', '4040:4040')       # Job UI
    cmd.add_option('-p', '8888:8888')       # Jupyter UI
    cmd.add_option('-p', '8787:8787')       # Rstudio Server
    cmd.add_option('-p', '18080:18080')     # Spark History Server UI
    cmd.add_option('-p', '3022:3022')       # Docker SSH
    cmd.add_option('-p', '8020:8020')       # Namenode IPC: ClientProtocol
    cmd.add_option('-p', '9000:9000')       # Namenode IPC: ClientProtocol
    cmd.add_option('-p', '50010:50010')     # Datanode http data transfer
    cmd.add_option('-p', '50020:50020')     # Datanode IPC metadata operations
    cmd.add_option('-p', '50070:50070')     # Namenode WebUI
    cmd.add_option('-p', '50075:50075')     # DataNode WebUI
    cmd.add_option('-p', '50090:50090')     # Secondary NameNode http address
    cmd.add_option('-d', docker_repo)
    cmd.add_argument('/bin/bash /mnt/batch/tasks/startup/wd/docker_main.sh')

    return cmd.to_str()
Example #14
def __app_submit_cmd(
        spark_client,
        cluster_id: str,
        name: str,
        app: str,
        app_args: str,
        main_class: str,
        jars: List[str],
        py_files: List[str],
        files: List[str],
        driver_java_options: str,
        driver_library_path: str,
        driver_class_path: str,
        driver_memory: str,
        executor_memory: str,
        driver_cores: str,
        executor_cores: str):
    cluster = spark_client.get_cluster(cluster_id)
    master_id = cluster.master_node_id
    master_ip = __get_node(spark_client, master_id, cluster_id).ip_address

    spark_home = constants.DOCKER_SPARK_HOME

    # set file paths to correct path on container
    files_path = '/batch/workitems/{0}/{1}/{2}/wd/'.format(cluster_id, "job-1", name)
    jars = [files_path + jar for jar in jars]
    py_files = [files_path + py_file for py_file in py_files]
    files = [files_path + f for f in files]

    # 2>&1 redirects stderr to stdout so both end up in the same file
    spark_submit_cmd = CommandBuilder(
        '{0}/bin/spark-submit'.format(spark_home))
    spark_submit_cmd.add_option(
        '--master', 'spark://{0}:7077'.format(master_ip))
    spark_submit_cmd.add_option('--name', name)
    spark_submit_cmd.add_option('--class', main_class)
    spark_submit_cmd.add_option('--jars', jars and ','.join(jars))
    spark_submit_cmd.add_option('--py-files', py_files and ','.join(py_files))
    spark_submit_cmd.add_option('--files', files and ','.join(files))
    spark_submit_cmd.add_option('--driver-java-options', driver_java_options)
    spark_submit_cmd.add_option('--driver-library-path', driver_library_path)
    spark_submit_cmd.add_option('--driver-class-path', driver_class_path)
    spark_submit_cmd.add_option('--driver-memory', driver_memory)
    spark_submit_cmd.add_option('--executor-memory', executor_memory)
    spark_submit_cmd.add_option('--driver-cores', driver_cores)
    spark_submit_cmd.add_option('--executor-cores', executor_cores)

    spark_submit_cmd.add_argument(
        files_path + app + ' ' +
        ' '.join(['\'' + str(app_arg) + '\'' for app_arg in (app_args or [])]))

    if cluster.gpu_enabled:
        docker_exec_cmd = CommandBuilder('sudo nvidia-docker exec')
    else:
        docker_exec_cmd = CommandBuilder('sudo docker exec')
    
    docker_exec_cmd.add_option('-i', constants.DOCKER_SPARK_CONTAINER_NAME)
    docker_exec_cmd.add_argument('/bin/bash  >> {0} 2>&1 -c \"cd '.format(
        constants.SPARK_SUBMIT_LOGS_FILE) + files_path + '; ' + spark_submit_cmd.to_str() + '\"')

    return [
        docker_exec_cmd.to_str()
    ]
Example #15
def test_with_option():
    cmd = CommandBuilder("ssh")
    cmd.add_option("-L", "8080:localhost:8080")
    assert cmd.to_str() == "ssh -L 8080:localhost:8080"
Example #16
def __app_cmd():
    docker_exec = CommandBuilder("sudo docker exec")
    docker_exec.add_argument("-i")
    docker_exec.add_option("-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR")
    docker_exec.add_option("-e", "AZ_BATCH_JOB_ID=$AZ_BATCH_JOB_ID")
    docker_exec.add_argument(
        r'spark /bin/bash >> output.log 2>&1 -c "'
        r"source ~/.bashrc; "
        r"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; "
        r"cd \$AZ_BATCH_TASK_WORKING_DIR; "
        r'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/job_submission.py"')
    return docker_exec.to_str()
Example #17
def generate_task(spark_client, container_id, application):
    resource_files = []

    app_resource_file = helpers.upload_file_to_container(
        container_name=container_id,
        application_name=application.name,
        file_path=application.application,
        blob_client=spark_client.blob_client,
        use_full_path=False)

    # Upload application file
    resource_files.append(app_resource_file)

    # Upload dependent JARS
    jar_resource_file_paths = []
    for jar in application.jars:
        current_jar_resource_file_path = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=jar,
            blob_client=spark_client.blob_client,
            use_full_path=False)
        jar_resource_file_paths.append(current_jar_resource_file_path)
        resource_files.append(current_jar_resource_file_path)

    # Upload dependent python files
    py_files_resource_file_paths = []
    for py_file in application.py_files:
        current_py_files_resource_file_path = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=py_file,
            blob_client=spark_client.blob_client,
            use_full_path=False)
        py_files_resource_file_paths.append(
            current_py_files_resource_file_path)
        resource_files.append(current_py_files_resource_file_path)

    # Upload other dependent files
    files_resource_file_paths = []
    for file in application.files:
        files_resource_file_path = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=file,
            blob_client=spark_client.blob_client,
            use_full_path=False)
        files_resource_file_paths.append(files_resource_file_path)
        resource_files.append(files_resource_file_path)

    # Upload application definition
    application.application = os.path.basename(application.application)
    application.jars = [os.path.basename(jar) for jar in application.jars]
    application.py_files = [
        os.path.basename(py_files) for py_files in application.py_files
    ]
    application.files = [
        os.path.basename(files) for files in application.files
    ]
    application_definition_file = helpers.upload_text_to_container(
        container_name=container_id,
        application_name=application.name,
        file_path='application.yaml',
        content=yaml.dump(vars(application)),
        blob_client=spark_client.blob_client)
    resource_files.append(application_definition_file)

    # create command to submit task
    task_cmd = CommandBuilder('sudo docker exec')
    task_cmd.add_argument('-i')
    task_cmd.add_option(
        '-e', 'AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR')
    task_cmd.add_option('-e',
                        'STORAGE_LOGS_CONTAINER={0}'.format(container_id))
    task_cmd.add_argument('spark /bin/bash >> output.log 2>&1')
    task_cmd.add_argument('-c "source ~/.bashrc; '\
                          'cd $AZ_BATCH_TASK_WORKING_DIR; ' \
                          '\$(pyenv root)/versions/\$AZTK_PYTHON_VERSION/bin/python ' \
                          '\$DOCKER_WORKING_DIR/aztk/node_scripts/submit.py"')

    # Create task
    task = batch_models.TaskAddParameter(
        id=application.name,
        command_line=helpers.wrap_commands_in_shell([task_cmd.to_str()]),
        resource_files=resource_files,
        constraints=batch_models.TaskConstraints(
            max_task_retry_count=application.max_retry_count),
        user_identity=batch_models.UserIdentity(
            auto_user=batch_models.AutoUserSpecification(
                scope=batch_models.AutoUserScope.task,
                elevation_level=batch_models.ElevationLevel.admin)))

    return task
Example #18
def __app_submit_cmd(
        name: str,
        app: str,
        app_args: List[str],
        main_class: str,
        jars: List[str],
        py_files: List[str],
        files: List[str],
        driver_java_options: str,
        driver_library_path: str,
        driver_class_path: str,
        driver_memory: str,
        executor_memory: str,
        driver_cores: int,
        executor_cores: int,
):
    spark_home = os.environ["SPARK_HOME"]
    with open(os.path.join(spark_home, "conf", "master")) as f:
        master_ip = f.read().rstrip()

    # set file paths to correct path on container
    files_path = os.environ["AZ_BATCH_TASK_WORKING_DIR"]
    jars = [os.path.join(files_path, os.path.basename(jar)) for jar in jars]
    py_files = [os.path.join(files_path, os.path.basename(py_file)) for py_file in py_files]
    files = [os.path.join(files_path, os.path.basename(f)) for f in files]

    # 2>&1 redirects stderr to stdout so both end up in the same file
    spark_submit_cmd = CommandBuilder("{0}/bin/spark-submit".format(spark_home))
    spark_submit_cmd.add_option("--master", "spark://{0}:7077".format(master_ip))
    spark_submit_cmd.add_option("--name", name)
    spark_submit_cmd.add_option("--class", main_class)
    spark_submit_cmd.add_option("--jars", jars and ",".join(jars))
    spark_submit_cmd.add_option("--py-files", py_files and ",".join(py_files))
    spark_submit_cmd.add_option("--files", files and ",".join(files))
    spark_submit_cmd.add_option("--driver-java-options", driver_java_options)
    spark_submit_cmd.add_option("--driver-library-path", driver_library_path)
    spark_submit_cmd.add_option("--driver-class-path", driver_class_path)
    spark_submit_cmd.add_option("--driver-memory", driver_memory)
    spark_submit_cmd.add_option("--executor-memory", executor_memory)
    if driver_cores:
        spark_submit_cmd.add_option("--driver-cores", str(driver_cores))
    if executor_cores:
        spark_submit_cmd.add_option("--executor-cores", str(executor_cores))

    spark_submit_cmd.add_argument(
        os.path.expandvars(app) + " " + " ".join(["'" + str(app_arg) + "'" for app_arg in (app_args or [])]))

    with open("spark-submit.txt", mode="w", encoding="UTF-8") as stream:
        stream.write(spark_submit_cmd.to_str())

    return spark_submit_cmd
Example #19
def __app_submit_cmd(name: str, app: str, app_args: List[str], main_class: str,
                     jars: List[str], py_files: List[str], files: List[str],
                     driver_java_options: str, driver_library_path: str,
                     driver_class_path: str, driver_memory: str,
                     executor_memory: str, driver_cores: int,
                     executor_cores: int):
    cluster_id = os.environ['AZ_BATCH_POOL_ID']
    spark_home = os.environ['SPARK_HOME']
    with open(os.path.join(spark_home, 'conf', 'master')) as f:
        master_ip = f.read().rstrip()

    # set file paths to correct path on container
    files_path = os.environ['AZ_BATCH_TASK_WORKING_DIR']
    jars = [os.path.join(files_path, os.path.basename(jar)) for jar in jars]
    py_files = [
        os.path.join(files_path, os.path.basename(py_file))
        for py_file in py_files
    ]
    files = [os.path.join(files_path, os.path.basename(f)) for f in files]

    # 2>&1 redirects stderr to stdout so both end up in the same file
    spark_submit_cmd = CommandBuilder(
        '{0}/bin/spark-submit'.format(spark_home))
    spark_submit_cmd.add_option('--master',
                                'spark://{0}:7077'.format(master_ip))
    spark_submit_cmd.add_option('--name', name)
    spark_submit_cmd.add_option('--class', main_class)
    spark_submit_cmd.add_option('--jars', jars and ','.join(jars))
    spark_submit_cmd.add_option('--py-files', py_files and ','.join(py_files))
    spark_submit_cmd.add_option('--files', files and ','.join(files))
    spark_submit_cmd.add_option('--driver-java-options', driver_java_options)
    spark_submit_cmd.add_option('--driver-library-path', driver_library_path)
    spark_submit_cmd.add_option('--driver-class-path', driver_class_path)
    spark_submit_cmd.add_option('--driver-memory', driver_memory)
    spark_submit_cmd.add_option('--executor-memory', executor_memory)
    if driver_cores:
        spark_submit_cmd.add_option('--driver-cores', str(driver_cores))
    if executor_cores:
        spark_submit_cmd.add_option('--executor-cores', str(executor_cores))

    spark_submit_cmd.add_argument(
        os.path.expandvars(app) + ' ' +
        ' '.join(['\'' + str(app_arg) + '\'' for app_arg in (app_args or [])]))

    with open("spark-submit.txt", mode="w", encoding="UTF-8") as stream:
        stream.write(spark_submit_cmd.to_str())

    return spark_submit_cmd