# Imports shared by the snippets below (as in the aztk source tree); `__app_cmd`
# is referenced but not defined in this excerpt.
import os

import yaml

import azure.batch.models as batch_models
from aztk.utils import helpers
from aztk.utils.command_builder import CommandBuilder


def generate_task(spark_client, job, application_tasks):
    resource_files = []
    for application, task in application_tasks:
        task_definition_resource_file = helpers.upload_text_to_container(
            container_name=job.id,
            application_name=application.name + ".yaml",
            file_path=application.name + ".yaml",
            content=yaml.dump(task),
            blob_client=spark_client.blob_client,
        )
        resource_files.append(task_definition_resource_file)

    task_cmd = __app_cmd()

    # Create task
    task = batch_models.JobManagerTask(
        id=job.id,
        command_line=helpers.wrap_commands_in_shell([task_cmd]),
        resource_files=resource_files,
        kill_job_on_completion=False,
        allow_low_priority_node=True,
        user_identity=batch_models.UserIdentity(
            auto_user=batch_models.AutoUserSpecification(
                scope=batch_models.AutoUserScope.task,
                elevation_level=batch_models.ElevationLevel.admin)),
    )

    return task
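# Hedged usage sketch (not part of the aztk source above): the JobManagerTask returned
# by generate_task is typically attached to a JobAddParameter and submitted through the
# Azure Batch client. `pool_id`, `job`, and `application_tasks` are assumed to exist, and
# `spark_client.batch_client` is assumed to be an azure.batch BatchServiceClient.
def submit_job_sketch(spark_client, job, application_tasks, pool_id):
    job_manager_task = generate_task(spark_client, job, application_tasks)
    job_add_parameter = batch_models.JobAddParameter(
        id=job.id,
        pool_info=batch_models.PoolInformation(pool_id=pool_id),
        job_manager_task=job_manager_task,
    )
    spark_client.batch_client.job.add(job_add_parameter)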
def upload_serialized_task_to_storage(blob_client, cluster_id, task):
    return helpers.upload_text_to_container(
        container_name=cluster_id,
        application_name=task.id,
        file_path="task.yaml",
        content=yaml.dump(task),
        blob_client=blob_client,
    )
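# Hedged usage sketch: persisting a task definition so it can be read back later on the
# cluster. `spark_client`, `cluster_id`, and `task` are assumed to exist; the uploaded
# YAML text can be rehydrated with yaml.safe_load on the consuming side.
task_resource_file = upload_serialized_task_to_storage(
    blob_client=spark_client.blob_client,
    cluster_id=cluster_id,
    task=task,
)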
def generate_task(spark_client, container_id, application, remote=False):
    resource_files = []

    # The application provided is not hosted remotely and therefore must be uploaded
    if not remote:
        app_resource_file = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=application.application,
            blob_client=spark_client.blob_client,
            use_full_path=False,
        )

        # Upload application file
        resource_files.append(app_resource_file)

        application.application = "$AZ_BATCH_TASK_WORKING_DIR/" + os.path.basename(
            application.application)

    # Upload dependent JARS
    jar_resource_file_paths = []
    for jar in application.jars:
        current_jar_resource_file_path = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=jar,
            blob_client=spark_client.blob_client,
            use_full_path=False,
        )
        jar_resource_file_paths.append(current_jar_resource_file_path)
        resource_files.append(current_jar_resource_file_path)

    # Upload dependent python files
    py_files_resource_file_paths = []
    for py_file in application.py_files:
        current_py_files_resource_file_path = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=py_file,
            blob_client=spark_client.blob_client,
            use_full_path=False,
        )
        py_files_resource_file_paths.append(current_py_files_resource_file_path)
        resource_files.append(current_py_files_resource_file_path)

    # Upload other dependent files
    files_resource_file_paths = []
    for file in application.files:
        files_resource_file_path = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=file,
            blob_client=spark_client.blob_client,
            use_full_path=False,
        )
        files_resource_file_paths.append(files_resource_file_path)
        resource_files.append(files_resource_file_path)

    # Upload application definition
    application.jars = [os.path.basename(jar) for jar in application.jars]
    application.py_files = [
        os.path.basename(py_files) for py_files in application.py_files
    ]
    application.files = [
        os.path.basename(files) for files in application.files
    ]
    application_definition_file = helpers.upload_text_to_container(
        container_name=container_id,
        application_name=application.name,
        file_path="application.yaml",
        content=yaml.dump(vars(application)),
        blob_client=spark_client.blob_client,
    )
    resource_files.append(application_definition_file)

    # create command to submit task
    task_cmd = CommandBuilder("sudo docker exec")
    task_cmd.add_argument("-i")
    task_cmd.add_option("-e", "AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR")
    task_cmd.add_option("-e", "STORAGE_LOGS_CONTAINER={0}".format(container_id))
    task_cmd.add_argument("spark /bin/bash >> output.log 2>&1")
    task_cmd.add_argument(
        r'-c "source ~/.bashrc; '
        r"export PYTHONPATH=$PYTHONPATH:\$AZTK_WORKING_DIR; "
        r"cd \$AZ_BATCH_TASK_WORKING_DIR; "
        r'\$AZTK_WORKING_DIR/.aztk-env/.venv/bin/python \$AZTK_WORKING_DIR/aztk/node_scripts/submit.py"'
    )

    # Create task
    task = batch_models.TaskAddParameter(
        id=application.name,
        command_line=helpers.wrap_commands_in_shell([task_cmd.to_str()]),
        resource_files=resource_files,
        constraints=batch_models.TaskConstraints(
            max_task_retry_count=application.max_retry_count),
        user_identity=batch_models.UserIdentity(
            auto_user=batch_models.AutoUserSpecification(
                scope=batch_models.AutoUserScope.task,
                elevation_level=batch_models.ElevationLevel.admin)),
    )

    return task
def generate_task(spark_client, container_id, application):
    resource_files = []

    app_resource_file = helpers.upload_file_to_container(
        container_name=container_id,
        application_name=application.name,
        file_path=application.application,
        blob_client=spark_client.blob_client,
        use_full_path=False)

    # Upload application file
    resource_files.append(app_resource_file)

    # Upload dependent JARS
    jar_resource_file_paths = []
    for jar in application.jars:
        current_jar_resource_file_path = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=jar,
            blob_client=spark_client.blob_client,
            use_full_path=False)
        jar_resource_file_paths.append(current_jar_resource_file_path)
        resource_files.append(current_jar_resource_file_path)

    # Upload dependent python files
    py_files_resource_file_paths = []
    for py_file in application.py_files:
        current_py_files_resource_file_path = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=py_file,
            blob_client=spark_client.blob_client,
            use_full_path=False)
        py_files_resource_file_paths.append(current_py_files_resource_file_path)
        resource_files.append(current_py_files_resource_file_path)

    # Upload other dependent files
    files_resource_file_paths = []
    for file in application.files:
        files_resource_file_path = helpers.upload_file_to_container(
            container_name=container_id,
            application_name=application.name,
            file_path=file,
            blob_client=spark_client.blob_client,
            use_full_path=False)
        files_resource_file_paths.append(files_resource_file_path)
        resource_files.append(files_resource_file_path)

    # Upload application definition
    application.application = os.path.basename(application.application)
    application.jars = [os.path.basename(jar) for jar in application.jars]
    application.py_files = [
        os.path.basename(py_files) for py_files in application.py_files
    ]
    application.files = [
        os.path.basename(files) for files in application.files
    ]
    application_definition_file = helpers.upload_text_to_container(
        container_name=container_id,
        application_name=application.name,
        file_path='application.yaml',
        content=yaml.dump(vars(application)),
        blob_client=spark_client.blob_client)
    resource_files.append(application_definition_file)

    # create command to submit task
    task_cmd = CommandBuilder('sudo docker exec')
    task_cmd.add_argument('-i')
    task_cmd.add_option('-e', 'AZ_BATCH_TASK_WORKING_DIR=$AZ_BATCH_TASK_WORKING_DIR')
    task_cmd.add_option('-e', 'STORAGE_LOGS_CONTAINER={0}'.format(container_id))
    task_cmd.add_argument('spark /bin/bash >> output.log 2>&1')
    task_cmd.add_argument('-c "source ~/.bashrc; '
                          'cd $AZ_BATCH_TASK_WORKING_DIR; '
                          '\$(pyenv root)/versions/\$AZTK_PYTHON_VERSION/bin/python '
                          '\$DOCKER_WORKING_DIR/aztk/node_scripts/submit.py"')

    # Create task
    task = batch_models.TaskAddParameter(
        id=application.name,
        command_line=helpers.wrap_commands_in_shell([task_cmd.to_str()]),
        resource_files=resource_files,
        constraints=batch_models.TaskConstraints(
            max_task_retry_count=application.max_retry_count),
        user_identity=batch_models.UserIdentity(
            auto_user=batch_models.AutoUserSpecification(
                scope=batch_models.AutoUserScope.task,
                elevation_level=batch_models.ElevationLevel.admin)))

    return task
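# Hedged usage sketch (not part of the functions above): either generate_task variant
# returns a TaskAddParameter that is then submitted to the cluster's Batch job.
# `spark_client`, `cluster_id`, and `application` are assumed to exist, and the Batch
# job id is assumed to equal the cluster id, as elsewhere in aztk.
def submit_application_sketch(spark_client, cluster_id, application):
    task = generate_task(spark_client, cluster_id, application)
    spark_client.batch_client.task.add(job_id=cluster_id, task=task)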