def pickle_functions_and_zip_stage(cur_function_zip_path, cur_function, stage_id):
    cur_function_pickle_path = 'job/%s-%s.pkl' % (cur_function.get_string(), stage_id)
    rel_function_paths = cur_function.get_rel_function_paths()
    with open(cur_function_pickle_path, 'wb') as f:
        pickle.dump(cur_function.get_function(), f)

    if isinstance(cur_function, MapShuffleFunction):
        partition_function_pickle_path = 'job/%s-%s.pkl' % ("partition", stage_id)
        with open(partition_function_pickle_path, 'wb') as f:
            pickle.dump(cur_function.get_partition_function(), f)

        combiner_function_pickle_path = 'job/%s-%s.pkl' % ("combiner", stage_id)
        with open(combiner_function_pickle_path, 'wb') as f:
            pickle.dump(cur_function.get_combiner_function(), f)

    zip.zip_lambda(rel_function_paths, cur_function_zip_path)
    return rel_function_paths
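# Usage sketch (illustrative only, not part of the library): how a single stage is
# packaged. The helper name and the `map_shuffle_function` argument are hypothetical;
# the call pattern mirrors register_driver() and _create_lambdas() below.
def _example_package_stage(map_shuffle_function, stage_id=1):
    # Build the zip file name the same way the drivers do, then pickle the stage's
    # user function(s) and bundle the stage's handler files into the zip.
    zip_path = "%s-%s.zip" % (map_shuffle_function.get_string(), stage_id)
    return pickle_functions_and_zip_stage(zip_path, map_shuffle_function, stage_id)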
def register_driver(self, main_file_path, function_paths):
    stage_id = 1
    function_filepaths = []

    # The first function should be a map/map_shuffle function
    for pipeline_id, pipeline in self.pipelines.items():
        functions = pipeline.get_functions()
        pipeline_static_job_info = overwrite_existing_job_info(pipeline.get_config())
        self.static_job_info = pipeline_static_job_info
        dependent_pipeline_ids = pipeline.get_dependent_pipeline_ids()
        if len(dependent_pipeline_ids) == 0:
            set_up_local_input_data(pipeline_static_job_info)

        for i in range(len(functions)):
            cur_function = functions[i]
            cur_function_zip_path = "%s-%s.zip" % (cur_function.get_string(), stage_id)

            # Prepare Lambda functions
            rel_function_paths = pickle_functions_and_zip_stage(
                cur_function_zip_path, cur_function, stage_id)
            function_filepaths += rel_function_paths
            stage_id += 1

    with open(StaticVariables.SERVERLESS_PIPELINES_INFO_PATH, 'wb') as f:
        pickle.dump(self.pipelines, f)

    zip.zip_lambda([StaticVariables.COORDINATOR_HANDLER_PATH],
                   StaticVariables.COORDINATOR_ZIP_PATH)
    zip.zip_driver_lambda(StaticVariables.DRIVER_ZIP_PATH, function_filepaths)

    serverless_driver = lambda_manager.LambdaManager(
        self.lambda_client, self.s3_client, self.region,
        StaticVariables.DRIVER_ZIP_PATH, self.job_name, self.driver_lambda_name,
        StaticVariables.SERVERLESS_DRIVER_HANDLER_FUNCTION_PATH)
    serverless_driver.update_code_or_create_on_no_exist(self.total_num_functions)

    registered_job_information = {
        'jobName': self.job_name,
        'driverLambdaName': self.driver_lambda_name,
        'registeredTime': datetime.utcnow().strftime("%Y-%m-%d_%H.%M.%S"),
        'shufflingBucket': self.static_job_info[StaticVariables.SHUFFLING_BUCKET_FN],
        'inputSource': self.static_job_info[StaticVariables.INPUT_SOURCE_FN],
        'outputSource': self.static_job_info[StaticVariables.OUTPUT_SOURCE_FN],
        'totalNumPipelines': len(self.pipelines),
        'totalNumStages': stage_id - 1
    }
    self.s3_client.put_object(
        Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
        Key=(StaticVariables.S3_UI_REGISTERED_JOB_INFORMATION_PATH % self.job_name),
        Body=json.dumps(registered_job_information))
    self.s3_client.put_object(
        Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
        Key=(StaticVariables.S3_UI_REGISTERED_JOB_DRIVER_CONFIG_PATH % self.job_name),
        Body=json.dumps(self.config))

    static_job_info_file_path = 'configuration/static-job-info.json'
    driver_file_path = 'configuration/driver.json'
    registered_job_source_info = [{
        'filePath': main_file_path,
        'location': StaticVariables.S3_UI_REGISTERED_JOB_SOURCE_FILES_PATH % (self.job_name, str(main_file_path))
    }, {
        'filePath': static_job_info_file_path,
        'location': StaticVariables.S3_UI_REGISTERED_JOB_SOURCE_FILES_PATH % (self.job_name, static_job_info_file_path)
    }, {
        'filePath': driver_file_path,
        'location': StaticVariables.S3_UI_REGISTERED_JOB_SOURCE_FILES_PATH % (self.job_name, driver_file_path)
    }]

    os.chdir(StaticVariables.PROJECT_WORKING_DIRECTORY)
    print("Main.py file path is: %s" % main_file_path)
    self.s3_client.upload_file(
        Filename=main_file_path,
        Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
        Key=StaticVariables.S3_UI_REGISTERED_JOB_SOURCE_FILES_PATH % (self.job_name, str(main_file_path)))

    os.chdir(StaticVariables.LIBRARY_WORKING_DIRECTORY)
    self.s3_client.upload_file(
        Filename=static_job_info_file_path,
        Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
        Key=StaticVariables.S3_UI_REGISTERED_JOB_SOURCE_FILES_PATH % (self.job_name, str(static_job_info_file_path)))
    self.s3_client.upload_file(
        Filename=driver_file_path,
        Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
        Key=StaticVariables.S3_UI_REGISTERED_JOB_SOURCE_FILES_PATH % (self.job_name, str(driver_file_path)))

    for function_path in function_paths:
        registered_job_source_info.append({
            'filePath': function_path,
            'location': StaticVariables.S3_UI_REGISTERED_JOB_SOURCE_FILES_PATH % (self.job_name, str(function_path))
        })
        self.s3_client.upload_file(
            Filename=function_path,
            Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
            Key=StaticVariables.S3_UI_REGISTERED_JOB_SOURCE_FILES_PATH % (self.job_name, str(function_path)))

    main_job_source_info = {
        'main': main_file_path,
        'sourceInfo': registered_job_source_info
    }
    self.s3_client.put_object(
        Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
        Key=(StaticVariables.S3_UI_REGISTERED_JOB_SOURCE_INFO_PATH % self.job_name),
        Body=json.dumps(main_job_source_info))

    delete_files(glob.glob(StaticVariables.FUNCTIONS_PICKLE_GLOB_PATH))
    delete_files(glob.glob(StaticVariables.LAMBDA_ZIP_GLOB_PATH))
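# Read-back sketch (illustrative only, not part of the library): fetch the job
# information document written by register_driver() above. The helper name is
# hypothetical; the bucket and key reuse the same StaticVariables paths.
def _example_fetch_registered_job_information(s3_client, job_name):
    response = s3_client.get_object(
        Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
        Key=StaticVariables.S3_UI_REGISTERED_JOB_INFORMATION_PATH % job_name)
    # The body is the JSON document produced by json.dumps(registered_job_information).
    return json.loads(response['Body'].read())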
def _create_lambdas(self):
    job_name = self.static_job_info[StaticVariables.JOB_NAME_FN]
    lambda_name_prefix = self.static_job_info[StaticVariables.LAMBDA_NAME_PREFIX_FN] \
        if StaticVariables.LAMBDA_NAME_PREFIX_FN in self.static_job_info \
        else StaticVariables.DEFAULT_LAMBDA_NAME_PREFIX
    shuffling_bucket = self.static_job_info[StaticVariables.SHUFFLING_BUCKET_FN]
    region = self.config[StaticVariables.REGION_FN] \
        if StaticVariables.REGION_FN in self.config else StaticVariables.DEFAULT_REGION
    stage_id = 1
    num_operators = 0
    function_lambdas = []
    stage_config = {}
    mapping_stage_id_pipeline_id = {}
    adj_list = defaultdict(list)
    self.in_degrees = {}
    invoking_pipelines_info = {}
    pipelines_last_stage_num_operators = {}
    pipelines_first_last_stage_ids = {}
    stage_type_of_operations = {}
    cur_coordinator_lambda_name = "%s-%s-%s" % (job_name, lambda_name_prefix, "coordinator")

    # The first function should be a map/map_shuffle function
    for pipeline_id, pipeline in self.pipelines.items():
        functions = pipeline.get_functions()
        pipeline_static_job_info = self._overwrite_existing_job_info(pipeline.get_config())
        # TODO: Is the next line correct?
        self.static_job_info = pipeline_static_job_info
        dependent_pipeline_ids = pipeline.get_dependent_pipeline_ids()
        for dependent_pipeline_id in dependent_pipeline_ids:
            adj_list[dependent_pipeline_id].append(pipeline_id)
            self.in_degrees[pipeline_id] = self.in_degrees.get(pipeline_id, 0) + 1

        if len(dependent_pipeline_ids) == 0:
            if not self.is_serverless:
                set_up_local_input_data(pipeline_static_job_info)
            all_keys, num_operators, batches = self._get_all_keys(pipeline_static_job_info)
            first_function = functions[0]
            invoking_pipelines_info[pipeline_id] = [all_keys, num_operators, batches,
                                                    first_function, stage_id]
        else:
            num_operators = 0
            for dependent_pipeline_id in dependent_pipeline_ids:
                num_operators += pipelines_last_stage_num_operators[dependent_pipeline_id]

        pipelines_first_last_stage_ids[pipeline_id] = [stage_id]

        for i in range(len(functions)):
            mapping_stage_id_pipeline_id[stage_id] = pipeline_id
            cur_function = functions[i]
            cur_function_zip_path = "%s-%s.zip" % (cur_function.get_string(), stage_id)
            stage_type_of_operations[stage_id] = cur_function.get_string()

            # Prepare the Lambda function packages if the driver is running on a local machine
            if not self.is_serverless:
                pickle_functions_and_zip_stage(cur_function_zip_path, cur_function, stage_id)

            cur_function_lambda_name = "%s-%s-%s-%s" % (job_name, lambda_name_prefix,
                                                        cur_function.get_string(), stage_id)
            cur_function_lambda = lambda_manager.LambdaManager(
                self.lambda_client, self.s3_client, region, cur_function_zip_path,
                job_name, cur_function_lambda_name, cur_function.get_handler_function_path())
            if isinstance(cur_function, MapShuffleFunction):
                assert i + 1 < len(functions) and isinstance(functions[i + 1], ReduceFunction)
                cur_function_lambda.update_code_or_create_on_no_exist(
                    self.total_num_functions, submission_time=self.submission_time,
                    coordinator_lambda_name=cur_coordinator_lambda_name, stage_id=stage_id,
                    num_reducers=functions[i + 1].get_num_reducers())
            else:
                cur_function_lambda.update_code_or_create_on_no_exist(
                    self.total_num_functions, submission_time=self.submission_time,
                    coordinator_lambda_name=cur_coordinator_lambda_name, stage_id=stage_id)
            function_lambdas.append(cur_function_lambda)

            # Coordinator
            cur_function_pickle_path = 'job/%s-%s.pkl' % (cur_function.get_string(), stage_id)
            dependent_last_stage_ids = []
            for dependent_pipeline_id in dependent_pipeline_ids:
                dependent_last_stage_ids.append(
                    pipelines_first_last_stage_ids[dependent_pipeline_id][1])

            if isinstance(cur_function, MapShuffleFunction):
                partition_function_pickle_path = 'job/%s-%s.pkl' % ("partition", stage_id)
                combiner_function_pickle_path = 'job/%s-%s.pkl' % ("combiner", stage_id)
                stage_config[stage_id] = \
                    create_stage_config_file(num_operators, 1, cur_function_lambda_name,
                                             cur_function_pickle_path, dependent_last_stage_ids,
                                             partition_function_pickle_path,
                                             combiner_function_pickle_path)
            else:
                if isinstance(cur_function, ReduceFunction):
                    num_operators = cur_function.get_num_reducers()

                stage_config[stage_id] = \
                    create_stage_config_file(num_operators, 2, cur_function_lambda_name,
                                             cur_function_pickle_path, dependent_last_stage_ids)

            stage_id += 1

        pipelines_first_last_stage_ids[pipeline_id].append(stage_id - 1)
        pipelines_last_stage_num_operators[pipeline_id] = num_operators

    coordinator_zip_path = StaticVariables.COORDINATOR_ZIP_PATH
    if not self.is_serverless:
        self._write_config_to_local(adj_list, mapping_stage_id_pipeline_id,
                                    pipelines_first_last_stage_ids, stage_config)
        zip.zip_lambda([StaticVariables.COORDINATOR_HANDLER_PATH], coordinator_zip_path)
    else:
        self._write_config_to_s3(adj_list, mapping_stage_id_pipeline_id,
                                 pipelines_first_last_stage_ids, stage_config, shuffling_bucket)

    # Web UI information
    if StaticVariables.OPTIMISATION_FN not in self.static_job_info \
            or not self.static_job_info[StaticVariables.OPTIMISATION_FN]:
        dag_information = construct_dag_information(adj_list, mapping_stage_id_pipeline_id,
                                                    pipelines_first_last_stage_ids,
                                                    stage_type_of_operations)
        populate_static_job_info(self.static_job_info, len(pipelines_first_last_stage_ids),
                                 len(stage_type_of_operations), self.submission_time)
        self._write_web_ui_info(dag_information, stage_config, self.static_job_info,
                                StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME, job_name)

    cur_coordinator_lambda = lambda_manager.LambdaManager(
        self.lambda_client, self.s3_client, region, coordinator_zip_path, job_name,
        cur_coordinator_lambda_name, StaticVariables.COORDINATOR_HANDLER_FUNCTION_PATH)
    cur_coordinator_lambda.update_code_or_create_on_no_exist(
        self.total_num_functions, submission_time=self.submission_time)
    # cur_coordinator_lambda.add_lambda_permission(random.randint(1, 1000), shuffling_bucket)
    # shuffling_s3_path_prefix = "%s/" % job_name
    # cur_coordinator_lambda.create_s3_event_source_notification(shuffling_bucket, shuffling_s3_path_prefix)
    # time.sleep(1)
    function_lambdas.append(cur_coordinator_lambda)

    if len(self.pipelines) > 1:
        in_degree_obj = in_degree.InDegree(
            in_lambda=self.is_serverless,
            is_local_testing=self.static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN])
        in_degree_table_name = StaticVariables.IN_DEGREE_DYNAMODB_TABLE_NAME % (
            job_name, self.submission_time)
        in_degree_obj.delete_in_degree_table(in_degree_table_name)
        in_degree_obj.create_in_degree_table(in_degree_table_name)
        in_degree_obj.initialise_in_degree_table(in_degree_table_name, self.in_degrees)

    if StaticVariables.OPTIMISATION_FN not in self.static_job_info \
            or not self.static_job_info[StaticVariables.OPTIMISATION_FN]:
        stage_progress_obj = stage_progress.StageProgress(
            in_lambda=self.is_serverless,
            is_local_testing=self.static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN])
        stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (
            job_name, self.submission_time)
        stage_progress_obj.delete_progress_table(stage_progress_table_name)
        stage_progress_obj.create_progress_table(stage_progress_table_name)
        stage_progress_obj.initialise_progress_table(stage_progress_table_name, stage_id - 1)

    if not self.is_serverless:
        delete_files(glob.glob(StaticVariables.LAMBDA_ZIP_GLOB_PATH))
        delete_files(glob.glob(StaticVariables.FUNCTIONS_PICKLE_GLOB_PATH))

    return function_lambdas, invoking_pipelines_info, num_operators
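# Illustrative sketch (not part of the library): the pipeline dependency bookkeeping
# used by _create_lambdas() above. The pipeline ids and dependency shape below are
# made up for the example; only the adjacency-list/in-degree logic mirrors the real code.
def _example_pipeline_dependency_bookkeeping():
    from collections import defaultdict

    # Hypothetical DAG: pipelines 1 and 2 both feed pipeline 3.
    dependent_pipeline_ids_by_pipeline = {1: [], 2: [], 3: [1, 2]}

    adj_list = defaultdict(list)
    in_degrees = {}
    for pipeline_id, dependent_pipeline_ids in dependent_pipeline_ids_by_pipeline.items():
        for dependent_pipeline_id in dependent_pipeline_ids:
            # Edge from the dependency to the dependent pipeline, plus the in-degree
            # counts that _create_lambdas() later writes to the InDegree DynamoDB table.
            adj_list[dependent_pipeline_id].append(pipeline_id)
            in_degrees[pipeline_id] = in_degrees.get(pipeline_id, 0) + 1

    # Result: adj_list == {1: [3], 2: [3]}, in_degrees == {3: 2}
    return adj_list, in_degrees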
def register_driver(self):
    stage_id = 1
    function_filepaths = []

    # The first function should be a map/map_shuffle function
    for pipeline_id, pipeline in self.pipelines.items():
        functions = pipeline.get_functions()
        pipeline_static_job_info = overwrite_existing_job_info(pipeline.get_config())
        self.static_job_info = pipeline_static_job_info
        dependent_pipeline_ids = pipeline.get_dependent_pipeline_ids()
        if len(dependent_pipeline_ids) == 0:
            set_up_local_input_data(pipeline_static_job_info)

        for i in range(len(functions)):
            cur_function = functions[i]
            cur_function_zip_path = "%s-%s.zip" % (cur_function.get_string(), stage_id)

            # Prepare the Lambda function packages (the driver is registered from the local machine)
            rel_function_paths = pickle_functions_and_zip_stage(
                cur_function_zip_path, cur_function, stage_id)
            function_filepaths += rel_function_paths
            stage_id += 1

    with open(StaticVariables.SERVERLESS_PIPELINES_INFO_PATH, 'wb') as f:
        pickle.dump(self.pipelines, f)

    zip.zip_lambda([StaticVariables.COORDINATOR_HANDLER_PATH],
                   StaticVariables.COORDINATOR_ZIP_PATH)
    zip.zip_driver_lambda(StaticVariables.DRIVER_ZIP_PATH, function_filepaths)

    serverless_driver = lambda_manager.LambdaManager(
        self.lambda_client, self.s3_client, self.region,
        StaticVariables.DRIVER_ZIP_PATH, self.job_name, self.driver_lambda_name,
        StaticVariables.SERVERLESS_DRIVER_HANDLER_FUNCTION_PATH)
    serverless_driver.update_code_or_create_on_no_exist(self.total_num_functions)

    registered_job_information = {
        'jobName': self.job_name,
        'driverLambdaName': self.driver_lambda_name,
        'registeredTime': datetime.utcnow().strftime("%Y-%m-%d_%H.%M.%S"),
        'shufflingBucket': self.static_job_info[StaticVariables.SHUFFLING_BUCKET_FN],
        'inputSource': self.static_job_info[StaticVariables.INPUT_SOURCE_FN],
        'outputSource': self.static_job_info[StaticVariables.OUTPUT_SOURCE_FN],
        'totalNumPipelines': len(self.pipelines),
        'totalNumStages': stage_id - 1
    }
    self.s3_client.put_object(
        Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
        Key=(StaticVariables.S3_UI_REGISTERED_JOB_INFORMATION_PATH % self.job_name),
        Body=json.dumps(registered_job_information))
    self.s3_client.put_object(
        Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
        Key=(StaticVariables.S3_UI_REGISTERED_JOB_DRIVER_CONFIG_PATH % self.job_name),
        Body=json.dumps(self.config))

    delete_files(glob.glob(StaticVariables.FUNCTIONS_PICKLE_GLOB_PATH))
    delete_files(glob.glob(StaticVariables.LAMBDA_ZIP_GLOB_PATH))