def run(self):
    # 1. Create the aws_lambda functions
    function_lambdas, invoking_pipelines_info, num_outputs = self._create_lambdas()
    cur_output_handler = output_handler.get_output_handler(
        self.static_job_info[StaticVariables.OUTPUT_SOURCE_TYPE_FN],
        self.static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN],
        self.is_serverless)
    cur_output_handler.create_output_storage(self.static_job_info, self.submission_time)

    # Execute
    # 2. Invoke Mappers asynchronously
    self._invoke_pipelines(invoking_pipelines_info)

    # 3. Calculate costs - Approx (since we are using exec time reported by our func and not billed ms)
    StaticVariables.JOB_START_TIME = time.time()
    logger.info("PERFORMANCE INFO: Job setup time: %s"
                % (StaticVariables.JOB_START_TIME - StaticVariables.SETUP_START_TIME))
    self._calculate_cost(num_outputs, cur_output_handler, invoking_pipelines_info)

    # 4. Delete the function lambdas
    for function_lambda in function_lambdas:
        function_lambda.delete_function()

    if StaticVariables.OPTIMISATION_FN not in self.static_job_info \
            or not self.static_job_info[StaticVariables.OPTIMISATION_FN]:
        self._update_duration()
        # 5. View one of the last stage executor's outputs
        # logger.info(cur_output_handler.get_output(3, self.static_job_info, self.submission_time))
    else:
        job_name = self.static_job_info[StaticVariables.JOB_NAME_FN]
        table_name = StaticVariables.STAGE_STATE_DYNAMODB_TABLE_NAME % (
            job_name, self.submission_time)
        self.map_phase_state.delete_state_table(table_name)

        in_degree_obj = in_degree.InDegree(
            in_lambda=self.is_serverless,
            is_local_testing=self.static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN])
        in_degree_table_name = StaticVariables.IN_DEGREE_DYNAMODB_TABLE_NAME % (
            job_name, self.submission_time)
        in_degree_obj.delete_in_degree_table(in_degree_table_name)

        # metrics_bucket = StaticVariables.METRICS_BUCKET % job_name
        # self.delete_s3_objects(metrics_bucket, "")
        # self.s3_client.delete_bucket(Bucket=metrics_bucket)

    tear_down_time = time.time() - StaticVariables.TEAR_DOWN_START_TIME
    logger.info("PERFORMANCE INFO - Job tear down time: %s seconds" % str(tear_down_time))

    return self.submission_time
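# Usage sketch (hypothetical, for illustration only): the class that owns run() is not shown
# here, so the constructor name and arguments below are assumptions. The intent is simply that
# a caller builds the driver from its job configuration and collects the submission time
# returned by run().
#
#   driver = Driver(static_job_info, config, pipelines, is_serverless=False)  # hypothetical constructor
#   submission_time = driver.run()
#   logger.info("Job submitted at %s" % submission_time)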
def get_in_degree_info():
    job_name = request.args.get('job-name')
    submission_time = request.args.get('submission-time')
    logger.info("WebUI: Received request for path /in-degree with parameters: %s, %s"
                % (job_name, submission_time))
    is_local_testing = os.environ.get("local_testing") == 'True' \
        or os.environ.get("local_testing") == 'true'
    in_degree_obj = in_degree.InDegree(in_lambda=False, is_local_testing=is_local_testing)
    in_degrees = in_degree_obj.read_in_degree_table(
        StaticVariables.IN_DEGREE_DYNAMODB_TABLE_NAME % (job_name, submission_time))
    return jsonify(in_degrees)
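# Registration sketch: get_in_degree_info reads its parameters from flask.request and returns
# flask.jsonify, and its log line names the path /in-degree, so it is presumably mounted on
# that route. The `app` object below is an assumption for illustration only.
#
#   from flask import Flask
#   app = Flask(__name__)
#   app.add_url_rule('/in-degree', 'get_in_degree_info', get_in_degree_info, methods=['GET'])
#
# A client would then query, for example:
#   GET /in-degree?job-name=<job>&submission-time=<time>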
def schedule_different_pipeline_next_stage(is_serverless_driver, stage_configuration, cur_pipeline_id,
                                           shuffling_bucket, job_name, submission_time):
    if not is_serverless_driver:
        with open(StaticVariables.PIPELINE_DEPENDENCIES_PATH) as json_file:
            adj_list = json.load(json_file)
    else:
        response = s3_client.get_object(Bucket=shuffling_bucket,
                                        Key=StaticVariables.PIPELINE_DEPENDENCIES_PATH)
        contents = response['Body'].read()
        adj_list = json.loads(contents)

    if not is_serverless_driver:
        with open(StaticVariables.PIPELINE_TO_FIRST_LAST_STAGE_PATH) as json_file:
            pipeline_first_last_stage_ids = json.load(json_file)
    else:
        response = s3_client.get_object(Bucket=shuffling_bucket,
                                        Key=StaticVariables.PIPELINE_TO_FIRST_LAST_STAGE_PATH)
        contents = response['Body'].read()
        pipeline_first_last_stage_ids = json.loads(contents)

    in_degree_obj = in_degree.InDegree(
        in_lambda=True,
        is_local_testing=static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN])
    stage_progress_obj = stage_progress.StageProgress(
        in_lambda=True,
        is_local_testing=static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN])
    stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (
        job_name, submission_time)

    for dependent_pipeline_id in adj_list[str(cur_pipeline_id)]:
        response = in_degree_obj.decrement_in_degree_table(
            StaticVariables.IN_DEGREE_DYNAMODB_TABLE_NAME % (job_name, submission_time),
            dependent_pipeline_id)
        dependent_in_degree = int(response["Attributes"]["in_degree"]["N"])
        if dependent_in_degree == 0:
            next_pipeline_first_stage_id = pipeline_first_last_stage_ids[str(dependent_pipeline_id)][0]
            next_stage_config = stage_configuration[str(next_pipeline_first_stage_id)]
            invoking_lambda_name = next_stage_config["invoking_lambda_name"]
            dependent_stage_ids = next_stage_config["dependent_last_stage_ids"]

            # The last stage of a pipeline is assumed to always be either a map or a reduce.
            keys_bins = get_map_reduce_outputs(shuffling_bucket, job_name, dependent_stage_ids)
            total_num_jobs = sum([len(keys_bin) for keys_bin in keys_bins])
            stage_progress_obj.update_total_num_keys(stage_progress_table_name,
                                                     next_pipeline_first_stage_id, total_num_jobs)

            if next_stage_config["stage_type"] == 1:
                for i in range(len(keys_bins)):
                    response = lambda_client.invoke(
                        FunctionName=invoking_lambda_name,
                        InvocationType='Event',
                        Payload=json.dumps({
                            "keys": keys_bins[i],
                            "id": i + 1,
                            "load_data_from_input": False,
                            "function_pickle_path": next_stage_config["function_pickle_path"],
                            "combiner_function_pickle_path": next_stage_config["combiner_function_pickle_path"],
                            "partition_function_pickle_path": next_stage_config["partition_function_pickle_path"]
                        }))
            else:
                for i in range(len(keys_bins)):
                    response = lambda_client.invoke(
                        FunctionName=invoking_lambda_name,
                        InvocationType='Event',
                        Payload=json.dumps({
                            "keys": keys_bins[i],
                            "id": i + 1,
                            "load_data_from_input": False,
                            "function_pickle_path": next_stage_config["function_pickle_path"]
                        }))

            print("All operators finished in pipeline %s, next pipeline: number of operators scheduled: %s"
                  % (cur_pipeline_id, len(keys_bins)))
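# Shape sketch (inferred from the lookups above; values are illustrative only): the
# pipeline-dependencies file maps a pipeline id to the pipelines that depend on it, and the
# first/last-stage file maps a pipeline id to its [first_stage_id, last_stage_id].
#
#   adj_list = {"1": [2, 3], "2": [4]}
#   pipeline_first_last_stage_ids = {"1": [1, 2], "2": [3, 3], "3": [4, 5], "4": [6, 6]}
#
# With these values, once pipeline 1 finishes, the in-degrees of pipelines 2 and 3 are
# decremented; any that reach zero have their first stage (e.g. stage 3 for pipeline 2)
# scheduled from the outputs of pipeline 1's last stage.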
def _create_lambdas(self):
    job_name = self.static_job_info[StaticVariables.JOB_NAME_FN]
    lambda_name_prefix = self.static_job_info[StaticVariables.LAMBDA_NAME_PREFIX_FN] \
        if StaticVariables.LAMBDA_NAME_PREFIX_FN in self.static_job_info \
        else StaticVariables.DEFAULT_LAMBDA_NAME_PREFIX
    shuffling_bucket = self.static_job_info[StaticVariables.SHUFFLING_BUCKET_FN]
    region = self.config[StaticVariables.REGION_FN] \
        if StaticVariables.REGION_FN in self.config else StaticVariables.DEFAULT_REGION

    stage_id = 1
    num_operators = 0
    function_lambdas = []
    stage_config = {}
    mapping_stage_id_pipeline_id = {}
    adj_list = defaultdict(list)
    self.in_degrees = {}
    invoking_pipelines_info = {}
    pipelines_last_stage_num_operators = {}
    pipelines_first_last_stage_ids = {}
    stage_type_of_operations = {}
    cur_coordinator_lambda_name = "%s-%s-%s" % (job_name, lambda_name_prefix, "coordinator")

    # The first function should be a map/map_shuffle function.
    for pipeline_id, pipeline in self.pipelines.items():
        functions = pipeline.get_functions()
        pipeline_static_job_info = self._overwrite_existing_job_info(pipeline.get_config())
        # TODO: Is the next line correct?
        self.static_job_info = pipeline_static_job_info
        dependent_pipeline_ids = pipeline.get_dependent_pipeline_ids()
        for dependent_pipeline_id in dependent_pipeline_ids:
            adj_list[dependent_pipeline_id].append(pipeline_id)
            self.in_degrees[pipeline_id] = self.in_degrees.get(pipeline_id, 0) + 1

        if len(dependent_pipeline_ids) == 0:
            if not self.is_serverless:
                set_up_local_input_data(pipeline_static_job_info)
            all_keys, num_operators, batches = self._get_all_keys(pipeline_static_job_info)
            first_function = functions[0]
            invoking_pipelines_info[pipeline_id] = [all_keys, num_operators, batches,
                                                    first_function, stage_id]
        else:
            num_operators = 0
            for dependent_pipeline_id in dependent_pipeline_ids:
                num_operators += pipelines_last_stage_num_operators[dependent_pipeline_id]

        pipelines_first_last_stage_ids[pipeline_id] = [stage_id]

        for i in range(len(functions)):
            mapping_stage_id_pipeline_id[stage_id] = pipeline_id
            cur_function = functions[i]
            cur_function_zip_path = "%s-%s.zip" % (cur_function.get_string(), stage_id)
            stage_type_of_operations[stage_id] = cur_function.get_string()

            # Prepare the Lambda functions if the driver is running on a local machine.
            if not self.is_serverless:
                pickle_functions_and_zip_stage(cur_function_zip_path, cur_function, stage_id)

            cur_function_lambda_name = "%s-%s-%s-%s" % (job_name, lambda_name_prefix,
                                                        cur_function.get_string(), stage_id)
            cur_function_lambda = lambda_manager.LambdaManager(
                self.lambda_client, self.s3_client, region, cur_function_zip_path,
                job_name, cur_function_lambda_name, cur_function.get_handler_function_path())
            if isinstance(cur_function, MapShuffleFunction):
                assert i + 1 < len(functions) and isinstance(functions[i + 1], ReduceFunction)
                cur_function_lambda.update_code_or_create_on_no_exist(
                    self.total_num_functions, submission_time=self.submission_time,
                    coordinator_lambda_name=cur_coordinator_lambda_name, stage_id=stage_id,
                    num_reducers=functions[i + 1].get_num_reducers())
            else:
                cur_function_lambda.update_code_or_create_on_no_exist(
                    self.total_num_functions, submission_time=self.submission_time,
                    coordinator_lambda_name=cur_coordinator_lambda_name, stage_id=stage_id)
            function_lambdas.append(cur_function_lambda)

            # Coordinator
            cur_function_pickle_path = 'job/%s-%s.pkl' % (cur_function.get_string(), stage_id)
            dependent_last_stage_ids = []
            for dependent_pipeline_id in dependent_pipeline_ids:
                dependent_last_stage_ids.append(
                    pipelines_first_last_stage_ids[dependent_pipeline_id][1])

            if isinstance(cur_function, MapShuffleFunction):
                partition_function_pickle_path = 'job/%s-%s.pkl' % ("partition", stage_id)
                combiner_function_pickle_path = 'job/%s-%s.pkl' % ("combiner", stage_id)
                stage_config[stage_id] = \
                    create_stage_config_file(num_operators, 1, cur_function_lambda_name,
                                             cur_function_pickle_path, dependent_last_stage_ids,
                                             partition_function_pickle_path,
                                             combiner_function_pickle_path)
            else:
                if isinstance(cur_function, ReduceFunction):
                    num_operators = cur_function.get_num_reducers()
                stage_config[stage_id] = \
                    create_stage_config_file(num_operators, 2, cur_function_lambda_name,
                                             cur_function_pickle_path, dependent_last_stage_ids)

            stage_id += 1

        pipelines_first_last_stage_ids[pipeline_id].append(stage_id - 1)
        pipelines_last_stage_num_operators[pipeline_id] = num_operators

    coordinator_zip_path = StaticVariables.COORDINATOR_ZIP_PATH
    if not self.is_serverless:
        self._write_config_to_local(adj_list, mapping_stage_id_pipeline_id,
                                    pipelines_first_last_stage_ids, stage_config)
        zip.zip_lambda([StaticVariables.COORDINATOR_HANDLER_PATH], coordinator_zip_path)
    else:
        self._write_config_to_s3(adj_list, mapping_stage_id_pipeline_id,
                                 pipelines_first_last_stage_ids, stage_config, shuffling_bucket)

    # Web UI information
    if StaticVariables.OPTIMISATION_FN not in self.static_job_info \
            or not self.static_job_info[StaticVariables.OPTIMISATION_FN]:
        dag_information = construct_dag_information(adj_list, mapping_stage_id_pipeline_id,
                                                    pipelines_first_last_stage_ids,
                                                    stage_type_of_operations)
        populate_static_job_info(self.static_job_info, len(pipelines_first_last_stage_ids),
                                 len(stage_type_of_operations), self.submission_time)
        self._write_web_ui_info(dag_information, stage_config, self.static_job_info,
                                StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME, job_name)

    cur_coordinator_lambda = lambda_manager.LambdaManager(
        self.lambda_client, self.s3_client, region, coordinator_zip_path, job_name,
        cur_coordinator_lambda_name, StaticVariables.COORDINATOR_HANDLER_FUNCTION_PATH)
    cur_coordinator_lambda.update_code_or_create_on_no_exist(
        self.total_num_functions, submission_time=self.submission_time)
    # cur_coordinator_lambda.add_lambda_permission(random.randint(1, 1000), shuffling_bucket)
    # shuffling_s3_path_prefix = "%s/" % job_name
    # cur_coordinator_lambda.create_s3_event_source_notification(shuffling_bucket, shuffling_s3_path_prefix)
    # time.sleep(1)
    function_lambdas.append(cur_coordinator_lambda)

    if len(self.pipelines) > 1:
        in_degree_obj = in_degree.InDegree(
            in_lambda=self.is_serverless,
            is_local_testing=self.static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN])
        in_degree_table_name = StaticVariables.IN_DEGREE_DYNAMODB_TABLE_NAME % (
            job_name, self.submission_time)
        in_degree_obj.delete_in_degree_table(in_degree_table_name)
        in_degree_obj.create_in_degree_table(in_degree_table_name)
        in_degree_obj.initialise_in_degree_table(in_degree_table_name, self.in_degrees)

    if StaticVariables.OPTIMISATION_FN not in self.static_job_info \
            or not self.static_job_info[StaticVariables.OPTIMISATION_FN]:
        stage_progress_obj = stage_progress.StageProgress(
            in_lambda=self.is_serverless,
            is_local_testing=self.static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN])
        stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (
            job_name, self.submission_time)
        stage_progress_obj.delete_progress_table(stage_progress_table_name)
        stage_progress_obj.create_progress_table(stage_progress_table_name)
        stage_progress_obj.initialise_progress_table(stage_progress_table_name, stage_id - 1)

    if not self.is_serverless:
        delete_files(glob.glob(StaticVariables.LAMBDA_ZIP_GLOB_PATH))
        delete_files(glob.glob(StaticVariables.FUNCTIONS_PICKLE_GLOB_PATH))

    return function_lambdas, invoking_pipelines_info, num_operators
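# Stage configuration sketch: judging by the fields consumed in
# schedule_different_pipeline_next_stage, each entry produced by create_stage_config_file
# presumably resembles the dictionary below. The exact set of keys is an assumption beyond
# the ones read in that function; values are illustrative only.
#
#   stage_config[3] = {
#       "invoking_lambda_name": "jobname-prefix-map_shuffle-3",
#       "stage_type": 1,  # 1: map-shuffle (has combiner/partitioner pickles), 2: otherwise
#       "function_pickle_path": "job/map_shuffle-3.pkl",
#       "combiner_function_pickle_path": "job/combiner-3.pkl",
#       "partition_function_pickle_path": "job/partition-3.pkl",
#       "dependent_last_stage_ids": [2],
#   }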