Example #1
    def run(self):
        # 1. Create the aws_lambda functions
        function_lambdas, invoking_pipelines_info, num_outputs = \
            self._create_lambdas()

        cur_output_handler = output_handler.get_output_handler(
            self.static_job_info[StaticVariables.OUTPUT_SOURCE_TYPE_FN],
            self.static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN],
            self.is_serverless)
        cur_output_handler.create_output_storage(self.static_job_info,
                                                 self.submission_time)

        # Execute
        # 2. Invoke Mappers asynchronously
        self._invoke_pipelines(invoking_pipelines_info)

        # 3. Calculate approximate costs (based on execution time reported by our functions, not billed ms)
        StaticVariables.JOB_START_TIME = time.time()
        logger.info("PERFORMANCE INFO: Job setup time: %s" %
                    (StaticVariables.JOB_START_TIME -
                     StaticVariables.SETUP_START_TIME))
        self._calculate_cost(num_outputs, cur_output_handler,
                             invoking_pipelines_info)

        # 4. Delete the function lambdas
        for function_lambda in function_lambdas:
            function_lambda.delete_function()

        if StaticVariables.OPTIMISATION_FN not in self.static_job_info \
                or not self.static_job_info[StaticVariables.OPTIMISATION_FN]:
            self._update_duration()
            # 5. View one of the last stage executor's outputs
            # logger.info(cur_output_handler.get_output(3, self.static_job_info, self.submission_time))
        else:
            job_name = self.static_job_info[StaticVariables.JOB_NAME_FN]
            table_name = StaticVariables.STAGE_STATE_DYNAMODB_TABLE_NAME % (
                job_name, self.submission_time)
            self.map_phase_state.delete_state_table(table_name)

            in_degree_obj = in_degree.InDegree(
                in_lambda=self.is_serverless,
                is_local_testing=self.static_job_info[
                    StaticVariables.LOCAL_TESTING_FLAG_FN])
            in_degree_table_name = StaticVariables.IN_DEGREE_DYNAMODB_TABLE_NAME % (
                job_name, self.submission_time)
            in_degree_obj.delete_in_degree_table(in_degree_table_name)

            # metrics_bucket = StaticVariables.METRICS_BUCKET % job_name
            # self.delete_s3_objects(metrics_bucket, "")
            # self.s3_client.delete_bucket(Bucket=metrics_bucket)

        tear_down_time = time.time() - StaticVariables.TEAR_DOWN_START_TIME
        logger.info("PERFORMANCE INFO - Job tear down time: %s seconds" %
                    str(tear_down_time))
        return self.submission_time
Example #2
def get_in_degree_info():
    # Web UI endpoint: return the current in-degree of every pipeline for the job
    # identified by the 'job-name' and 'submission-time' query parameters.
    job_name = request.args.get('job-name')
    submission_time = request.args.get('submission-time')
    logger.info(
        "WebUI: Received request for path /in-degree with parameters: %s, %s" %
        (job_name, submission_time))

    is_local_testing = os.environ.get(
        "local_testing") == 'True' or os.environ.get("local_testing") == 'true'
    in_degree_obj = in_degree.InDegree(in_lambda=False,
                                       is_local_testing=is_local_testing)
    in_degrees = in_degree_obj.read_in_degree_table(
        StaticVariables.IN_DEGREE_DYNAMODB_TABLE_NAME %
        (job_name, submission_time))
    return jsonify(in_degrees)
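
The handler above reads its inputs from the query string and returns JSON with jsonify, which suggests it is exposed as a Flask view on the /in-degree path mentioned in its log message. A minimal sketch of how such a handler is typically registered, assuming a Flask app object (the app and port below are illustrative and not taken from the original project):

from flask import Flask

app = Flask(__name__)

# Expose the handler at GET /in-degree?job-name=...&submission-time=...
app.add_url_rule('/in-degree', view_func=get_in_degree_info)

if __name__ == '__main__':
    app.run(port=5000)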
Example #3
def schedule_different_pipeline_next_stage(is_serverless_driver,
                                           stage_configuration,
                                           cur_pipeline_id, shuffling_bucket,
                                           job_name, submission_time):
    # Decrement the in-degree of every pipeline that depends on the pipeline that just
    # finished; any dependent whose in-degree drops to zero has the first stage of its
    # pipeline invoked on the outputs of all its dependencies.
    if not is_serverless_driver:
        with open(StaticVariables.PIPELINE_DEPENDENCIES_PATH) as json_file:
            adj_list = json.load(json_file)
    else:
        response = s3_client.get_object(
            Bucket=shuffling_bucket,
            Key=StaticVariables.PIPELINE_DEPENDENCIES_PATH)
        contents = response['Body'].read()
        adj_list = json.loads(contents)

    if not is_serverless_driver:
        with open(StaticVariables.PIPELINE_TO_FIRST_LAST_STAGE_PATH) as json_file:
            pipeline_first_last_stage_ids = json.load(json_file)
    else:
        response = s3_client.get_object(
            Bucket=shuffling_bucket,
            Key=StaticVariables.PIPELINE_TO_FIRST_LAST_STAGE_PATH)
        contents = response['Body'].read()
        pipeline_first_last_stage_ids = json.loads(contents)

    in_degree_obj = in_degree.InDegree(
        in_lambda=True,
        is_local_testing=static_job_info[
            StaticVariables.LOCAL_TESTING_FLAG_FN])
    stage_progress_obj = stage_progress.StageProgress(
        in_lambda=True,
        is_local_testing=static_job_info[
            StaticVariables.LOCAL_TESTING_FLAG_FN])
    stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (
        job_name, submission_time)
    for dependent_pipeline_id in adj_list[str(cur_pipeline_id)]:
        response = in_degree_obj.decrement_in_degree_table(
            StaticVariables.IN_DEGREE_DYNAMODB_TABLE_NAME %
            (job_name, submission_time), dependent_pipeline_id)
        dependent_in_degree = int(response["Attributes"]["in_degree"]["N"])
        if dependent_in_degree == 0:
            next_pipeline_first_stage_id = pipeline_first_last_stage_ids[str(
                dependent_pipeline_id)][0]
            next_stage_config = stage_configuration[str(
                next_pipeline_first_stage_id)]
            invoking_lambda_name = next_stage_config["invoking_lambda_name"]
            dependent_stage_ids = next_stage_config["dependent_last_stage_ids"]
            # The last stage of a pipeline is assumed to always be either a map or a reduce.
            keys_bins = get_map_reduce_outputs(shuffling_bucket, job_name,
                                               dependent_stage_ids)

            total_num_jobs = sum([len(keys_bin) for keys_bin in keys_bins])
            stage_progress_obj.update_total_num_keys(
                stage_progress_table_name, next_pipeline_first_stage_id,
                total_num_jobs)

            if next_stage_config["stage_type"] == 1:
                for i in range(len(keys_bins)):
                    response = lambda_client.invoke(
                        FunctionName=invoking_lambda_name,
                        InvocationType='Event',
                        Payload=json.dumps({
                            "keys": keys_bins[i],
                            "id": i + 1,
                            "load_data_from_input": False,
                            "function_pickle_path":
                                next_stage_config["function_pickle_path"],
                            "combiner_function_pickle_path":
                                next_stage_config["combiner_function_pickle_path"],
                            "partition_function_pickle_path":
                                next_stage_config["partition_function_pickle_path"]
                        }))

            else:
                for i in range(len(keys_bins)):
                    response = lambda_client.invoke(
                        FunctionName=invoking_lambda_name,
                        InvocationType='Event',
                        Payload=json.dumps({
                            "keys": keys_bins[i],
                            "id": i + 1,
                            "load_data_from_input": False,
                            "function_pickle_path":
                                next_stage_config["function_pickle_path"]
                        }))

            print(
                "All operators finished in pipeline %s, next pipeline: number of operators scheduled: %s"
                % (cur_pipeline_id, len(keys_bins)))
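
On top of DynamoDB, the function above implements the standard in-degree scheme for scheduling a DAG of pipelines: when a pipeline finishes, the in-degree of each of its dependents is decremented, and any dependent that drops to zero is started. A minimal local sketch of the same idea, with plain dictionaries standing in for the DynamoDB tables and a print standing in for lambda_client.invoke (all names and values here are illustrative only):

from collections import defaultdict

# adj_list maps a pipeline to the pipelines that depend on it;
# in_degrees counts how many dependencies each pipeline is still waiting for.
adj_list = {1: [3], 2: [3], 3: []}
in_degrees = defaultdict(int, {3: 2})

def on_pipeline_finished(finished_pipeline_id):
    for dependent_id in adj_list[finished_pipeline_id]:
        in_degrees[dependent_id] -= 1
        if in_degrees[dependent_id] == 0:
            # This is the point where the real code invokes the first stage of
            # the dependent pipeline with the outputs of all its dependencies.
            print("scheduling pipeline %s" % dependent_id)

on_pipeline_finished(1)   # pipeline 3 is still waiting for pipeline 2
on_pipeline_finished(2)   # prints: scheduling pipeline 3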
Example #4
    def _create_lambdas(self):
        # Build a Lambda function for every stage of every pipeline plus the shared
        # coordinator, write the stage/DAG configuration (locally or to S3), and,
        # where needed, set up the in-degree and stage-progress DynamoDB tables
        # that track execution.
        job_name = self.static_job_info[StaticVariables.JOB_NAME_FN]
        lambda_name_prefix = self.static_job_info[StaticVariables.LAMBDA_NAME_PREFIX_FN] \
            if StaticVariables.LAMBDA_NAME_PREFIX_FN in self.static_job_info \
            else StaticVariables.DEFAULT_LAMBDA_NAME_PREFIX
        shuffling_bucket = self.static_job_info[
            StaticVariables.SHUFFLING_BUCKET_FN]
        region = self.config[StaticVariables.REGION_FN] \
            if StaticVariables.REGION_FN in self.config else StaticVariables.DEFAULT_REGION
        stage_id = 1
        num_operators = 0
        function_lambdas = []
        stage_config = {}
        mapping_stage_id_pipeline_id = {}
        adj_list = defaultdict(list)
        self.in_degrees = {}
        invoking_pipelines_info = {}
        pipelines_last_stage_num_operators = {}
        pipelines_first_last_stage_ids = {}
        stage_type_of_operations = {}
        cur_coordinator_lambda_name = "%s-%s-%s" % (
            job_name, lambda_name_prefix, "coordinator")

        # The first function should be a map/map_shuffle function
        for pipeline_id, pipeline in self.pipelines.items():
            functions = pipeline.get_functions()
            pipeline_static_job_info = self._overwrite_existing_job_info(
                pipeline.get_config())
            # TODO: Is the next line correct?
            self.static_job_info = pipeline_static_job_info
            dependent_pipeline_ids = pipeline.get_dependent_pipeline_ids()
            for dependent_pipeline_id in dependent_pipeline_ids:
                adj_list[dependent_pipeline_id].append(pipeline_id)
                self.in_degrees[pipeline_id] = self.in_degrees.get(
                    pipeline_id, 0) + 1

            if len(dependent_pipeline_ids) == 0:
                if not self.is_serverless:
                    set_up_local_input_data(pipeline_static_job_info)
                all_keys, num_operators, batches = self._get_all_keys(
                    pipeline_static_job_info)
                first_function = functions[0]
                invoking_pipelines_info[pipeline_id] = [
                    all_keys, num_operators, batches, first_function, stage_id
                ]
            else:
                num_operators = 0
                for dependent_pipeline_id in dependent_pipeline_ids:
                    num_operators += pipelines_last_stage_num_operators[
                        dependent_pipeline_id]

            pipelines_first_last_stage_ids[pipeline_id] = [stage_id]

            for i in range(len(functions)):
                mapping_stage_id_pipeline_id[stage_id] = pipeline_id
                cur_function = functions[i]
                cur_function_zip_path = "%s-%s.zip" % (
                    cur_function.get_string(), stage_id)
                stage_type_of_operations[stage_id] = cur_function.get_string()

                # Prepare Lambda functions if driver running in local machine
                if not self.is_serverless:
                    pickle_functions_and_zip_stage(cur_function_zip_path,
                                                   cur_function, stage_id)

                cur_function_lambda_name = "%s-%s-%s-%s" % (
                    job_name, lambda_name_prefix, cur_function.get_string(),
                    stage_id)
                cur_function_lambda = lambda_manager.LambdaManager(
                    self.lambda_client, self.s3_client, region,
                    cur_function_zip_path, job_name, cur_function_lambda_name,
                    cur_function.get_handler_function_path())
                if isinstance(cur_function, MapShuffleFunction):
                    assert i + 1 < len(functions) and isinstance(
                        functions[i + 1], ReduceFunction)
                    cur_function_lambda.update_code_or_create_on_no_exist(
                        self.total_num_functions,
                        submission_time=self.submission_time,
                        coordinator_lambda_name=cur_coordinator_lambda_name,
                        stage_id=stage_id,
                        num_reducers=functions[i + 1].get_num_reducers())
                else:
                    cur_function_lambda.update_code_or_create_on_no_exist(
                        self.total_num_functions,
                        submission_time=self.submission_time,
                        coordinator_lambda_name=cur_coordinator_lambda_name,
                        stage_id=stage_id)
                function_lambdas.append(cur_function_lambda)

                # Coordinator
                cur_function_pickle_path = 'job/%s-%s.pkl' % (
                    cur_function.get_string(), stage_id)
                dependent_last_stage_ids = []
                for dependent_pipeline_id in dependent_pipeline_ids:
                    dependent_last_stage_ids.append(
                        pipelines_first_last_stage_ids[dependent_pipeline_id][1])
                if isinstance(cur_function, MapShuffleFunction):
                    partition_function_pickle_path = 'job/%s-%s.pkl' % (
                        "partition", stage_id)
                    combiner_function_pickle_path = 'job/%s-%s.pkl' % (
                        "combiner", stage_id)
                    stage_config[stage_id] = \
                        create_stage_config_file(num_operators, 1, cur_function_lambda_name,
                                                 cur_function_pickle_path, dependent_last_stage_ids,
                                                 partition_function_pickle_path,
                                                 combiner_function_pickle_path)
                else:
                    if isinstance(cur_function, ReduceFunction):
                        num_operators = cur_function.get_num_reducers()

                    stage_config[stage_id] = \
                        create_stage_config_file(num_operators, 2, cur_function_lambda_name,
                                                 cur_function_pickle_path,
                                                 dependent_last_stage_ids)

                stage_id += 1

            pipelines_first_last_stage_ids[pipeline_id].append(stage_id - 1)
            pipelines_last_stage_num_operators[pipeline_id] = num_operators

        coordinator_zip_path = StaticVariables.COORDINATOR_ZIP_PATH
        if not self.is_serverless:
            self._write_config_to_local(adj_list, mapping_stage_id_pipeline_id,
                                        pipelines_first_last_stage_ids,
                                        stage_config)

            zip.zip_lambda([StaticVariables.COORDINATOR_HANDLER_PATH],
                           coordinator_zip_path)
        else:
            self._write_config_to_s3(adj_list, mapping_stage_id_pipeline_id,
                                     pipelines_first_last_stage_ids,
                                     stage_config, shuffling_bucket)

        # Web UI information
        if StaticVariables.OPTIMISATION_FN not in self.static_job_info \
                or not self.static_job_info[StaticVariables.OPTIMISATION_FN]:
            dag_information = construct_dag_information(
                adj_list, mapping_stage_id_pipeline_id,
                pipelines_first_last_stage_ids, stage_type_of_operations)
            populate_static_job_info(self.static_job_info,
                                     len(pipelines_first_last_stage_ids),
                                     len(stage_type_of_operations),
                                     self.submission_time)
            self._write_web_ui_info(
                dag_information, stage_config, self.static_job_info,
                StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME, job_name)

        cur_coordinator_lambda = lambda_manager.LambdaManager(
            self.lambda_client, self.s3_client, region, coordinator_zip_path,
            job_name, cur_coordinator_lambda_name,
            StaticVariables.COORDINATOR_HANDLER_FUNCTION_PATH)
        cur_coordinator_lambda.update_code_or_create_on_no_exist(
            self.total_num_functions, submission_time=self.submission_time)
        # cur_coordinator_lambda.add_lambda_permission(random.randint(1, 1000), shuffling_bucket)
        # shuffling_s3_path_prefix = "%s/" % job_name
        # cur_coordinator_lambda.create_s3_event_source_notification(shuffling_bucket, shuffling_s3_path_prefix)
        # time.sleep(1)
        function_lambdas.append(cur_coordinator_lambda)

        if len(self.pipelines) > 1:
            in_degree_obj = in_degree.InDegree(
                in_lambda=self.is_serverless,
                is_local_testing=self.static_job_info[
                    StaticVariables.LOCAL_TESTING_FLAG_FN])
            in_degree_table_name = StaticVariables.IN_DEGREE_DYNAMODB_TABLE_NAME % (
                job_name, self.submission_time)
            in_degree_obj.delete_in_degree_table(in_degree_table_name)
            in_degree_obj.create_in_degree_table(in_degree_table_name)
            in_degree_obj.initialise_in_degree_table(in_degree_table_name,
                                                     self.in_degrees)

        if StaticVariables.OPTIMISATION_FN not in self.static_job_info \
                or not self.static_job_info[StaticVariables.OPTIMISATION_FN]:
            stage_progress_obj = stage_progress.StageProgress(
                in_lambda=self.is_serverless,
                is_local_testing=self.static_job_info[
                    StaticVariables.LOCAL_TESTING_FLAG_FN])
            stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (
                job_name, self.submission_time)
            stage_progress_obj.delete_progress_table(stage_progress_table_name)
            stage_progress_obj.create_progress_table(stage_progress_table_name)
            stage_progress_obj.initialise_progress_table(
                stage_progress_table_name, stage_id - 1)

        if not self.is_serverless:
            delete_files(glob.glob(StaticVariables.LAMBDA_ZIP_GLOB_PATH))
            delete_files(glob.glob(StaticVariables.FUNCTIONS_PICKLE_GLOB_PATH))

        return function_lambdas, invoking_pipelines_info, num_operators
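
Example #4 produces the stage_config entries that Example #3 later reads. Combining the arguments passed to create_stage_config_file here with the keys accessed in schedule_different_pipeline_next_stage, a map-shuffle entry plausibly looks like the dictionary below. This is a reconstruction for illustration only: the values are made up, and the key under which the operator count (the first argument to create_stage_config_file) is stored is not visible in these snippets.

# Hypothetical shape of stage_config[stage_id] for a map-shuffle stage (stage_type == 1)
example_map_shuffle_stage_config = {
    "stage_type": 1,
    "invoking_lambda_name": "<job>-<prefix>-<function>-1",
    "function_pickle_path": "job/<function>-1.pkl",
    "dependent_last_stage_ids": [],
    "partition_function_pickle_path": "job/partition-1.pkl",
    "combiner_function_pickle_path": "job/combiner-1.pkl",
    # ...plus the operator count passed as the first argument
}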