Example #1
def pickle_functions_and_zip_stage(cur_function_zip_path, cur_function, stage_id):
    # Pickle the stage's user function (plus the partition and combiner functions
    # for map-shuffle stages) and zip everything the stage needs into
    # cur_function_zip_path.
    cur_function_pickle_path = 'job/%s-%s.pkl' % (cur_function.get_string(), stage_id)
    rel_function_paths = cur_function.get_rel_function_paths()
    with open(cur_function_pickle_path, 'wb') as f:
        pickle.dump(cur_function.get_function(), f)
    if isinstance(cur_function, MapShuffleFunction):
        partition_function_pickle_path = 'job/%s-%s.pkl' % ("partition", stage_id)
        with open(partition_function_pickle_path, 'wb') as f:
            pickle.dump(cur_function.get_partition_function(), f)

        combiner_function_pickle_path = 'job/%s-%s.pkl' % ("combiner", stage_id)
        with open(combiner_function_pickle_path, 'wb') as f:
            pickle.dump(cur_function.get_combiner_function(), f)

    zip.zip_lambda(rel_function_paths, cur_function_zip_path)
    return rel_function_paths
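
A minimal usage sketch of the helper above. The MapShuffleFunction instance (word_count_map_shuffle) is hypothetical and assumed to be constructed elsewhere; the zip file name follows the same "<function>-<stage_id>.zip" convention used by the callers in the later examples.

# Hypothetical caller: pickle and zip the first stage of a pipeline.
# word_count_map_shuffle is assumed to be a MapShuffleFunction built elsewhere.
stage_id = 1
zip_path = "%s-%s.zip" % (word_count_map_shuffle.get_string(), stage_id)
rel_paths = pickle_functions_and_zip_stage(zip_path, word_count_map_shuffle, stage_id)
print("Files added to the stage zip:", rel_paths)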
Example #2
    def register_driver(self, main_file_path, function_paths):
        # Pickle and zip every stage of every pipeline, deploy the coordinator
        # and driver Lambdas, and publish job metadata and source files to S3
        # for the web UI.
        stage_id = 1
        function_filepaths = []

        # The first function should be a map/map_shuffle function
        for pipeline_id, pipeline in self.pipelines.items():
            functions = pipeline.get_functions()
            pipeline_static_job_info = overwrite_existing_job_info(
                pipeline.get_config())
            self.static_job_info = pipeline_static_job_info
            dependent_pipeline_ids = pipeline.get_dependent_pipeline_ids()
            if len(dependent_pipeline_ids) == 0:
                set_up_local_input_data(pipeline_static_job_info)
            for i in range(len(functions)):
                cur_function = functions[i]
                cur_function_zip_path = "%s-%s.zip" % (
                    cur_function.get_string(), stage_id)

                # Prepare Lambda functions
                rel_function_paths = pickle_functions_and_zip_stage(
                    cur_function_zip_path, cur_function, stage_id)

                function_filepaths += rel_function_paths
                stage_id += 1

        with open(StaticVariables.SERVERLESS_PIPELINES_INFO_PATH, 'wb') as f:
            pickle.dump(self.pipelines, f)

        zip.zip_lambda([StaticVariables.COORDINATOR_HANDLER_PATH],
                       StaticVariables.COORDINATOR_ZIP_PATH)

        zip.zip_driver_lambda(StaticVariables.DRIVER_ZIP_PATH,
                              function_filepaths)

        serverless_driver = lambda_manager.LambdaManager(
            self.lambda_client, self.s3_client, self.region,
            StaticVariables.DRIVER_ZIP_PATH, self.job_name,
            self.driver_lambda_name,
            StaticVariables.SERVERLESS_DRIVER_HANDLER_FUNCTION_PATH)
        serverless_driver.update_code_or_create_on_no_exist(
            self.total_num_functions)

        registered_job_information = {
            'jobName':
            self.job_name,
            'driverLambdaName':
            self.driver_lambda_name,
            'registeredTime':
            datetime.utcnow().strftime("%Y-%m-%d_%H.%M.%S"),
            'shufflingBucket':
            self.static_job_info[StaticVariables.SHUFFLING_BUCKET_FN],
            'inputSource':
            self.static_job_info[StaticVariables.INPUT_SOURCE_FN],
            "outputSource":
            self.static_job_info[StaticVariables.OUTPUT_SOURCE_FN],
            'totalNumPipelines':
            len(self.pipelines),
            'totalNumStages':
            stage_id - 1
        }
        self.s3_client.put_object(
            Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
            Key=(StaticVariables.S3_UI_REGISTERED_JOB_INFORMATION_PATH %
                 self.job_name),
            Body=json.dumps(registered_job_information))
        self.s3_client.put_object(
            Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
            Key=(StaticVariables.S3_UI_REGISTERED_JOB_DRIVER_CONFIG_PATH %
                 self.job_name),
            Body=json.dumps(self.config))

        static_job_info_file_path = 'configuration/static-job-info.json'
        driver_file_path = 'configuration/driver.json'
        registered_job_source_info = [{
            'filePath':
            main_file_path,
            'location':
            StaticVariables.S3_UI_REGISTERED_JOB_SOURCE_FILES_PATH %
            (self.job_name, str(main_file_path))
        }, {
            'filePath':
            static_job_info_file_path,
            'location':
            StaticVariables.S3_UI_REGISTERED_JOB_SOURCE_FILES_PATH %
            (self.job_name, static_job_info_file_path)
        }, {
            'filePath':
            driver_file_path,
            'location':
            StaticVariables.S3_UI_REGISTERED_JOB_SOURCE_FILES_PATH %
            (self.job_name, driver_file_path)
        }]
        os.chdir(StaticVariables.PROJECT_WORKING_DIRECTORY)
        print("Main.py file path is: %s" % main_file_path)
        self.s3_client.upload_file(
            Filename=main_file_path,
            Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
            Key=StaticVariables.S3_UI_REGISTERED_JOB_SOURCE_FILES_PATH %
            (self.job_name, str(main_file_path)))
        os.chdir(StaticVariables.LIBRARY_WORKING_DIRECTORY)
        self.s3_client.upload_file(
            Filename=static_job_info_file_path,
            Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
            Key=StaticVariables.S3_UI_REGISTERED_JOB_SOURCE_FILES_PATH %
            (self.job_name, str(static_job_info_file_path)))
        self.s3_client.upload_file(
            Filename=driver_file_path,
            Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
            Key=StaticVariables.S3_UI_REGISTERED_JOB_SOURCE_FILES_PATH %
            (self.job_name, str(driver_file_path)))
        for function_path in function_paths:
            registered_job_source_info.append({
                'filePath':
                function_path,
                'location':
                StaticVariables.S3_UI_REGISTERED_JOB_SOURCE_FILES_PATH %
                (self.job_name, str(function_path))
            })
            self.s3_client.upload_file(
                Filename=function_path,
                Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
                Key=StaticVariables.S3_UI_REGISTERED_JOB_SOURCE_FILES_PATH %
                (self.job_name, str(function_path)))
        main_job_source_info = {
            'main': main_file_path,
            'sourceInfo': registered_job_source_info
        }
        self.s3_client.put_object(
            Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
            Key=(StaticVariables.S3_UI_REGISTERED_JOB_SOURCE_INFO_PATH %
                 self.job_name),
            Body=json.dumps(main_job_source_info))

        delete_files(glob.glob(StaticVariables.FUNCTIONS_PICKLE_GLOB_PATH))
        delete_files(glob.glob(StaticVariables.LAMBDA_ZIP_GLOB_PATH))
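
delete_files is called in these examples but not shown. A minimal sketch of such a helper, assuming it simply removes each matched path and tolerates files that are already gone:

import os

def delete_files(file_paths):
    # Remove each file produced during registration (pickles, zips).
    # Missing files are ignored so repeated clean-ups are harmless.
    for file_path in file_paths:
        try:
            os.remove(file_path)
        except FileNotFoundError:
            pass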
Example #3
    def _create_lambdas(self):
        # Create or update one Lambda per stage plus the coordinator Lambda,
        # write the stage/DAG configuration (locally or to S3), and set up the
        # DynamoDB bookkeeping tables used across pipelines.
        job_name = self.static_job_info[StaticVariables.JOB_NAME_FN]
        lambda_name_prefix = self.static_job_info[StaticVariables.LAMBDA_NAME_PREFIX_FN] \
            if StaticVariables.LAMBDA_NAME_PREFIX_FN in self.static_job_info \
            else StaticVariables.DEFAULT_LAMBDA_NAME_PREFIX
        shuffling_bucket = self.static_job_info[
            StaticVariables.SHUFFLING_BUCKET_FN]
        region = self.config[StaticVariables.REGION_FN] \
            if StaticVariables.REGION_FN in self.config else StaticVariables.DEFAULT_REGION
        stage_id = 1
        num_operators = 0
        function_lambdas = []
        stage_config = {}
        mapping_stage_id_pipeline_id = {}
        adj_list = defaultdict(list)
        self.in_degrees = {}
        invoking_pipelines_info = {}
        pipelines_last_stage_num_operators = {}
        pipelines_first_last_stage_ids = {}
        stage_type_of_operations = {}
        cur_coordinator_lambda_name = "%s-%s-%s" % (
            job_name, lambda_name_prefix, "coordinator")

        # The first function should be a map/map_shuffle function
        for pipeline_id, pipeline in self.pipelines.items():
            functions = pipeline.get_functions()
            pipeline_static_job_info = self._overwrite_existing_job_info(
                pipeline.get_config())
            # TODO: Is the next line correct?
            self.static_job_info = pipeline_static_job_info
            dependent_pipeline_ids = pipeline.get_dependent_pipeline_ids()
            for dependent_pipeline_id in dependent_pipeline_ids:
                adj_list[dependent_pipeline_id].append(pipeline_id)
                self.in_degrees[pipeline_id] = self.in_degrees.get(
                    pipeline_id, 0) + 1

            if len(dependent_pipeline_ids) == 0:
                if not self.is_serverless:
                    set_up_local_input_data(pipeline_static_job_info)
                all_keys, num_operators, batches = self._get_all_keys(
                    pipeline_static_job_info)
                first_function = functions[0]
                invoking_pipelines_info[pipeline_id] = [
                    all_keys, num_operators, batches, first_function, stage_id
                ]
            else:
                num_operators = 0
                for dependent_pipeline_id in dependent_pipeline_ids:
                    num_operators += pipelines_last_stage_num_operators[
                        dependent_pipeline_id]

            pipelines_first_last_stage_ids[pipeline_id] = [stage_id]

            for i in range(len(functions)):
                mapping_stage_id_pipeline_id[stage_id] = pipeline_id
                cur_function = functions[i]
                cur_function_zip_path = "%s-%s.zip" % (
                    cur_function.get_string(), stage_id)
                stage_type_of_operations[stage_id] = cur_function.get_string()

                # Prepare Lambda functions if the driver is running on the local machine
                if not self.is_serverless:
                    pickle_functions_and_zip_stage(cur_function_zip_path,
                                                   cur_function, stage_id)

                cur_function_lambda_name = "%s-%s-%s-%s" % (
                    job_name, lambda_name_prefix, cur_function.get_string(),
                    stage_id)
                cur_function_lambda = lambda_manager.LambdaManager(
                    self.lambda_client, self.s3_client, region,
                    cur_function_zip_path, job_name, cur_function_lambda_name,
                    cur_function.get_handler_function_path())
                if isinstance(cur_function, MapShuffleFunction):
                    assert i + 1 < len(functions) and isinstance(
                        functions[i + 1], ReduceFunction)
                    cur_function_lambda.update_code_or_create_on_no_exist(
                        self.total_num_functions,
                        submission_time=self.submission_time,
                        coordinator_lambda_name=cur_coordinator_lambda_name,
                        stage_id=stage_id,
                        num_reducers=functions[i + 1].get_num_reducers())
                else:
                    cur_function_lambda.update_code_or_create_on_no_exist(
                        self.total_num_functions,
                        submission_time=self.submission_time,
                        coordinator_lambda_name=cur_coordinator_lambda_name,
                        stage_id=stage_id)
                function_lambdas.append(cur_function_lambda)

                # Coordinator
                cur_function_pickle_path = 'job/%s-%s.pkl' % (
                    cur_function.get_string(), stage_id)
                dependent_last_stage_ids = []
                for dependent_pipeline_id in dependent_pipeline_ids:
                    dependent_last_stage_ids.append(
                        pipelines_first_last_stage_ids[dependent_pipeline_id]
                        [1])
                if isinstance(cur_function, MapShuffleFunction):
                    partition_function_pickle_path = 'job/%s-%s.pkl' % (
                        "partition", stage_id)
                    combiner_function_pickle_path = 'job/%s-%s.pkl' % (
                        "combiner", stage_id)
                    stage_config[stage_id] = \
                        create_stage_config_file(num_operators, 1, cur_function_lambda_name,
                                                 cur_function_pickle_path, dependent_last_stage_ids,
                                                 partition_function_pickle_path,
                                                 combiner_function_pickle_path)
                else:
                    if isinstance(cur_function, ReduceFunction):
                        num_operators = cur_function.get_num_reducers()

                    stage_config[stage_id] = \
                        create_stage_config_file(num_operators, 2, cur_function_lambda_name,
                                                 cur_function_pickle_path,
                                                 dependent_last_stage_ids)

                stage_id += 1

            pipelines_first_last_stage_ids[pipeline_id].append(stage_id - 1)
            pipelines_last_stage_num_operators[pipeline_id] = num_operators

        coordinator_zip_path = StaticVariables.COORDINATOR_ZIP_PATH
        if not self.is_serverless:
            self._write_config_to_local(adj_list, mapping_stage_id_pipeline_id,
                                        pipelines_first_last_stage_ids,
                                        stage_config)

            zip.zip_lambda([StaticVariables.COORDINATOR_HANDLER_PATH],
                           coordinator_zip_path)
        else:
            self._write_config_to_s3(adj_list, mapping_stage_id_pipeline_id,
                                     pipelines_first_last_stage_ids,
                                     stage_config, shuffling_bucket)

        # Web UI information
        if StaticVariables.OPTIMISATION_FN not in self.static_job_info \
                or not self.static_job_info[StaticVariables.OPTIMISATION_FN]:
            dag_information = construct_dag_information(
                adj_list, mapping_stage_id_pipeline_id,
                pipelines_first_last_stage_ids, stage_type_of_operations)
            populate_static_job_info(self.static_job_info,
                                     len(pipelines_first_last_stage_ids),
                                     len(stage_type_of_operations),
                                     self.submission_time)
            self._write_web_ui_info(
                dag_information, stage_config, self.static_job_info,
                StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME, job_name)

        cur_coordinator_lambda = lambda_manager.LambdaManager(
            self.lambda_client, self.s3_client, region, coordinator_zip_path,
            job_name, cur_coordinator_lambda_name,
            StaticVariables.COORDINATOR_HANDLER_FUNCTION_PATH)
        cur_coordinator_lambda.update_code_or_create_on_no_exist(
            self.total_num_functions, submission_time=self.submission_time)
        # cur_coordinator_lambda.add_lambda_permission(random.randint(1, 1000), shuffling_bucket)
        # shuffling_s3_path_prefix = "%s/" % job_name
        # cur_coordinator_lambda.create_s3_event_source_notification(shuffling_bucket, shuffling_s3_path_prefix)
        # time.sleep(1)
        function_lambdas.append(cur_coordinator_lambda)

        if len(self.pipelines) > 1:
            in_degree_obj = in_degree.InDegree(
                in_lambda=self.is_serverless,
                is_local_testing=self.static_job_info[
                    StaticVariables.LOCAL_TESTING_FLAG_FN])
            in_degree_table_name = StaticVariables.IN_DEGREE_DYNAMODB_TABLE_NAME % (
                job_name, self.submission_time)
            in_degree_obj.delete_in_degree_table(in_degree_table_name)
            in_degree_obj.create_in_degree_table(in_degree_table_name)
            in_degree_obj.initialise_in_degree_table(in_degree_table_name,
                                                     self.in_degrees)

        if StaticVariables.OPTIMISATION_FN not in self.static_job_info \
                or not self.static_job_info[StaticVariables.OPTIMISATION_FN]:
            stage_progress_obj = stage_progress.StageProgress(
                in_lambda=self.is_serverless,
                is_local_testing=self.static_job_info[
                    StaticVariables.LOCAL_TESTING_FLAG_FN])
            stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (
                job_name, self.submission_time)
            stage_progress_obj.delete_progress_table(stage_progress_table_name)
            stage_progress_obj.create_progress_table(stage_progress_table_name)
            stage_progress_obj.initialise_progress_table(
                stage_progress_table_name, stage_id - 1)

        if not self.is_serverless:
            delete_files(glob.glob(StaticVariables.LAMBDA_ZIP_GLOB_PATH))
            delete_files(glob.glob(StaticVariables.FUNCTIONS_PICKLE_GLOB_PATH))

        return function_lambdas, invoking_pipelines_info, num_operators
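
The adj_list/in_degrees bookkeeping above describes a DAG of pipelines. As a standalone illustration (not code from the library, and with made-up pipeline IDs), this is how such structures support a Kahn-style topological execution order, where a pipeline is released only once all of its dependencies have finished:

from collections import defaultdict, deque

# Illustrative pipeline DAG: pipeline 3 depends on pipelines 1 and 2.
adj_list = defaultdict(list, {1: [3], 2: [3]})
in_degrees = {3: 2}
all_pipeline_ids = [1, 2, 3]

# Kahn's algorithm: start with pipelines that have no dependencies.
ready = deque(p for p in all_pipeline_ids if in_degrees.get(p, 0) == 0)
execution_order = []
while ready:
    pipeline_id = ready.popleft()
    execution_order.append(pipeline_id)
    for downstream_id in adj_list[pipeline_id]:
        in_degrees[downstream_id] -= 1
        if in_degrees[downstream_id] == 0:
            ready.append(downstream_id)

print(execution_order)  # e.g. [1, 2, 3]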
Example #4
    def register_driver(self):
        # Pickle and zip every pipeline stage, deploy the coordinator and driver
        # Lambdas, and publish the job's registration metadata to S3.
        stage_id = 1
        function_filepaths = []

        # The first function should be a map/map_shuffle function
        for pipeline_id, pipeline in self.pipelines.items():
            functions = pipeline.get_functions()
            pipeline_static_job_info = overwrite_existing_job_info(
                pipeline.get_config())
            self.static_job_info = pipeline_static_job_info
            dependent_pipeline_ids = pipeline.get_dependent_pipeline_ids()
            if len(dependent_pipeline_ids) == 0:
                set_up_local_input_data(pipeline_static_job_info)
            for i in range(len(functions)):
                cur_function = functions[i]
                cur_function_zip_path = "%s-%s.zip" % (
                    cur_function.get_string(), stage_id)

                # Prepare Lambda functions for this stage
                rel_function_paths = pickle_functions_and_zip_stage(
                    cur_function_zip_path, cur_function, stage_id)

                function_filepaths += rel_function_paths
                stage_id += 1

        with open(StaticVariables.SERVERLESS_PIPELINES_INFO_PATH, 'wb') as f:
            pickle.dump(self.pipelines, f)

        zip.zip_lambda([StaticVariables.COORDINATOR_HANDLER_PATH],
                       StaticVariables.COORDINATOR_ZIP_PATH)

        zip.zip_driver_lambda(StaticVariables.DRIVER_ZIP_PATH,
                              function_filepaths)

        serverless_driver = lambda_manager.LambdaManager(
            self.lambda_client, self.s3_client, self.region,
            StaticVariables.DRIVER_ZIP_PATH, self.job_name,
            self.driver_lambda_name,
            StaticVariables.SERVERLESS_DRIVER_HANDLER_FUNCTION_PATH)
        serverless_driver.update_code_or_create_on_no_exist(
            self.total_num_functions)

        registered_job_information = {
            'jobName':
            self.job_name,
            'driverLambdaName':
            self.driver_lambda_name,
            'registeredTime':
            datetime.utcnow().strftime("%Y-%m-%d_%H.%M.%S"),
            'shufflingBucket':
            self.static_job_info[StaticVariables.SHUFFLING_BUCKET_FN],
            'inputSource':
            self.static_job_info[StaticVariables.INPUT_SOURCE_FN],
            "outputSource":
            self.static_job_info[StaticVariables.OUTPUT_SOURCE_FN],
            'totalNumPipelines':
            len(self.pipelines),
            'totalNumStages':
            stage_id - 1
        }
        self.s3_client.put_object(
            Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
            Key=(StaticVariables.S3_UI_REGISTERED_JOB_INFORMATION_PATH %
                 self.job_name),
            Body=json.dumps(registered_job_information))
        self.s3_client.put_object(
            Bucket=StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME,
            Key=(StaticVariables.S3_UI_REGISTERED_JOB_DRIVER_CONFIG_PATH %
                 self.job_name),
            Body=json.dumps(self.config))

        delete_files(glob.glob(StaticVariables.FUNCTIONS_PICKLE_GLOB_PATH))
        delete_files(glob.glob(StaticVariables.LAMBDA_ZIP_GLOB_PATH))
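
The S3 key constants used above are '%'-style templates filled in with the job name (and, for source files, a relative file path). A small sketch with hypothetical template values (the real constants live in StaticVariables) shows how the keys resolve:

# Hypothetical template values, for illustration only.
S3_UI_REGISTERED_JOB_INFORMATION_PATH = "web-ui/%s/registered-job-info.json"
S3_UI_REGISTERED_JOB_SOURCE_FILES_PATH = "web-ui/%s/source/%s"

job_name = "word-count"
print(S3_UI_REGISTERED_JOB_INFORMATION_PATH % job_name)
# -> web-ui/word-count/registered-job-info.json
print(S3_UI_REGISTERED_JOB_SOURCE_FILES_PATH % (job_name, "configuration/driver.json"))
# -> web-ui/word-count/source/configuration/driver.json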