Example No. 1
def schedule_same_pipeline_next_stage(stage_configuration, stage_id, shuffling_bucket, job_name, submission_time):
    cur_stage_config = stage_configuration[str(stage_id)]
    next_stage_config = stage_configuration[str(stage_id + 1)]
    invoking_lambda_name = next_stage_config["invoking_lambda_name"]
    next_stage_num_operators = next_stage_config["num_operators"]

    if cur_stage_config["stage_type"] == 1:
        keys_bins = get_map_shuffle_outputs(next_stage_num_operators, shuffling_bucket, job_name, stage_id)
    else:
        keys_bins = get_map_reduce_outputs(shuffling_bucket, job_name, [stage_id])

    # keys_bin_size = len(keys_bins[0])
    # for i in range(1, len(keys_bins)):
    #     assert keys_bin_size == len(keys_bins[i])

    if StaticVariables.OPTIMISATION_FN not in static_job_info \
            or not static_job_info[StaticVariables.OPTIMISATION_FN]:
        stage_progress_obj = stage_progress.StageProgress(in_lambda=True,
                                                          is_local_testing=static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN])
        stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (job_name, submission_time)
        total_num_jobs = sum([len(keys_bin) for keys_bin in keys_bins])
        stage_progress_obj.update_total_num_keys(stage_progress_table_name, stage_id + 1, total_num_jobs)

    if next_stage_config["stage_type"] == 1:
        for i in range(len(keys_bins)):
            response = lambda_client.invoke(
                FunctionName=invoking_lambda_name,
                InvocationType='Event',
                Payload=json.dumps({
                    "keys": keys_bins[i],
                    "id": i + 1,
                    "load_data_from_input": False,
                    "function_pickle_path": next_stage_config["function_pickle_path"],
                    "combiner_function_pickle_path": next_stage_config["combiner_function_pickle_path"],
                    "partition_function_pickle_path": next_stage_config["partition_function_pickle_path"]
                })
            )

    else:
        for i in range(len(keys_bins)):
            response = lambda_client.invoke(
                FunctionName=invoking_lambda_name,
                InvocationType='Event',
                Payload=json.dumps({
                    "keys": keys_bins[i],
                    "id": i + 1,
                    "load_data_from_input": False,
                    "function_pickle_path": next_stage_config["function_pickle_path"]
                })
            )

    logger.info("All operators finished in stage %s, next stage: number of operators scheduled: %s"
          % (stage_id, next_stage_num_operators))
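
The helpers get_map_shuffle_outputs and get_map_reduce_outputs are not shown here; from the way keys_bins is consumed above, each is expected to return one bin per downstream operator, where a bin is a list of object keys in the shuffling bucket. A purely hypothetical illustration of that shape (the key names are made up):

# Hypothetical data only: the real helpers list the previous stage's outputs in
# the shuffling bucket and group them per operator of the next stage.
keys_bins = [
    ["job/output-2/bin-1/1", "job/output-2/bin-1/2"],  # keys for operator 1
    ["job/output-2/bin-2/1", "job/output-2/bin-2/2"],  # keys for operator 2
]
total_num_jobs = sum(len(keys_bin) for keys_bin in keys_bins)  # == 4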
Example No. 2
def get_stage_progress():
    job_name = request.args.get('job-name')
    submission_time = request.args.get('submission-time')
    logger.info(
        "WebUI: Received request for path /stage-progress with parameters: %s, %s"
        % (job_name, submission_time))

    is_local_testing = os.environ.get("local_testing") in ('True', 'true')
    stage_progress_obj = stage_progress.StageProgress(
        in_lambda=False, is_local_testing=is_local_testing)
    stages_progress = stage_progress_obj.read_progress_table(
        StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME %
        (job_name, submission_time))
    return jsonify(stages_progress)
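
A minimal client-side sketch of how this endpoint could be queried; only the path and query-parameter names come from the handler above, while the host, port, job name and timestamp are assumptions:

import requests

# Hypothetical base URL and parameter values.
response = requests.get(
    "http://localhost:5000/stage-progress",
    params={"job-name": "word-count", "submission-time": "2020-01-01-00-00-00"},
)
print(response.json())  # per-stage progress as returned by read_progress_table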
Example No. 3
    def _invoke_pipelines(self, invoking_pipelines_info):
        job_name = self.static_job_info[StaticVariables.JOB_NAME_FN]
        if StaticVariables.OPTIMISATION_FN not in self.static_job_info \
                or not self.static_job_info[StaticVariables.OPTIMISATION_FN]:
            stage_progress_obj = stage_progress.StageProgress(
                in_lambda=self.is_serverless,
                is_local_testing=self.static_job_info[
                    StaticVariables.LOCAL_TESTING_FLAG_FN])
            stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (
                job_name, self.submission_time)
        for pipeline_id, invoking_pipeline_info in invoking_pipelines_info.items():
            logger.info("Scheduling pipeline %s" % pipeline_id)
            num_mappers = invoking_pipeline_info[1]
            batches = invoking_pipeline_info[2]
            first_function = invoking_pipeline_info[3]
            stage_id = invoking_pipeline_info[4]
            concurrent_lambdas = self.config[StaticVariables.NUM_CONCURRENT_LAMBDAS_FN] \
                if StaticVariables.NUM_CONCURRENT_LAMBDAS_FN in self.config else StaticVariables.DEFAULT_NUM_CONCURRENT_LAMBDAS

            if StaticVariables.OPTIMISATION_FN not in self.static_job_info \
                    or not self.static_job_info[StaticVariables.OPTIMISATION_FN]:
                total_num_jobs = sum([len(batch) for batch in batches])
                stage_progress_obj.update_total_num_keys(
                    stage_progress_table_name, stage_id, total_num_jobs)
            # Exec Parallel
            logger.info("Number of Mappers: %s" % num_mappers)
            pool = ThreadPool(num_mappers)
            ids = [i + 1 for i in range(num_mappers)]
            invoke_lambda_partial = partial(self.invoke_lambda, batches,
                                            first_function, stage_id)

            # Burst request handling
            mappers_executed = 0
            while mappers_executed < num_mappers:
                nm = min(concurrent_lambdas, num_mappers - mappers_executed)
                results = pool.map(invoke_lambda_partial,
                                   ids[mappers_executed:mappers_executed + nm])
                mappers_executed += nm

            pool.close()
            pool.join()

            logger.info("Pipeline %s scheduled successfully" % pipeline_id)
Example No. 4
def lambda_handler(event, _):
    logger.info("**************Map****************")
    start_time = time.time()
    io_time = 0

    src_keys = event['keys']
    load_data_from_input = event['load_data_from_input']
    mapper_id = event['id']
    map_function_pickle_path = event['function_pickle_path']

    with open(map_function_pickle_path, 'rb') as f:
        map_function = pickle.load(f)

    # create an S3 session
    if static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN]:
        s3_client = boto3.client('s3',
                                 aws_access_key_id='',
                                 aws_secret_access_key='',
                                 region_name=StaticVariables.DEFAULT_REGION,
                                 endpoint_url='http://%s:4572' %
                                 os.environ['LOCALSTACK_HOSTNAME'])
        lambda_client = boto3.client(
            'lambda',
            aws_access_key_id='',
            aws_secret_access_key='',
            region_name=StaticVariables.DEFAULT_REGION,
            endpoint_url='http://%s:4574' % os.environ['LOCALSTACK_HOSTNAME'])
    else:
        s3_client = boto3.client('s3')
        lambda_client = boto3.client('lambda')

    shuffling_bucket = static_job_info[StaticVariables.SHUFFLING_BUCKET_FN]
    job_name = static_job_info[StaticVariables.JOB_NAME_FN]

    stage_id = int(os.environ.get("stage_id"))
    total_num_stages = int(os.environ.get("total_num_stages"))
    coordinator_lambda_name = os.environ.get("coordinator_lambda_name")
    submission_time = os.environ.get("submission_time")

    logger.info("Stage: %s" % stage_id)

    if StaticVariables.OPTIMISATION_FN not in static_job_info \
            or not static_job_info[StaticVariables.OPTIMISATION_FN]:
        stage_progress_obj = stage_progress.StageProgress(
            in_lambda=True,
            is_local_testing=static_job_info[
                StaticVariables.LOCAL_TESTING_FLAG_FN])
        stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (
            job_name, submission_time)
    # aggr
    line_count = 0

    # INPUT CSV => OUTPUT JSON

    begin_time = time.time()
    interval_time = random.randint(1, 3)
    interval_num_keys_processed = 0

    outputs = []

    start_overhead = time.time() - start_time
    logger.info("Start overhead: %s" % str(start_overhead))

    if load_data_from_input:
        cur_input_handler = input_handler.get_input_handler(
            static_job_info[StaticVariables.INPUT_SOURCE_TYPE_FN],
            static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN],
            in_lambda=True)
        input_source = static_job_info[StaticVariables.INPUT_SOURCE_FN]
        for input_key in src_keys:
            io_start_time = time.time()
            input_value = cur_input_handler.read_value(input_source, input_key,
                                                       static_job_info)
            io_time += time.time() - io_start_time
            input_pair = (input_key, input_value)
            map_function(outputs, input_pair)

            # TODO: Line count can be used to verify correctness of the job. Can be removed if needed in the future.
            if StaticVariables.OPTIMISATION_FN not in static_job_info \
                    or not static_job_info[StaticVariables.OPTIMISATION_FN]:
                if static_job_info[
                        StaticVariables.INPUT_SOURCE_TYPE_FN] == "s3":
                    line_count += len(input_value.split('\n')) - 1
                elif static_job_info[
                        StaticVariables.INPUT_SOURCE_TYPE_FN] == "dynamodb":
                    line_count += 1

                interval_num_keys_processed += 1
                current_time = time.time()
                if int(current_time - begin_time) > interval_time:
                    begin_time = current_time
                    interval_time = random.randint(1, 3)
                    stage_progress_obj.increase_num_processed_keys(
                        stage_progress_table_name, stage_id,
                        interval_num_keys_processed)
                    interval_num_keys_processed = 0
    else:
        for input_key in src_keys:
            io_start_time = time.time()
            response = s3_client.get_object(Bucket=shuffling_bucket,
                                            Key=input_key)
            contents = response['Body'].read()
            input_value = json.loads(contents)
            io_time += time.time() - io_start_time
            input_pair = (input_key, input_value)
            map_function(outputs, input_pair)

            if StaticVariables.OPTIMISATION_FN not in static_job_info \
                    or not static_job_info[StaticVariables.OPTIMISATION_FN]:
                line_count += len(input_value)

                interval_num_keys_processed += 1
                current_time = time.time()
                if int(current_time - begin_time) > interval_time:
                    begin_time = current_time
                    interval_time = random.randint(1, 3)
                    stage_progress_obj.increase_num_processed_keys(
                        stage_progress_table_name, stage_id,
                        interval_num_keys_processed)
                    interval_num_keys_processed = 0

    if StaticVariables.OPTIMISATION_FN not in static_job_info \
            or not static_job_info[StaticVariables.OPTIMISATION_FN]:
        stage_progress_obj.increase_num_processed_keys(
            stage_progress_table_name, stage_id, interval_num_keys_processed)

    # timeTaken = time_in_secs * 1000000000 # in 10^9
    # s3DownloadTime = 0
    # totalProcessingTime = 0

    logger.info("Map sample outputs: %s" % str(outputs[0:10]))

    if stage_id == total_num_stages:
        cur_output_handler = output_handler.get_output_handler(
            static_job_info[StaticVariables.OUTPUT_SOURCE_TYPE_FN],
            static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN],
            in_lambda=True)
        # cur_output_handler.write_output(mapper_id, outputs, metadata, submission_time, static_job_info)
        io_start_time = time.time()
        cur_output_handler.write_output(mapper_id, outputs, {},
                                        static_job_info, submission_time)
        io_time += time.time() - io_start_time
    else:
        mapper_filename = "%s/%s-%s/%s" % (
            job_name, StaticVariables.OUTPUT_PREFIX, stage_id, mapper_id)
        # s3_client.put_object(Bucket=shuffling_bucket, Key=mapper_filename,
        #                      Body=json.dumps(outputs), Metadata=metadata)
        io_start_time = time.time()
        s3_client.put_object(Bucket=shuffling_bucket,
                             Key=mapper_filename,
                             Body=json.dumps(outputs))
        io_time += time.time() - io_start_time

        lambda_client.invoke(FunctionName=coordinator_lambda_name,
                             InvocationType='Event',
                             Payload=json.dumps({'stage_id': stage_id}))

    execution_time = time.time() - start_time
    metadata = {
        "lineCount": '%s' % line_count,
        "processingTime": '%s' % execution_time,
        "memoryUsage":
        '%s' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss,
        "numKeys": '%s' % len(src_keys),
        "ioTime": '%s' % io_time,
        "computeTime": '%s' % str(execution_time - io_time)
    }

    info_write_start_time = time.time()
    metrics_bucket = StaticVariables.METRICS_BUCKET % job_name
    execution_info_s3_key = "%s/stage-%s/%s" % (job_name, stage_id, mapper_id)
    s3_client.put_object(Bucket=metrics_bucket,
                         Key=execution_info_s3_key,
                         Body=json.dumps({}),
                         Metadata=metadata)
    logger.info("Info write time: %s" %
                str(time.time() - info_write_start_time))

    logger.info("Mapper %s finishes execution" % str(mapper_id))
    logger.info("Execution time: %s" % str(time.time() - start_time))
Example No. 5
def schedule_different_pipeline_next_stage(is_serverless_driver,
                                           stage_configuration,
                                           cur_pipeline_id, shuffling_bucket,
                                           job_name, submission_time):
    if not is_serverless_driver:
        with open(StaticVariables.PIPELINE_DEPENDENCIES_PATH) as json_file:
            adj_list = json.load(json_file)
    else:
        response = s3_client.get_object(
            Bucket=shuffling_bucket,
            Key=StaticVariables.PIPELINE_DEPENDENCIES_PATH)
        contents = response['Body'].read()
        adj_list = json.loads(contents)

    if not is_serverless_driver:
        with open(StaticVariables.PIPELINE_TO_FIRST_LAST_STAGE_PATH
                  ) as json_file:
            pipeline_first_last_stage_ids = json.load(json_file)
    else:
        response = s3_client.get_object(
            Bucket=shuffling_bucket,
            Key=StaticVariables.PIPELINE_TO_FIRST_LAST_STAGE_PATH)
        contents = response['Body'].read()
        pipeline_first_last_stage_ids = json.loads(contents)

    in_degree_obj = in_degree.InDegree(
        in_lambda=True,
        is_local_testing=static_job_info[
            StaticVariables.LOCAL_TESTING_FLAG_FN])
    stage_progress_obj = stage_progress.StageProgress(
        in_lambda=True,
        is_local_testing=static_job_info[
            StaticVariables.LOCAL_TESTING_FLAG_FN])
    stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (
        job_name, submission_time)
    for dependent_pipeline_id in adj_list[str(cur_pipeline_id)]:
        response = in_degree_obj.decrement_in_degree_table(
            StaticVariables.IN_DEGREE_DYNAMODB_TABLE_NAME %
            (job_name, submission_time), dependent_pipeline_id)
        dependent_in_degree = int(response["Attributes"]["in_degree"]["N"])
        if dependent_in_degree == 0:
            next_pipeline_first_stage_id = pipeline_first_last_stage_ids[str(
                dependent_pipeline_id)][0]
            next_stage_config = stage_configuration[str(
                next_pipeline_first_stage_id)]
            invoking_lambda_name = next_stage_config["invoking_lambda_name"]
            dependent_stage_ids = next_stage_config["dependent_last_stage_ids"]
            # The last stage of a pipeline is assumed to always be either a map or a reduce.
            keys_bins = get_map_reduce_outputs(shuffling_bucket, job_name,
                                               dependent_stage_ids)

            total_num_jobs = sum([len(keys_bin) for keys_bin in keys_bins])
            stage_progress_obj.update_total_num_keys(
                stage_progress_table_name, next_pipeline_first_stage_id,
                total_num_jobs)

            if next_stage_config["stage_type"] == 1:
                for i in range(len(keys_bins)):
                    response = lambda_client.invoke(
                        FunctionName=invoking_lambda_name,
                        InvocationType='Event',
                        Payload=json.dumps({
                            "keys": keys_bins[i],
                            "id": i + 1,
                            "load_data_from_input": False,
                            "function_pickle_path": next_stage_config["function_pickle_path"],
                            "combiner_function_pickle_path": next_stage_config["combiner_function_pickle_path"],
                            "partition_function_pickle_path": next_stage_config["partition_function_pickle_path"]
                        }))

            else:
                for i in range(len(keys_bins)):
                    response = lambda_client.invoke(
                        FunctionName=invoking_lambda_name,
                        InvocationType='Event',
                        Payload=json.dumps({
                            "keys": keys_bins[i],
                            "id": i + 1,
                            "load_data_from_input": False,
                            "function_pickle_path": next_stage_config["function_pickle_path"]
                        }))

            print(
                "All operators finished in pipeline %s, next pipeline: number of operators scheduled: %s"
                % (cur_pipeline_id, len(keys_bins)))
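
The handler reads response["Attributes"]["in_degree"]["N"], so decrement_in_degree_table is expected to return a DynamoDB UpdateItem response containing the updated counter. A hedged sketch of how such a decrement could look with the low-level boto3 client; the key schema and attribute name are assumptions:

import boto3

dynamodb_client = boto3.client("dynamodb")


def decrement_in_degree_table(table_name, pipeline_id):
    # Assumed schema: partition key "pipeline_id" (string), counter "in_degree" (number).
    return dynamodb_client.update_item(
        TableName=table_name,
        Key={"pipeline_id": {"S": str(pipeline_id)}},
        UpdateExpression="ADD in_degree :dec",
        ExpressionAttributeValues={":dec": {"N": "-1"}},
        ReturnValues="UPDATED_NEW",
    )


# response["Attributes"]["in_degree"]["N"] then holds the remaining in-degree.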
Example No. 6
def lambda_handler(event, _):
    logger.info("**************Map-Shuffle****************")
    start_time = time.time()
    io_time = 0

    src_keys = event['keys']
    mapper_id = event['id']
    load_data_from_input = event['load_data_from_input']
    map_function_pickle_path = event['function_pickle_path']
    combiner_function_pickle_path = event['combiner_function_pickle_path']
    partition_function_pickle_path = event['partition_function_pickle_path']

    with open(map_function_pickle_path, 'rb') as f:
        map_function = pickle.load(f)

    with open(combiner_function_pickle_path, 'rb') as f:
        combiner_function = pickle.load(f)

    with open(partition_function_pickle_path, 'rb') as f:
        partition_function = pickle.load(f)

    # create an S3 session
    if static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN]:
        s3_client = boto3.client('s3', aws_access_key_id='', aws_secret_access_key='',
                                 region_name=StaticVariables.DEFAULT_REGION,
                                 endpoint_url='http://%s:4572' % os.environ['LOCALSTACK_HOSTNAME'])
        lambda_client = boto3.client('lambda', aws_access_key_id='', aws_secret_access_key='',
                                     region_name=StaticVariables.DEFAULT_REGION,
                                     endpoint_url='http://%s:4574' % os.environ['LOCALSTACK_HOSTNAME'])
    else:
        s3_client = boto3.client('s3')
        lambda_client = boto3.client('lambda')

    shuffling_bucket = static_job_info[StaticVariables.SHUFFLING_BUCKET_FN]
    job_name = static_job_info[StaticVariables.JOB_NAME_FN]
    use_combine = static_job_info[StaticVariables.USE_COMBINE_FLAG_FN]

    stage_id = int(os.environ.get("stage_id"))
    num_bins = int(os.environ.get("num_reducers"))
    coordinator_lambda_name = os.environ.get("coordinator_lambda_name")
    submission_time = os.environ.get("submission_time")

    logger.info("Stage: %s" % stage_id)

    if StaticVariables.OPTIMISATION_FN not in static_job_info \
            or not static_job_info[StaticVariables.OPTIMISATION_FN]:
        stage_progress_obj = stage_progress.StageProgress(in_lambda=True,
                                                          is_local_testing=static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN])
        stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (job_name, submission_time)

    # aggr
    line_count = 0
    err = ''

    # INPUT CSV => OUTPUT JSON

    begin_time = time.time()
    interval_time = random.randint(1, 3)
    interval_num_keys_processed = 0

    intermediate_data = []
    if load_data_from_input:
        cur_input_handler = input_handler.get_input_handler(static_job_info[StaticVariables.INPUT_SOURCE_TYPE_FN],
                                                            static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN],
                                                            in_lambda=True)
        input_source = static_job_info[StaticVariables.INPUT_SOURCE_FN]
        for input_key in src_keys:
            io_start_time = time.time()
            input_value = cur_input_handler.read_value(input_source, input_key, static_job_info)
            io_time += time.time() - io_start_time
            input_pair = (input_key, input_value)
            # logger.info("Before calling map function")
            # logger.info("The input key: %s" % input_key)
            map_function(intermediate_data, input_pair)
            # logger.info("After calling map function")

            if StaticVariables.OPTIMISATION_FN not in static_job_info \
                    or not static_job_info[StaticVariables.OPTIMISATION_FN]:
                # TODO: Line count can be used to verify correctness of the job. Can be removed if needed in the future.
                if static_job_info[StaticVariables.INPUT_SOURCE_TYPE_FN] == "s3":
                    line_count += len(input_value.split('\n')) - 1
                elif static_job_info[StaticVariables.INPUT_SOURCE_TYPE_FN] == "dynamodb":
                    line_count += 1

                interval_num_keys_processed += 1
                current_time = time.time()
                if int(current_time - begin_time) > interval_time:
                    begin_time = current_time
                    interval_time = random.randint(1, 3)
                    stage_progress_obj.increase_num_processed_keys(stage_progress_table_name,
                                                                   stage_id, interval_num_keys_processed)
                    interval_num_keys_processed = 0
    else:
        for input_key in src_keys:
            io_start_time = time.time()
            response = s3_client.get_object(Bucket=shuffling_bucket, Key=input_key)
            contents = response['Body'].read()
            input_value = json.loads(contents)
            io_time += time.time() - io_start_time
            input_pair = (input_key, input_value)
            map_function(intermediate_data, input_pair)

            if StaticVariables.OPTIMISATION_FN not in static_job_info \
                    or not static_job_info[StaticVariables.OPTIMISATION_FN]:
                line_count += len(input_value)

                interval_num_keys_processed += 1
                current_time = time.time()
                if int(current_time - begin_time) > interval_time:
                    begin_time = current_time
                    interval_time = random.randint(1, 3)
                    stage_progress_obj.increase_num_processed_keys(stage_progress_table_name,
                                                                   stage_id, interval_num_keys_processed)
                    interval_num_keys_processed = 0

    if StaticVariables.OPTIMISATION_FN not in static_job_info \
            or not static_job_info[StaticVariables.OPTIMISATION_FN]:
        stage_progress_obj.increase_num_processed_keys(stage_progress_table_name,
                                                       stage_id, interval_num_keys_processed)

    if use_combine:
        intermediate_data.sort(key=lambda x: x[0])

        cur_key = None
        cur_values = []
        outputs = []
        for input_key, value in intermediate_data:
            if cur_key == input_key:
                cur_values.append(value)
            else:
                if cur_key is not None:
                    combiner_function(outputs, (cur_key, cur_values))

                cur_key = input_key
                cur_values = [value]

        if cur_key is not None:
            combiner_function(outputs, (cur_key, cur_values))

    else:
        outputs = intermediate_data

    # timeTaken = time_in_secs * 1000000000 # in 10^9
    # s3DownloadTime = 0
    # totalProcessingTime = 0

    # Partition ids are from 1 to n (inclusive).
    output_partitions = [[] for _ in range(num_bins + 1)]

    logger.info("MapShuffle sample outputs: %s" % str(outputs[0:10]))

    for input_key, value in outputs:
        partition_id = partition_function(input_key, num_bins) + 1
        cur_partition = output_partitions[partition_id]
        cur_partition.append(tuple((input_key, value)))

    for i in range(1, num_bins + 1):
        partition_id = "bin-%s" % i
        mapper_filename = "%s/%s-%s/%s/%s" % (job_name, StaticVariables.OUTPUT_PREFIX, stage_id, partition_id, mapper_id)
        io_start_time = time.time()
        s3_client.put_object(Bucket=shuffling_bucket, Key=mapper_filename,
                             Body=json.dumps(output_partitions[i]))
        io_time += time.time() - io_start_time

    lambda_client.invoke(
        FunctionName=coordinator_lambda_name,
        InvocationType='Event',
        Payload=json.dumps({
            'stage_id': stage_id
        })
    )

    time_in_secs = time.time() - start_time
    metadata = {
        "lineCount": '%s' % line_count,
        "processingTime": '%s' % time_in_secs,
        "memoryUsage": '%s' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss,
        "numKeys": '%s' % len(src_keys),
        "ioTime": '%s' % io_time,
        "computeTime": '%s' % str(time_in_secs - io_time)
    }

    info_write_start_time = time.time()
    metrics_bucket = StaticVariables.METRICS_BUCKET % job_name
    execution_info_s3_key = "%s/stage-%s/%s" % (job_name, stage_id, mapper_id)
    s3_client.put_object(Bucket=metrics_bucket, Key=execution_info_s3_key,
                         Body=json.dumps({}), Metadata=metadata)
    logger.info("Info write time: %s" % str(time.time() - info_write_start_time))

    logger.info("MapShuffler %s finishes execution" % str(mapper_id))
    logger.info("Execution time: %s" % str(time.time() - start_time))
Example No. 7
def lambda_handler(event, _):
    print("**************Reduce****************")
    start_time = time.time()

    reduce_keys = event['keys']
    reducer_id = event['id']
    reduce_function_pickle_path = event['function_pickle_path']

    with open(reduce_function_pickle_path, 'rb') as f:
        reduce_function = pickle.load(f)

    # create an S3 & Dynamo session
    with open(StaticVariables.STATIC_JOB_INFO_PATH, 'r') as json_file:
        static_job_info = json.load(json_file)
    if static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN]:
        s3_client = boto3.client('s3',
                                 aws_access_key_id='',
                                 aws_secret_access_key='',
                                 region_name=StaticVariables.DEFAULT_REGION,
                                 endpoint_url='http://%s:4572' %
                                 os.environ['LOCALSTACK_HOSTNAME'])
        lambda_client = boto3.client(
            'lambda',
            aws_access_key_id='',
            aws_secret_access_key='',
            region_name=StaticVariables.DEFAULT_REGION,
            endpoint_url='http://%s:4574' % os.environ['LOCALSTACK_HOSTNAME'])
    else:
        s3_client = boto3.client('s3')
        lambda_client = boto3.client('lambda')

    shuffling_bucket = static_job_info[StaticVariables.SHUFFLING_BUCKET_FN]
    # use_combine = static_job_info[StaticVariables.USE_COMBINE_FLAG_FN]
    job_name = static_job_info[StaticVariables.JOB_NAME_FN]

    stage_id = int(os.environ.get("stage_id"))
    total_num_stages = int(os.environ.get("total_num_stages"))
    coordinator_lambda_name = os.environ.get("coordinator_lambda_name")
    submission_time = os.environ.get("submission_time")

    print("Stage:", stage_id)

    stage_progress_obj = stage_progress.StageProgress(
        in_lambda=True,
        is_local_testing=static_job_info[
            StaticVariables.LOCAL_TESTING_FLAG_FN])
    stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (
        job_name, submission_time)

    # aggr
    line_count = 0
    intermediate_data = []

    # INPUT JSON => OUTPUT JSON

    # Download and process all keys
    for key in reduce_keys:
        response = s3_client.get_object(Bucket=shuffling_bucket, Key=key)
        contents = response['Body'].read()

        for key_value in json.loads(contents):
            line_count += 1
            intermediate_data.append(key_value)

    intermediate_data.sort(key=lambda x: x[0])

    begin_time = time.time()
    interval_time = random.randint(60, 180)
    interval_num_keys_processed = 0
    # Guard against an empty batch so the per-key progress estimate never divides by zero.
    average_num_keys = max(1.0, float(len(intermediate_data)) / len(reduce_keys))

    cur_key = None
    cur_values = []
    outputs = []
    for key, value in intermediate_data:
        if cur_key == key:
            cur_values.append(value)
        else:
            if cur_key is not None:
                cur_key_outputs = []
                reduce_function(cur_key_outputs, (cur_key, cur_values))
                outputs += cur_key_outputs

            cur_key = key
            cur_values = [value]

        interval_num_keys_processed += 1
        current_time = time.time()
        if int(current_time - begin_time) > interval_time:
            begin_time = current_time
            interval_time = random.randint(1, 3)
            interval_num_files_processed = int(interval_num_keys_processed /
                                               average_num_keys)
            stage_progress_obj.increase_num_processed_keys(
                stage_progress_table_name, stage_id,
                interval_num_files_processed)
            interval_num_keys_processed = interval_num_keys_processed % average_num_keys

    if cur_key is not None:
        cur_key_outputs = []
        reduce_function(cur_key_outputs, (cur_key, cur_values))
        outputs += cur_key_outputs

    interval_num_files_processed = int(interval_num_keys_processed /
                                       average_num_keys)
    stage_progress_obj.increase_num_processed_keys(
        stage_progress_table_name, stage_id, interval_num_files_processed)

    time_in_secs = (time.time() - start_time)
    # timeTaken = time_in_secs * 1000000000 # in 10^9
    # s3DownloadTime = 0
    # totalProcessingTime = 0

    metadata = {
        "lineCount": '%s' % line_count,
        "processingTime": '%s' % time_in_secs,
        "memoryUsage":
        '%s' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss,
        "numKeys": '%s' % len(reduce_keys)
    }

    if stage_id == total_num_stages:
        cur_output_handler = output_handler.get_output_handler(
            static_job_info[StaticVariables.OUTPUT_SOURCE_TYPE_FN],
            static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN],
            in_lambda=True)
        cur_output_handler.write_output(reducer_id, outputs, metadata,
                                        static_job_info, submission_time)
    else:
        mapper_filename = "%s/%s-%s/%s" % (
            job_name, StaticVariables.OUTPUT_PREFIX, stage_id, reducer_id)
        s3_client.put_object(Bucket=shuffling_bucket,
                             Key=mapper_filename,
                             Body=json.dumps(outputs),
                             Metadata=metadata)

        lambda_client.invoke(FunctionName=coordinator_lambda_name,
                             InvocationType='Event',
                             Payload=json.dumps({'stage_id': stage_id}))
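
As with the mappers, reduce_function is user code loaded from a pickle; the loop above groups the sorted intermediate pairs by key and calls it with an output list and a (key, values) pair. A hypothetical reducer in the same shape:

def reduce_function(outputs, key_values):
    # Hypothetical user reduce function: sum all values observed for a key.
    key, values = key_values
    outputs.append((key, sum(values)))


outputs = []
reduce_function(outputs, ("hello", [1, 3, 2]))
# outputs == [("hello", 6)]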
Example No. 8
    def _create_lambdas(self):
        job_name = self.static_job_info[StaticVariables.JOB_NAME_FN]
        lambda_name_prefix = self.static_job_info[StaticVariables.LAMBDA_NAME_PREFIX_FN] \
            if StaticVariables.LAMBDA_NAME_PREFIX_FN in self.static_job_info \
            else StaticVariables.DEFAULT_LAMBDA_NAME_PREFIX
        shuffling_bucket = self.static_job_info[
            StaticVariables.SHUFFLING_BUCKET_FN]
        region = self.config[StaticVariables.REGION_FN] \
            if StaticVariables.REGION_FN in self.config else StaticVariables.DEFAULT_REGION
        stage_id = 1
        num_operators = 0
        function_lambdas = []
        stage_config = {}
        mapping_stage_id_pipeline_id = {}
        adj_list = defaultdict(list)
        self.in_degrees = {}
        invoking_pipelines_info = {}
        pipelines_last_stage_num_operators = {}
        pipelines_first_last_stage_ids = {}
        stage_type_of_operations = {}
        cur_coordinator_lambda_name = "%s-%s-%s" % (
            job_name, lambda_name_prefix, "coordinator")

        # The first function should be a map/map_shuffle function
        for pipeline_id, pipeline in self.pipelines.items():
            functions = pipeline.get_functions()
            pipeline_static_job_info = self._overwrite_existing_job_info(
                pipeline.get_config())
            # TODO: Is the next line correct?
            self.static_job_info = pipeline_static_job_info
            dependent_pipeline_ids = pipeline.get_dependent_pipeline_ids()
            for dependent_pipeline_id in dependent_pipeline_ids:
                adj_list[dependent_pipeline_id].append(pipeline_id)
                self.in_degrees[pipeline_id] = self.in_degrees.get(
                    pipeline_id, 0) + 1

            if len(dependent_pipeline_ids) == 0:
                if not self.is_serverless:
                    set_up_local_input_data(pipeline_static_job_info)
                all_keys, num_operators, batches = self._get_all_keys(
                    pipeline_static_job_info)
                first_function = functions[0]
                invoking_pipelines_info[pipeline_id] = [
                    all_keys, num_operators, batches, first_function, stage_id
                ]
            else:
                num_operators = 0
                for dependent_pipeline_id in dependent_pipeline_ids:
                    num_operators += pipelines_last_stage_num_operators[
                        dependent_pipeline_id]

            pipelines_first_last_stage_ids[pipeline_id] = [stage_id]

            for i in range(len(functions)):
                mapping_stage_id_pipeline_id[stage_id] = pipeline_id
                cur_function = functions[i]
                cur_function_zip_path = "%s-%s.zip" % (
                    cur_function.get_string(), stage_id)
                stage_type_of_operations[stage_id] = cur_function.get_string()

                # Prepare Lambda functions if driver running in local machine
                if not self.is_serverless:
                    pickle_functions_and_zip_stage(cur_function_zip_path,
                                                   cur_function, stage_id)

                cur_function_lambda_name = "%s-%s-%s-%s" % (
                    job_name, lambda_name_prefix, cur_function.get_string(),
                    stage_id)
                cur_function_lambda = lambda_manager.LambdaManager(
                    self.lambda_client, self.s3_client, region,
                    cur_function_zip_path, job_name, cur_function_lambda_name,
                    cur_function.get_handler_function_path())
                if isinstance(cur_function, MapShuffleFunction):
                    assert i + 1 < len(functions) and isinstance(
                        functions[i + 1], ReduceFunction)
                    cur_function_lambda.update_code_or_create_on_no_exist(
                        self.total_num_functions,
                        submission_time=self.submission_time,
                        coordinator_lambda_name=cur_coordinator_lambda_name,
                        stage_id=stage_id,
                        num_reducers=functions[i + 1].get_num_reducers())
                else:
                    cur_function_lambda.update_code_or_create_on_no_exist(
                        self.total_num_functions,
                        submission_time=self.submission_time,
                        coordinator_lambda_name=cur_coordinator_lambda_name,
                        stage_id=stage_id)
                function_lambdas.append(cur_function_lambda)

                # Coordinator
                cur_function_pickle_path = 'job/%s-%s.pkl' % (
                    cur_function.get_string(), stage_id)
                dependent_last_stage_ids = []
                for dependent_pipeline_id in dependent_pipeline_ids:
                    dependent_last_stage_ids.append(
                        pipelines_first_last_stage_ids[dependent_pipeline_id][1])
                if isinstance(cur_function, MapShuffleFunction):
                    partition_function_pickle_path = 'job/%s-%s.pkl' % (
                        "partition", stage_id)
                    combiner_function_pickle_path = 'job/%s-%s.pkl' % (
                        "combiner", stage_id)
                    stage_config[stage_id] = \
                        create_stage_config_file(num_operators, 1, cur_function_lambda_name,
                                                 cur_function_pickle_path, dependent_last_stage_ids,
                                                 partition_function_pickle_path,
                                                 combiner_function_pickle_path)
                else:
                    if isinstance(cur_function, ReduceFunction):
                        num_operators = cur_function.get_num_reducers()

                    stage_config[stage_id] = \
                        create_stage_config_file(num_operators, 2, cur_function_lambda_name,
                                                 cur_function_pickle_path,
                                                 dependent_last_stage_ids)

                stage_id += 1

            pipelines_first_last_stage_ids[pipeline_id].append(stage_id - 1)
            pipelines_last_stage_num_operators[pipeline_id] = num_operators

        coordinator_zip_path = StaticVariables.COORDINATOR_ZIP_PATH
        if not self.is_serverless:
            self._write_config_to_local(adj_list, mapping_stage_id_pipeline_id,
                                        pipelines_first_last_stage_ids,
                                        stage_config)

            zip.zip_lambda([StaticVariables.COORDINATOR_HANDLER_PATH],
                           coordinator_zip_path)
        else:
            self._write_config_to_s3(adj_list, mapping_stage_id_pipeline_id,
                                     pipelines_first_last_stage_ids,
                                     stage_config, shuffling_bucket)

        # Web UI information
        if StaticVariables.OPTIMISATION_FN not in self.static_job_info \
                or not self.static_job_info[StaticVariables.OPTIMISATION_FN]:
            dag_information = construct_dag_information(
                adj_list, mapping_stage_id_pipeline_id,
                pipelines_first_last_stage_ids, stage_type_of_operations)
            populate_static_job_info(self.static_job_info,
                                     len(pipelines_first_last_stage_ids),
                                     len(stage_type_of_operations),
                                     self.submission_time)
            self._write_web_ui_info(
                dag_information, stage_config, self.static_job_info,
                StaticVariables.S3_JOBS_INFORMATION_BUCKET_NAME, job_name)

        cur_coordinator_lambda = lambda_manager.LambdaManager(
            self.lambda_client, self.s3_client, region, coordinator_zip_path,
            job_name, cur_coordinator_lambda_name,
            StaticVariables.COORDINATOR_HANDLER_FUNCTION_PATH)
        cur_coordinator_lambda.update_code_or_create_on_no_exist(
            self.total_num_functions, submission_time=self.submission_time)
        # cur_coordinator_lambda.add_lambda_permission(random.randint(1, 1000), shuffling_bucket)
        # shuffling_s3_path_prefix = "%s/" % job_name
        # cur_coordinator_lambda.create_s3_event_source_notification(shuffling_bucket, shuffling_s3_path_prefix)
        # time.sleep(1)
        function_lambdas.append(cur_coordinator_lambda)

        if len(self.pipelines) > 1:
            in_degree_obj = in_degree.InDegree(
                in_lambda=self.is_serverless,
                is_local_testing=self.static_job_info[
                    StaticVariables.LOCAL_TESTING_FLAG_FN])
            in_degree_table_name = StaticVariables.IN_DEGREE_DYNAMODB_TABLE_NAME % (
                job_name, self.submission_time)
            in_degree_obj.delete_in_degree_table(in_degree_table_name)
            in_degree_obj.create_in_degree_table(in_degree_table_name)
            in_degree_obj.initialise_in_degree_table(in_degree_table_name,
                                                     self.in_degrees)

        if StaticVariables.OPTIMISATION_FN not in self.static_job_info \
                or not self.static_job_info[StaticVariables.OPTIMISATION_FN]:
            stage_progress_obj = stage_progress.StageProgress(
                in_lambda=self.is_serverless,
                is_local_testing=self.static_job_info[
                    StaticVariables.LOCAL_TESTING_FLAG_FN])
            stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (
                job_name, self.submission_time)
            stage_progress_obj.delete_progress_table(stage_progress_table_name)
            stage_progress_obj.create_progress_table(stage_progress_table_name)
            stage_progress_obj.initialise_progress_table(
                stage_progress_table_name, stage_id - 1)

        if not self.is_serverless:
            delete_files(glob.glob(StaticVariables.LAMBDA_ZIP_GLOB_PATH))
            delete_files(glob.glob(StaticVariables.FUNCTIONS_PICKLE_GLOB_PATH))

        return function_lambdas, invoking_pipelines_info, num_operators
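
To make the bookkeeping above concrete, a small hypothetical DAG of three pipelines (pipeline 3 depends on pipelines 1 and 2, each pipeline compiled into two stages) would leave structures like the following; the keys become strings once the mappings are serialised to JSON for the coordinator:

# Hypothetical example values, not produced by the framework itself.
adj_list = {1: [3], 2: [3]}        # pipeline -> downstream pipelines
in_degrees = {3: 2}                # only pipeline 3 waits on other pipelines
pipelines_first_last_stage_ids = {
    1: [1, 2],                     # pipeline 1 -> stages 1..2
    2: [3, 4],                     # pipeline 2 -> stages 3..4
    3: [5, 6],                     # pipeline 3 -> stages 5..6
}
mapping_stage_id_pipeline_id = {1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 3}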
Example No. 9
def lambda_handler(event, _):
    logger.info("**************Reduce****************")
    start_time = time.time()
    io_time = 0

    reduce_keys = event['keys']
    reducer_id = event['id']
    reduce_function_pickle_path = event['function_pickle_path']

    with open(reduce_function_pickle_path, 'rb') as f:
        reduce_function = pickle.load(f)

    # create an S3 & Dynamo session
    if static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN]:
        s3_client = boto3.client('s3',
                                 aws_access_key_id='',
                                 aws_secret_access_key='',
                                 region_name=StaticVariables.DEFAULT_REGION,
                                 endpoint_url='http://%s:4572' %
                                 os.environ['LOCALSTACK_HOSTNAME'])
        lambda_client = boto3.client(
            'lambda',
            aws_access_key_id='',
            aws_secret_access_key='',
            region_name=StaticVariables.DEFAULT_REGION,
            endpoint_url='http://%s:4574' % os.environ['LOCALSTACK_HOSTNAME'])
    else:
        s3_client = boto3.client('s3')
        lambda_client = boto3.client('lambda')

    shuffling_bucket = static_job_info[StaticVariables.SHUFFLING_BUCKET_FN]
    # use_combine = static_job_info[StaticVariables.USE_COMBINE_FLAG_FN]
    job_name = static_job_info[StaticVariables.JOB_NAME_FN]

    stage_id = int(os.environ.get("stage_id"))
    total_num_stages = int(os.environ.get("total_num_stages"))
    coordinator_lambda_name = os.environ.get("coordinator_lambda_name")
    submission_time = os.environ.get("submission_time")

    logger.info("Stage: %s" % str(stage_id))
    logger.info("Reducer id: %s" % str(reducer_id))

    if StaticVariables.OPTIMISATION_FN not in static_job_info \
            or not static_job_info[StaticVariables.OPTIMISATION_FN]:
        stage_progress_obj = stage_progress.StageProgress(
            in_lambda=True,
            is_local_testing=static_job_info[
                StaticVariables.LOCAL_TESTING_FLAG_FN])
        stage_progress_table_name = StaticVariables.STAGE_PROGRESS_DYNAMODB_TABLE_NAME % (
            job_name, submission_time)

    # aggr
    line_count = 0
    intermediate_data = []
    retry_reduce_keys = []

    # INPUT JSON => OUTPUT JSON

    # Download and process all keys
    for key in reduce_keys:
        try:
            io_start_time = time.time()
            response = s3_client.get_object(Bucket=shuffling_bucket, Key=key)
            contents = response['Body'].read()
            io_time += time.time() - io_start_time

            for key_value in json.loads(contents):
                line_count += 1
                intermediate_data.append(key_value)
        except Exception as e:
            logger.info("Key: %s" % key)
            logger.info("First time Error: %s" % str(e))
            retry_reduce_keys.append(key)

    # NOTE: Two additional retry passes over retry_reduce_keys (re-reading the
    # failed keys after time.sleep back-offs, raising RuntimeError if a key
    # still failed on the third attempt) are currently disabled; failed keys
    # are only logged above.

    intermediate_data.sort(key=lambda x: x[0])

    begin_time = time.time()
    interval_time = random.randint(60, 180)
    interval_num_keys_processed = 0
    # Guard against an empty batch so the per-key progress estimate never divides by zero.
    average_num_keys = max(1.0, float(len(intermediate_data)) / len(reduce_keys))

    cur_key = None
    cur_values = []
    outputs = []
    for key, value in intermediate_data:
        if cur_key == key:
            cur_values.append(value)
        else:
            if cur_key is not None:
                reduce_function(outputs, (cur_key, cur_values))

            cur_key = key
            cur_values = [value]

        if StaticVariables.OPTIMISATION_FN not in static_job_info \
                or not static_job_info[StaticVariables.OPTIMISATION_FN]:
            interval_num_keys_processed += 1
            current_time = time.time()
            if int(current_time - begin_time) > interval_time:
                begin_time = current_time
                interval_time = random.randint(1, 3)
                interval_num_files_processed = int(
                    interval_num_keys_processed / average_num_keys)
                stage_progress_obj.increase_num_processed_keys(
                    stage_progress_table_name, stage_id,
                    interval_num_files_processed)
                interval_num_keys_processed = interval_num_keys_processed % average_num_keys

    if cur_key is not None:
        reduce_function(outputs, (cur_key, cur_values))

    if StaticVariables.OPTIMISATION_FN not in static_job_info \
            or not static_job_info[StaticVariables.OPTIMISATION_FN]:
        interval_num_files_processed = int(interval_num_keys_processed /
                                           average_num_keys)
        stage_progress_obj.increase_num_processed_keys(
            stage_progress_table_name, stage_id, interval_num_files_processed)

    logger.info("Reduce sample outputs: %s" % str(outputs[0:10]))

    if stage_id == total_num_stages:
        cur_output_handler = output_handler.get_output_handler(
            static_job_info[StaticVariables.OUTPUT_SOURCE_TYPE_FN],
            static_job_info[StaticVariables.LOCAL_TESTING_FLAG_FN],
            in_lambda=True)
        io_start_time = time.time()
        cur_output_handler.write_output(reducer_id, outputs, {},
                                        static_job_info, submission_time)
        io_time += time.time() - io_start_time
        logger.info("Finished writing the output")
    else:
        mapper_filename = "%s/%s-%s/%s" % (
            job_name, StaticVariables.OUTPUT_PREFIX, stage_id, reducer_id)
        io_start_time = time.time()
        s3_client.put_object(Bucket=shuffling_bucket,
                             Key=mapper_filename,
                             Body=json.dumps(outputs))
        io_time += time.time() - io_start_time

        logger.info("Finished writing the output")

        lambda_client.invoke(FunctionName=coordinator_lambda_name,
                             InvocationType='Event',
                             Payload=json.dumps({'stage_id': stage_id}))

        logger.info("Finished scheduling the coordinator Lambda function")

    time_in_secs = time.time() - start_time
    metadata = {
        "lineCount": '%s' % line_count,
        "processingTime": '%s' % time_in_secs,
        "memoryUsage":
        '%s' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss,
        "numKeys": '%s' % len(reduce_keys),
        "ioTime": '%s' % io_time,
        "computeTime": '%s' % str(time_in_secs - io_time)
    }

    info_write_start_time = time.time()
    metrics_bucket = StaticVariables.METRICS_BUCKET % job_name
    execution_info_s3_key = "%s/stage-%s/%s" % (job_name, stage_id, reducer_id)
    s3_client.put_object(Bucket=metrics_bucket,
                         Key=execution_info_s3_key,
                         Body=json.dumps({}),
                         Metadata=metadata)
    logger.info("Info write time: %s" %
                str(time.time() - info_write_start_time))

    logger.info("Reducer %s finishes execution" % str(reducer_id))
    logger.info("Execution time: %s" % str(time.time() - start_time))