Example #1
# init
bucket = config["bucket"]
job_bucket = config["jobBucket"]
region = config["region"]
lambda_memory = config["lambdaMemory"]
concurrent_lambdas = config["concurrentLambdas"]

#all_keys = s3_client.list_objects(Bucket=bucket, Prefix=config["prefix"])["Contents"]

# Fetch all the keys that match the prefix
all_keys = []
for obj in s3.Bucket(bucket).objects.filter(Prefix=config["prefix"]).all():
    all_keys.append(obj)

bsize = lambdautils.compute_batch_size(all_keys, lambda_memory)
batches = lambdautils.batch_creator(all_keys, bsize)
n_mappers = len(batches)

# 2. Create the lambda functions

L_PREFIX = "BL"

# Lambda functions
mapper_lambda_name = L_PREFIX + "-mapper-" + job_id
reducer_lambda_name = L_PREFIX + "-reducer-" + job_id
rc_lambda_name = L_PREFIX + "-rc-" + job_id

# write job config
write_job_config(job_id, job_bucket, n_mappers, reducer_lambda_name,
                 config["reducer"]["handler"])
def lambda_handler(event, context):
    print("Received event: " + json.dumps(event, indent=2))

    start_time = time.time()

    # Job Bucket. We just got a notification from this bucket
    bucket = event['Records'][0]['s3']['bucket']['name']

    #key = urllib.unquote_plus(event['Records'][0]['s3']['object']['key'].encode('utf8'))
   
    config = json.loads(open('./jobinfo.json', "r").read())
    
    job_id =  config["jobId"]
    map_count = config["mapCount"] 
    r_function_name = config["reducerFunction"] 
    r_handler = config["reducerHandler"] 

    ### Get Mapper Finished Count ###
    
    # Get job files
    files = s3_client.list_objects(Bucket=bucket, Prefix=job_id)["Contents"]

    if check_job_done(files):
        print("Job done!!! Check the result file")
        # TODO:  Delete reducer and coordinator lambdas
        return
    else:
        ### Stateless Coordinator logic
        mapper_keys = get_mapper_files(files)
        print "Mappers Done so far ", len(mapper_keys)

        if map_count == len(mapper_keys):
            
            # All the mappers have finished, time to schedule the reducers
            stepInfo = get_reducer_state_info(files, job_id, bucket)

            print "stepInfo", stepInfo

            step_number = stepInfo[0];
            reducer_keys = stepInfo[1];
               
            if len(reducer_keys) == 0:
                print "Still waiting to finish Reducer step ", step_number
                return
                 
            # Compute this based on metadata of files
            r_batch_size = get_reducer_batch_size(reducer_keys)

            print("Starting the reducer step", step_number)
            print("Batch Size", r_batch_size)

            # Create Batch params for the Lambda function
            r_batch_params = lambdautils.batch_creator(reducer_keys, r_batch_size)

            # Build the lambda parameters
            n_reducers = len(r_batch_params)
            n_s3 = n_reducers * len(r_batch_params[0])
            step_id = step_number + 1

            for i in range(len(r_batch_params)):
                batch = [b['Key'] for b in r_batch_params[i]]

                # invoke the reducers asynchronously
                resp = lambda_client.invoke( 
                        FunctionName = r_function_name,
                        InvocationType = 'Event',
                        Payload =  json.dumps({
                            "bucket": bucket,
                            "keys": batch,
                            "jobBucket": bucket,
                            "jobId": job_id,
                            "nReducers": n_reducers, 
                            "stepId": step_id, 
                            "reducerId": i 
                        })
                    )
                print(resp)

            # Now write the reducer state
            fname = "%s/reducerstate.%s"  % (job_id, step_id)
            write_reducer_state(n_reducers, n_s3, bucket, fname)
        else:
            print "Still waiting for all the mappers to finish .."
def handler(event, context):
    start_time = time.time()

    # Job Bucket. We just got a notification from this bucket
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']
    print("Received event: {}:{}".format(bucket,key))
   
    idx = key.find('/')
    tmpdir = key[:idx]
    obj = s3.Object(bucket, '{}/jobinfo.json'.format(tmpdir))
    file_content = obj.get()['Body'].read().decode('utf-8')
    config = json.loads(file_content)
    
    job_id =  config["jobId"]
    map_count = config["mapCount"] 
    r_function_name = config["reducerFunction"] 
    r_handler = config["reducerHandler"] 

    ### Get Mapper Finished Count ###
    
    # Get job files
    files = s3_client.list_objects(Bucket=bucket, Prefix=job_id)["Contents"]

    if check_job_done(files):
        print("Job done!!! Check the result file")
        return
    else:
        ### Stateless Coordinator logic
        mapper_keys = get_mapper_files(files)
        print("Mappers Done so far ", len(mapper_keys))

        if map_count == len(mapper_keys):
            
            # All the mappers have finished, time to schedule the reducers
            stepInfo = get_reducer_state_info(files, job_id, bucket)

            #print("stepInfo", stepInfo)

            step_number = stepInfo[0]
            reducer_keys = stepInfo[1]
               
            if len(reducer_keys) == 0:
                print("Waiting to finish Reducer step ", step_number)
                return
                 
            # Compute this based on metadata of files
            r_batch_size = get_reducer_batch_size(reducer_keys)
                
            #print("Starting the the reducer step", step_number)
                
            # Create Batch params for the Lambda function
            r_batch_params = lambdautils.batch_creator(reducer_keys, r_batch_size)
            print("Batch Size {}, Spawning this many reducers: {}".format(r_batch_size, len(r_batch_params)))
                
            # Build the lambda parameters
            n_reducers = len(r_batch_params)
            n_s3 = n_reducers * len(r_batch_params[0])
            step_id = step_number + 1

            for i in range(len(r_batch_params)):
                batch = [b['Key'] for b in r_batch_params[i]]

                # invoke the reducers asynchronously
                resp = lambda_client.invoke( 
                        FunctionName = r_function_name,
                        InvocationType = 'Event',
                        Payload =  json.dumps({
                            "bucket": bucket,
                            "keys": batch,
                            "jobBucket": bucket,
                            "jobId": job_id,
                            "nReducers": n_reducers, 
                            "stepId": step_id, 
                            "reducerId": i 
                        })
                    )
                #print('Reducer: {}'.format(resp))

            # Now write the reducer state
            fname = "%s/reducerstate.%s"  % (job_id, step_id)
            write_reducer_state(n_reducers, n_s3, bucket, fname)
        else:
            print("Still waiting for all the mappers or reducers (if count > total_jobs (Num. of Mappers reported by driver)) to finish ..")
Example #4
def lambda_handler(event, context):
    print("Received event: " + json.dumps(event, indent=2))

    start_time = time.time()

    # Job Bucket. We just got a notification from this bucket
    bucket = event['Records'][0]['s3']['bucket']['name']

    #key = urllib.unquote_plus(event['Records'][0]['s3']['object']['key'].encode('utf8'))

    config = json.loads(open('./jobinfo.json', "r").read())

    job_id = config["jobId"]
    map_count = config["mapCount"]
    r_function_name = config["reducerFunction"]
    r_handler = config["reducerHandler"]

    ### Get Mapper Finished Count ###

    # Get job files
    files = s3_client.list_objects(Bucket=bucket, Prefix=job_id)["Contents"]

    if check_job_done(files):
        print("Job done!!! Check the result file")
        # TODO:  Delete reducer and coordinator lambdas
        return
    else:
        ### Stateless Coordinator logic
        mapper_keys = get_mapper_files(files)
        print "Mappers Done so far ", len(mapper_keys)

        if map_count == len(mapper_keys):

            # All the mappers have finished, time to schedule the reducers
            stepInfo = get_reducer_state_info(files, job_id, bucket)

            print "stepInfo", stepInfo

            step_number = stepInfo[0]
            reducer_keys = stepInfo[1]

            if len(reducer_keys) == 0:
                print "Still waiting to finish Reducer step ", step_number
                return

            # Compute this based on metadata of files
            r_batch_size = get_reducer_batch_size(reducer_keys)

            print "Starting the the reducer step", step_number
            print "Batch Size", r_batch_size

            # Create Batch params for the Lambda function
            r_batch_params = lambdautils.batch_creator(reducer_keys,
                                                       r_batch_size)

            # Build the lambda parameters
            n_reducers = len(r_batch_params)
            n_s3 = n_reducers * len(r_batch_params[0])
            step_id = step_number + 1

            for i in range(len(r_batch_params)):
                batch = [b['Key'] for b in r_batch_params[i]]

                # invoke the reducers asynchronously
                resp = lambda_client.invoke(FunctionName=r_function_name,
                                            InvocationType='Event',
                                            Payload=json.dumps({
                                                "bucket": bucket,
                                                "keys": batch,
                                                "jobBucket": bucket,
                                                "jobId": job_id,
                                                "nReducers": n_reducers,
                                                "stepId": step_id,
                                                "reducerId": i
                                            }))
                print(resp)

            # Now write the reducer state
            fname = "%s/reducerstate.%s" % (job_id, step_id)
            write_reducer_state(n_reducers, n_s3, bucket, fname)
        else:
            print "Still waiting for all the mappers to finish .."
def handler(event, context):
    start_time = time.time()

    # Job Bucket. We just got a notification from this bucket
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']
    print("Received event: {}:{}".format(bucket, key))

    idx = key.find('/')
    tmpdir = key[:idx]
    obj = s3.Object(bucket, '{}/jobinfo.json'.format(tmpdir))
    file_content = obj.get()['Body'].read().decode('utf-8')
    config = json.loads(file_content)

    job_id = config["jobId"]
    map_count = config["mapCount"]
    r_function_name = config["reducerFunction"]
    r_handler = config["reducerHandler"]

    ### Get Mapper Finished Count ###

    # Get job files
    files = s3_client.list_objects(Bucket=bucket, Prefix=job_id)["Contents"]

    if check_job_done(files):
        print("Job done!!! Check the result file")
        return
    else:
        ### Stateless Coordinator logic
        mapper_keys = get_mapper_files(files)
        print("Mappers Done so far ", len(mapper_keys))

        if map_count == len(mapper_keys):

            # All the mappers have finished, time to schedule the reducers
            stepInfo = get_reducer_state_info(files, job_id, bucket)

            #print("stepInfo", stepInfo)

            step_number = stepInfo[0]
            reducer_keys = stepInfo[1]

            if len(reducer_keys) == 0:
                print("Waiting to finish Reducer step ", step_number)
                return

            # Compute this based on metadata of files
            r_batch_size = get_reducer_batch_size(reducer_keys)

            #print("Starting the the reducer step", step_number)

            # Create Batch params for the Lambda function
            r_batch_params = lambdautils.batch_creator(reducer_keys,
                                                       r_batch_size)
            print("Batch Size {}, Spawning this many reducers: {}".format(
                r_batch_size, len(r_batch_params)))

            # Build the lambda parameters
            n_reducers = len(r_batch_params)
            n_s3 = n_reducers * len(r_batch_params[0])
            step_id = step_number + 1

            for i in range(len(r_batch_params)):
                batch = [b['Key'] for b in r_batch_params[i]]

                # invoke the reducers asynchronously
                resp = lambda_client.invoke(FunctionName=r_function_name,
                                            InvocationType='Event',
                                            Payload=json.dumps({
                                                "bucket": bucket,
                                                "keys": batch,
                                                "jobBucket": bucket,
                                                "jobId": job_id,
                                                "nReducers": n_reducers,
                                                "stepId": step_id,
                                                "reducerId": i
                                            }))
                #print('Reducer: {}'.format(resp))

            # Now write the reducer state
            fname = "%s/reducerstate.%s" % (job_id, step_id)
            write_reducer_state(n_reducers, n_s3, bucket, fname)
        else:
            print("Still waiting for all the mappers (or reducers from an earlier step, if the done count exceeds the mapper total reported by the driver) to finish ..")
def lambda_handler(event, context):
    print("Received event: " + json.dumps(event, indent=2))

    start_time = time.time()

    # Job Bucket. We just got a notification from this bucket
    bucket = event['Records'][0]['s3']['bucket']['name']
    config = json.loads(open('./jobinfo.json', "r").read())

    job_id = config["jobId"]
    map_count = config["mapCount"]
    r_function_name = config["reducerFunction"]
    r_handler = config["reducerHandler"]

    ### Get Mapper Finished Count ###

    # Get job files
    files = s3_client.list_objects(Bucket=bucket, Prefix=job_id)["Contents"]

    if check_job_done(files):
        print("Job done!!! Check the result file")
        return
    else:
        mapper_keys = get_mapper_files(files)
        print("Mappers Done so far ", len(mapper_keys))

        if map_count == len(mapper_keys):

            # All the mappers have finished, time to schedule the reducers
            stepInfo = get_reducer_state_info(files, job_id, bucket)

            print("stepInfo", stepInfo)

            step_number = stepInfo[0]
            reducer_keys = stepInfo[1]

            if len(reducer_keys) == 0:
                print("Still waiting to finish Reducer step ", step_number)
                return

            # Compute the reducer batch size based on the file metadata
            r_batch_size = get_reducer_batch_size(reducer_keys)

            print("Starting the the reducer step", step_number)
            print("Batch Size", r_batch_size)

            r_batch_params = lambdautils.batch_creator(reducer_keys,
                                                       r_batch_size)

            n_reducers = len(r_batch_params)
            n_s3 = n_reducers * len(r_batch_params[0])
            step_id = step_number + 1

            for i in range(len(r_batch_params)):
                batch = [b['Key'] for b in r_batch_params[i]]

                # Invoke the reducer Lambdas asynchronously
                resp = lambda_client.invoke(FunctionName=r_function_name,
                                            InvocationType='Event',
                                            Payload=json.dumps({
                                                "bucket": bucket,
                                                "keys": batch,
                                                "jobBucket": bucket,
                                                "jobId": job_id,
                                                "nReducers": n_reducers,
                                                "stepId": step_id,
                                                "reducerId": i
                                            }))
                print(resp)

            # Write the reducer state to S3
            fname = "%s/reducerstate.%s" % (job_id, step_id)
            write_reducer_state(n_reducers, n_s3, bucket, fname)
        else:
            print("Still waiting for all the mappers to finish ..")
Example #7
def handler(event, context):
    entry = time.time() * 1000
    logger = logging.getLogger()
    logger.setLevel(logging.WARN)
    if not event: #requires arguments
        print('No event was passed to the handler, exiting...')
        return

    if 'mapper' not in event or 'reducer' not in event:
        print('No mapper or reducer function names given, unable to proceed, exiting...')
        return

    # create an S3 session
    if not context: #calling from main
        boto3.setup_default_session(profile_name='cjk1')
    s3 = boto3.resource('s3')
    config = botocore.client.Config(connect_timeout=50, read_timeout=200)
    s3_client = boto3.client('s3',config=config)

    JOB_INFO = 'jobinfo.json'

    # 1. Get all keys to be processed  
    # init 
    endearly = 0
    if 'endearly' in event:
        endearly = int(event['endearly'])
    bucket = event["bucket"]
    dryrun = True if "dryrun" in event else False
    lambda_memory = 1536

    # Fetch all the keys that match the prefix
    all_keys = []
    for obj in s3.Bucket(bucket).objects.filter(Prefix=event["prefix"]).all():
        all_keys.append(obj)
    
    bsize = lambdautils.compute_batch_size(all_keys, lambda_memory)
    batches = lambdautils.batch_creator(all_keys, bsize)
    n_mappers = len(batches)
    if endearly > 0 and endearly < n_mappers:
        n_mappers = endearly
    print("Num. of Mappers (and Reducers) ", n_mappers)

    if dryrun: #don't go any further
        delta = (time.time() * 1000) - entry
        me_str = 'TIMER:CALL:{}:dryrun:0'.format(delta)
        logger.warn(me_str)
        return me_str

    #process the remaining arguments
    job_id = event["job_id"]
    job_bucket = event["jobBucket"]
    region = event["region"]
    run_async = True if "full_async" in event else False  # 'async' is a reserved word in Python 3
    reducer_lambda_name = event["reducer"]
    mapper_lambda_name = event["mapper"]
    
    # Write Jobdata to S3
    j_key = job_id + "/jobdata";
    data = json.dumps({
        "mapCount": n_mappers, 
        "totalS3Files": len(all_keys),
        "startTime": time.time()
        })
    write_to_s3(s3, job_bucket, j_key, data, {})
    data = json.dumps({
        "jobId": job_id,
        "jobBucket" : job_bucket,
        "mapCount": n_mappers,
        "reducerFunction": reducer_lambda_name,
        "reducerHandler": "{}.handler".format(reducer_lambda_name)
        }, indent=4)
    j_key = job_id + "/jobinfo.json"
    write_to_s3(s3, job_bucket, j_key, data, {})

    ### Execute ###
    total_lambda_secs = 0
    reducer_lambda_time = 0
    mapper_outputs = []

    if run_async:  # asynchronous invocation of mappers
        for i in range(n_mappers):
            invoke_lambda(mapper_lambda_name,batches,bucket,job_bucket,job_id,i)

    else: #synchronous invocation of mappers on parallel threads
        pool = ThreadPool(n_mappers)
        Ids = [i+1 for i in range(n_mappers)]
        invoke_lambda_partial = partial(invoke_lambda_sync,mapper_lambda_name,batches,mapper_outputs,bucket,job_bucket,job_id)
        
        # Burst request handling
        mappers_executed = 0
        concurrent_lambdas = 100  # only used by the synchronous run (use --dryrun to see how many mappers are actually needed)
        while mappers_executed < n_mappers:
            nm = min(concurrent_lambdas, n_mappers)
            results = pool.map(invoke_lambda_partial, Ids[mappers_executed: mappers_executed + nm])
            mappers_executed += nm
    
        pool.close()
        pool.join()
    
    for output in mapper_outputs:
        if 'body' in output:
            total_lambda_secs += float(output['body'][2])
        else:
            total_lambda_secs += float(output[2])
    
    if not run_async:
        # Note: wait for the job to complete so that we can compute the total cost; poll every few seconds
        # Get all reducer keys
        reducer_keys = []
        # Total execution time for reducers
    
        while True:
            job_keys = s3_client.list_objects(Bucket=job_bucket, Prefix=job_id)["Contents"]
            keys = [jk["Key"] for jk in job_keys]
            total_s3_size = sum([jk["Size"] for jk in job_keys])
            
            logger.info("checking if job is done")
        
            # check job done
            if job_id + "/result" in keys:
                reducer_lambda_time += float(s3.Object(job_bucket, job_id + "/result").metadata['processingtime'])
                for key in keys:
                    if "task/reducer" in key:
                        reducer_lambda_time += float(s3.Object(job_bucket, key).metadata['processingtime'])
                        reducer_keys.append(key)
                break
            time.sleep(5)
        
    delta = (time.time() * 1000) - entry
    me_str = 'TIMER:CALL:{}:mappers:{}:reducer:{}'.format(delta,total_lambda_secs,reducer_lambda_time)
    logger.warn(me_str)
    return me_str
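
The driver above depends on write_to_s3() and invoke_lambda_sync(), which are not shown. The sketches below are inferred from the call sites, so their signatures and payload fields are assumptions; the original helpers may differ.

def write_to_s3(s3, bucket, key, data, metadata):
    # Store a job artifact; the metadata travels with the object so later
    # polls can read e.g. processing times without downloading the body.
    s3.Bucket(bucket).put_object(Key=key, Body=data.encode("utf-8"),
                                 Metadata=metadata)

def invoke_lambda_sync(mapper_lambda_name, batches, mapper_outputs, bucket,
                       job_bucket, job_id, mapper_id):
    # Synchronous (RequestResponse) invocation: the mapper's timing data comes
    # back in the response payload and is collected into mapper_outputs.
    lambda_client = boto3.client('lambda')
    keys = [obj.key for obj in batches[mapper_id - 1]]
    resp = lambda_client.invoke(
        FunctionName=mapper_lambda_name,
        InvocationType='RequestResponse',
        Payload=json.dumps({
            "bucket": bucket,
            "keys": keys,
            "jobBucket": job_bucket,
            "jobId": job_id,
            "mapperId": mapper_id
        }))
    mapper_outputs.append(json.loads(resp['Payload'].read()))
    return mapper_outputs[-1]
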
Example #8
def handler(event, context):
    entry = time.time() * 1000
    logger = logging.getLogger()
    logger.setLevel(logging.WARN)
    if not event:  #requires arguments
        print('No event was passed to the handler, exiting...')
        return

    if 'mapper' not in event or 'reducer' not in event:
        print(
            'No mapper or reducer function names given, unable to proceed, exiting...'
        )
        return

    # create an S3 session
    if not context:  #calling from main
        boto3.setup_default_session(profile_name='cjk1')
    s3 = boto3.resource('s3')
    config = botocore.client.Config(connect_timeout=50, read_timeout=200)
    s3_client = boto3.client('s3', config=config)

    JOB_INFO = 'jobinfo.json'

    # 1. Get all keys to be processed
    # init
    endearly = 0
    if 'endearly' in event:
        endearly = int(event['endearly'])
    bucket = event["bucket"]
    dryrun = True if "dryrun" in event else False
    lambda_memory = 1536

    # Fetch all the keys that match the prefix
    all_keys = []
    for obj in s3.Bucket(bucket).objects.filter(Prefix=event["prefix"]).all():
        all_keys.append(obj)

    bsize = lambdautils.compute_batch_size(all_keys, lambda_memory)
    batches = lambdautils.batch_creator(all_keys, bsize)
    n_mappers = len(batches)
    if endearly > 0 and endearly < n_mappers:
        n_mappers = endearly
    print("Num. of Mappers (and Reducers) ", n_mappers)

    if dryrun:  #don't go any further
        delta = (time.time() * 1000) - entry
        me_str = 'TIMER:CALL:{}:dryrun:0'.format(delta)
        logger.warn(me_str)
        return me_str

    #process the remaining arguments
    job_id = event["job_id"]
    job_bucket = event["jobBucket"]
    region = event["region"]
    run_async = True if "full_async" in event else False  # 'async' is a reserved word in Python 3
    reducer_lambda_name = event["reducer"]
    mapper_lambda_name = event["mapper"]

    # Write Jobdata to S3
    j_key = job_id + "/jobdata"
    data = json.dumps({
        "mapCount": n_mappers,
        "totalS3Files": len(all_keys),
        "startTime": time.time()
    })
    write_to_s3(s3, job_bucket, j_key, data, {})
    data = json.dumps(
        {
            "jobId": job_id,
            "jobBucket": job_bucket,
            "mapCount": n_mappers,
            "reducerFunction": reducer_lambda_name,
            "reducerHandler": "{}.handler".format(reducer_lambda_name)
        },
        indent=4)
    j_key = job_id + "/jobinfo.json"
    write_to_s3(s3, job_bucket, j_key, data, {})

    ### Execute ###
    total_lambda_secs = 0
    reducer_lambda_time = 0
    mapper_outputs = []

    if run_async:  # asynchronous invocation of mappers
        for i in range(n_mappers):
            invoke_lambda(mapper_lambda_name, batches, bucket, job_bucket,
                          job_id, i)

    else:  #synchronous invocation of mappers on parallel threads
        pool = ThreadPool(n_mappers)
        Ids = [i + 1 for i in range(n_mappers)]
        invoke_lambda_partial = partial(invoke_lambda_sync, mapper_lambda_name,
                                        batches, mapper_outputs, bucket,
                                        job_bucket, job_id)

        # Burst request handling
        mappers_executed = 0
        concurrent_lambdas = 100  # only used by the synchronous run (use --dryrun to see how many mappers are actually needed)
        while mappers_executed < n_mappers:
            nm = min(concurrent_lambdas, n_mappers)
            results = pool.map(invoke_lambda_partial,
                               Ids[mappers_executed:mappers_executed + nm])
            mappers_executed += nm

        pool.close()
        pool.join()

    for output in mapper_outputs:
        if 'body' in output:
            total_lambda_secs += float(output['body'][2])
        else:
            total_lambda_secs += float(output[2])

    if not run_async:
        # Note: wait for the job to complete so that we can compute the total cost; poll every few seconds
        # Get all reducer keys
        reducer_keys = []
        # Total execution time for reducers

        while True:
            job_keys = s3_client.list_objects(Bucket=job_bucket,
                                              Prefix=job_id)["Contents"]
            keys = [jk["Key"] for jk in job_keys]
            total_s3_size = sum([jk["Size"] for jk in job_keys])

            logger.info("checking if job is done")

            # check job done
            if job_id + "/result" in keys:
                reducer_lambda_time += float(
                    s3.Object(job_bucket,
                              job_id + "/result").metadata['processingtime'])
                for key in keys:
                    if "task/reducer" in key:
                        reducer_lambda_time += float(
                            s3.Object(job_bucket,
                                      key).metadata['processingtime'])
                        reducer_keys.append(key)
                break
            time.sleep(5)

    delta = (time.time() * 1000) - entry
    me_str = 'TIMER:CALL:{}:mappers:{}:reducer:{}'.format(
        delta, total_lambda_secs, reducer_lambda_time)
    logger.warn(me_str)
    return me_str
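
Both drivers lean on lambdautils.compute_batch_size() and lambdautils.batch_creator(). The sketches below match the two-argument way they are called above; the 60% memory heuristic is an assumption, and the real module may size batches differently.

def compute_batch_size(keys, lambda_memory):
    # Rough heuristic: give each mapper about 60% of its memory for input
    # data and size batches by the average object size. Keys may be either
    # ObjectSummary instances (from objects.filter) or dicts (from list_objects).
    max_bytes_for_data = 0.6 * lambda_memory * 1024 * 1024
    total = 0.0
    for k in keys:
        total += k["Size"] if isinstance(k, dict) else k.size
    avg_object_size = total / len(keys)
    return max(1, int(max_bytes_for_data // avg_object_size))

def batch_creator(all_keys, batch_size):
    # Split the key list into consecutive chunks of at most batch_size keys.
    return [all_keys[i:i + batch_size]
            for i in range(0, len(all_keys), batch_size)]
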