Example #1
def ingest(self):
    debug_print("Ingesting directory {}".format(self.directory))
    debug_print("Ingesting the files \n{}".format(self.files))
    is_lambda = self.context[c.KEY_LAMBDA_FUNCTION] is not None
    timeout = self.__calculate_aggregate_window_timeout(self.context[c.KEY_MAX_LAMBDA_TIME])
    target_excretion_size = self.context[c.KEY_TARGET_AGGREGATION_FILE_SIZE_IN_MB]
    compression_ratio = self.context[c.KEY_CSV_PARQUET_COMPRESSION_RATIO]
    sep = self.context[c.KEY_SEPERATOR_PARTITION]
    memory_trigger = self.context[c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER]
    memory_used = mutil.get_memory_usage()
    main_filename, main_file_data, main_file_size_mb = self.__get_main_aggregate_file(self.directory, sep, target_excretion_size)
    main_file_data = self.__append(None, main_file_data)
    keys_ingested = []
    for file in self.files:
        debug_print("\tProcessing file {}".format(file))
        key_parts = KeyParts(file, sep)
        duration = datetime.datetime.utcnow() - key_parts.filename_timestamp
        if duration.total_seconds() < 300:
            debug_print("The file '{}' is {}s old.  It is too new and will be processed later to allow for S3 propagation.".format(file, duration.total_seconds()))
            continue
        keys_ingested.append(file)
        data = self.__open(file, main_file_data)
        if data is None:
            continue
        size_in_megabytes = self.__size(file)
        main_file_data = self.__append(main_file_data, data)
        del data
        gc.collect()
        current_dataframe_size = sys.getsizeof(main_file_data)
        #break conditions
        #1. Memory limit exceeded
        #2. Time window exceeded
        #3. Target excretion size hit
        main_file_size_mb += size_in_megabytes
        memory_used = mutil.get_memory_usage()
        debug_print("\t\tSize on S3: {}MB Size of new dataset: {} bytes Estimated Compression Ratio: {} Memory Used: {}% Projected Compression Size: {}MB Target Excretion Size: {}MB".format(size_in_megabytes, current_dataframe_size, compression_ratio, memory_used, main_file_size_mb, target_excretion_size))
        if util.elapsed(self.context) > timeout or memory_used > memory_trigger or main_file_size_mb > target_excretion_size:
            print "Elapsed", util.elapsed(self.context), "Start:", self.starttime, "Timeout:", timeout, "Has timed out:", util.elapsed(self.context) > timeout, "Mem Used %:", memory_used, "Max Memory %:", memory_trigger
            break

    #only save the files if we have a reasonable amount of time remaining before the lambda timeout
    debug_print("Time remaining: {}s".format(util.time_remaining(self.context)))
    debug_print("There were {} keys ingested.  The keys ingested are: \n {}".format(len(keys_ingested), keys_ingested))
    if len(keys_ingested) > 0 and util.time_remaining(self.context) > c.SAVE_WINDOW_IN_SECONDS and not main_file_data.empty:
        main_file_data = self.__convert_to_submission_df(main_file_data)
        gc.collect()
        self.__excret(self.directory, main_filename, main_file_data, sep)
        self.__delete_keys(keys_ingested)
    elif util.time_remaining(self.context) <= c.SAVE_WINDOW_IN_SECONDS:
        print "Time has run out!  We have less than {} seconds remaining before this lambda times out.  Abandoning the S3 commit to avoid file corruption.".format(c.SAVE_WINDOW_IN_SECONDS)
        print "Aggregation window (Max Lambda Execution Time * {}): {} seconds".format(c.RATIO_OF_MAX_LAMBDA_TIME, timeout)
        print "S3 Save window: {} seconds".format(c.SAVE_WINDOW_IN_SECONDS)
        print "Lambda time remaining: {} seconds".format(util.time_remaining(self.context))

    remaining_files = list(set(self.files) - set(keys_ingested))
    if len(remaining_files) > 0:
        debug_print("Re-adding the {} paths to SQS to attempt again. The paths are \n{}".format(len(remaining_files), remaining_files))
        self.__add_to_sqs(remaining_files)
    print "I've consumed everything I can in bucket '{}'".format(self.directory)
    return
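The loop above flushes on any of the three break conditions named in its comments: the aggregation time window is exceeded, the memory trigger is hit, or the target excretion file size is reached. Below is a minimal, self-contained sketch of that guard with hypothetical parameter names in place of the project's context keys; it is an illustration, not code from the original module.

def should_flush(elapsed_seconds, timeout_seconds, memory_used_pct, memory_trigger_pct, aggregate_size_mb, target_size_mb):
    #mirrors the three break conditions: time window exceeded, memory trigger hit, or target excretion size reached
    return (elapsed_seconds > timeout_seconds
            or memory_used_pct > memory_trigger_pct
            or aggregate_size_mb > target_size_mb)

#e.g. 250s elapsed of a 240s aggregation window forces a flush even though memory and size are fine
print(should_flush(250, 240, 41.0, 75.0, 96, 128))   # True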
Example #2
def ingest(event, lambdacontext):
    debug_print("Initial memory size: {} bytes and {}%".format(
        mutil.get_memory_object(), mutil.get_memory_usage()))
    amoeba = Amoeba(event, lambdacontext)
    amoeba.ingest()
    del amoeba
    gc.collect()
    debug_print("Initial memory size: {} bytes and {}%".format(
        mutil.get_memory_object(), mutil.get_memory_usage()))
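A handler of this shape only needs an event dict and an object exposing function_name, so it can be exercised locally with stubs. The sketch below is an assumption, not part of the original module; FakeLambdaContext, dummy_handler, and the event payload are made up for illustration.

import collections

FakeLambdaContext = collections.namedtuple("FakeLambdaContext", ["function_name"])

def dummy_handler(event, lambdacontext):
    #stand-in for ingest(event, lambdacontext); the real handler builds an Amoeba and calls .ingest()
    function_name = getattr(lambdacontext, "function_name", None)
    print("handler invoked by {} with event {}".format(function_name, event))

dummy_handler({"root": "tables/metrics"}, FakeLambdaContext(function_name="local-test"))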
Example #3
def process(context):
    print mutil.get_memory_object()
    write_initial_stats(context)
    process_bytes = mutil.get_process_memory_usage_bytes()
    if c.KEY_SQS_QUEUE_URL not in context or context[
            c.KEY_SQS_QUEUE_URL] is None:
        context[c.KEY_SQS].set_queue_url(lowest_load_queue=False)
    #execute at least once
    messages_to_process = None
    inflight_messages = 0
    elapsed = 0
    metric_sets = dict({})
    context[c.KEY_AGGREGATOR] = Aggregator(context, metric_sets)
    messages = []
    last_queue_size_check = context[c.KEY_FREQUENCY_TO_CHECK_TO_SPAWN_ANOTHER]
    growth_rate = last_check = 0
    last_message_count = None
    timeout = calculate_aggregate_window_timeout(context)
    value = datetime.datetime.fromtimestamp(context[c.KEY_START_TIME])
    message_processing_time = 0
    print "[{}]Using SQS queue URL '{}'".format(context[c.KEY_REQUEST_ID],
                                                context[c.KEY_SQS].queue_url)
    print "[{}]Started the consumer at {}.  The aggregation window is {} seconds.".format(
        context[c.KEY_REQUEST_ID], value.strftime('%Y-%m-%d %H:%M:%S'),
        timeout)
    while elapsed < timeout:
        if elapsed > last_check:
            last_check = elapsed + context[c.KEY_FREQUENCY_TO_CHECK_SQS_STATE]
            response = context[c.KEY_SQS].get_queue_attributes()
            inflight_messages = int(response['Attributes']
                                    ['ApproximateNumberOfMessagesNotVisible'])
            messages_to_process = int(
                response['Attributes']['ApproximateNumberOfMessages'])
            if last_message_count is None:
                last_message_count = messages_to_process
            else:
                growth_rate = last_message_count if last_message_count == 0 else float(
                    messages_to_process -
                    last_message_count) / last_message_count
                last_message_count = messages_to_process
                grow_if_threshold_hit(
                    context, growth_rate,
                    context[c.KEY_GROWTH_RATE_BEFORE_ADDING_LAMBDAS])

            #if the queue is growing slowly and is above 30,000 messages launch a new consumer
            if elapsed > last_queue_size_check:
                last_queue_size_check = elapsed + context[
                    c.KEY_FREQUENCY_TO_CHECK_TO_SPAWN_ANOTHER]
                print "[{}]\nThere are approximately {} messages that require processing.\n" \
                    "There are {} in-flight messages.\n" \
                    "{} seconds have elapsed and there is {} seconds remaining before timeout.\n" \
                    "The queue growth rate is {}\n" \
                    "{} message(s) were processed.".format(context[c.KEY_REQUEST_ID], messages_to_process,inflight_messages,round(elapsed,2),util.time_remaining(context),growth_rate,len(messages))
                if messages_to_process > context[
                        c.
                        KEY_THRESHOLD_BEFORE_SPAWN_NEW_CONSUMER] and inflight_messages <= context[
                            c.KEY_MAX_INFLIGHT_MESSAGES]:
                    print "The queue size is greater than {}. Launching another consumer.".format(
                        context[c.KEY_THRESHOLD_BEFORE_SPAWN_NEW_CONSUMER])
                    add_consumer(context)
            if last_message_count == 0:
                print "[{}]No more messages to process.".format(
                    context[c.KEY_REQUEST_ID])
                break
        messages = context[c.KEY_SQS].read_queue()

        if len(messages) > 0:
            start = time.time()
            context[c.KEY_AGGREGATOR].append_default_metrics_and_partition(
                messages)
            message_processing_time = round(
                ((time.time() - start) + message_processing_time) / 2, 4)
        else:
            if len(metric_sets) > 1:
                print "[{}]No more messages to process.".format(
                    context[c.KEY_REQUEST_ID])
                break
            else:
                print "[{}]No metric sets to process. Exiting.".format(
                    context[c.KEY_REQUEST_ID])
                return

        #start throttling the message processing when the SQS inflight messages is at 80% (16,000)
        #one queue is only allowed to have 20,000 maximum messages being processed (in-flight)
        usage = mutil.get_memory_usage()
        if inflight_messages > 16000:
            print "[{}]Stopping aggregation.  There are too many messages in flight.  Currently there are {} messages in flight.".format(
                context[c.KEY_REQUEST_ID], inflight_messages)
            break
        if usage > context[c.KEY_MEMORY_FLUSH_TRIGGER]:
            print "[{}]Stopping aggregation.  Memory safe level threshold exceeded.  The lambda is currently at {}%.".format(
                context[c.KEY_REQUEST_ID], usage)
            break
        if util.elapsed(context) + message_processing_time > timeout:
            print "[{}]Stopping aggregation.  The elapsed time and the projected message processing time exceeds the timeout window.  Messages are taking {} seconds to process.  There is {} seconds left before time out and {} seconds for aggregation.".format(
                context[c.KEY_REQUEST_ID], message_processing_time,
                util.time_remaining(context), timeout)
            break

        elapsed = util.elapsed(context)

    util.debug_print(
        "[{}]Lambda has completed the agreggation phase.  Elapsed time was {} seconds and we have {} seconds remaining. There are {} in-flight messages and {} remaining messages to process."
        .format(context[c.KEY_REQUEST_ID], elapsed,
                util.time_remaining(context), inflight_messages,
                messages_to_process))
    context[c.KEY_THREAD_POOL].wait()
    bytes_consumed = mutil.get_process_memory_usage_bytes()
    memory_usage = str(mutil.get_memory_usage())
    print mutil.get_memory_object()
    tables = metric_sets[c.KEY_TABLES]
    del metric_sets[c.KEY_TABLES]
    flush_and_delete(context, metric_sets)
    context[c.KEY_THREAD_POOL].wait()
    update_glue_crawler_datastores(context, tables)
    print mutil.get_memory_object()
    print "[{}]Elapsed time {} seconds. ".format(context[c.KEY_REQUEST_ID],
                                                 util.elapsed(context))
    print "[{}]Message processing averaged {} seconds per message. ".format(
        context[c.KEY_REQUEST_ID], message_processing_time)
    print "[{}]The process consumed {} KB of memory.".format(
        context[c.KEY_REQUEST_ID], bytes_consumed / 1024)
    print '[{}]The memory utilization was at {}%.'.format(
        context[c.KEY_REQUEST_ID], memory_usage)
    print '[{}]The process used {} KB for converting messages to parquet format.'.format(
        context[c.KEY_REQUEST_ID], (bytes_consumed - process_bytes) / 1024)
    print "[{}]The save process took {} seconds.".format(
        context[c.KEY_REQUEST_ID], context[c.CW_ATTR_SAVE_DURATION])
    print "[{}]Processed {} uncompressed bytes.".format(
        context[c.KEY_REQUEST_ID],
        context[c.KEY_AGGREGATOR].bytes_uncompressed)
    print "[{}]Processed {} metrics. ".format(context[c.KEY_REQUEST_ID],
                                              context[c.KEY_AGGREGATOR].rows)
    print "[{}]Processed {} messages. ".format(
        context[c.KEY_REQUEST_ID], context[c.KEY_AGGREGATOR].messages)
    print "[{}]Average metrics per minute {}. ".format(
        context[c.KEY_REQUEST_ID],
        round(
            context[c.KEY_AGGREGATOR].rows / util.elasped_time_in_min(context),
            2))
    print "[{}]Average messages per minute {}. ".format(
        context[c.KEY_REQUEST_ID],
        round(
            context[c.KEY_AGGREGATOR].messages /
            util.elasped_time_in_min(context), 2))
    print "[{}]Average uncompressed bytes per minute {}. ".format(
        context[c.KEY_REQUEST_ID],
        round(
            context[c.KEY_AGGREGATOR].bytes_uncompressed /
            util.elasped_time_in_min(context), 2))
    print "[{}]There are approximately {} messages that require processing.".format(
        context[c.KEY_REQUEST_ID],
        messages_to_process if messages_to_process else 0)
    print "[{}]There are {} in-flight messages.".format(
        context[c.KEY_REQUEST_ID], inflight_messages)
    print "[{}]There was {} seconds remaining before timeout. ".format(
        context[c.KEY_REQUEST_ID], util.time_remaining(context))
    del tables
    del metric_sets
    gc.collect()
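The consumer above decides whether to spawn another consumer from the relative growth of ApproximateNumberOfMessages between checks. A small standalone sketch of that growth-rate calculation follows; the helper name and the sample numbers are hypothetical.

def queue_growth_rate(previous_count, current_count):
    #mirrors the in-loop logic: a zero previous count yields 0, otherwise the relative change since the last check
    if previous_count == 0:
        return 0.0
    return float(current_count - previous_count) / previous_count

print(queue_growth_rate(30000, 33000))   # 0.1, i.e. the queue grew 10% since the last check
print(queue_growth_rate(0, 5000))        # 0.0 by the convention above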
Example #4
def ingest(event, lambdacontext):
    starttime = time.time()
    gc.collect()
    root = event.get("root", None)
    print "Initial memory size:", mutil.get_memory_object()
    print "Started amoeba with root {}".format(root)
    context = event.get("context", {})
    context[c.KEY_LAMBDA_FUNCTION] = lambdacontext.function_name if hasattr(
        lambdacontext, 'function_name') else None
    context[c.KEY_START_TIME] = starttime
    is_lambda = context[c.KEY_LAMBDA_FUNCTION] is not None
    bucket = os.environ[c.ENV_S3_STORAGE]
    crawler = Crawler(context, bucket)
    roots = crawler.crawl_from_relative(root)
    s3_fs = s3fs.S3FileSystem()
    s3 = S3(context, bucket)
    timeout = calculate_aggregate_window_timeout(
        context[c.KEY_MAX_LAMBDA_TIME])
    target_excretion_size = context[c.KEY_TARGET_AGGREGATION_FILE_SIZE_IN_MB]
    compression_ratio = context[c.KEY_CSV_PARQUET_COMPRESSION_RATIO]
    sep = context[c.KEY_SEPERATOR_PARTITION]
    memory_used = mutil.get_memory_usage()
    projected_compressed_file_size_in_mb = 0
    print "Hunting for {} seconds in bucket '{}'".format(timeout, bucket)
    for path in roots:
        #The GLUE Crawler does not work well when a single key in S3 contains varying data schemas.
        files = roots[path]
        if len(files) == 1:
            continue
        debug_print("\t\tIngesting path {}".format(path))
        df = {}
        keys_ingested = []
        data = None

        for file in files:
            debug_print("\t\t\t{}".format(file))
            key = "{}/{}".format(path, file)
            try:
                size_in_megabytes = s3.size_in_megabytes(key)
            except ClientError as e:
                if str(e.response['Error']['Code']) == '404':
                    continue
                else:
                    print "Error: ", e.response['Error']['Code'], key
                    raise e

            if size_in_megabytes > target_excretion_size:
                debug_print(
                    "Skipping file '{}'.  It has reached the targetted file size"
                    .format(key))
                continue
            size_in_bytes = size_in_megabytes * 1024 * 1024
            try:
                data = reader.read(s3_fs, bucket, key)
                keys_ingested.append(key)
            except ClientError as e:
                print e.response['Error']['Code'], "key=>", key
                #handle corrupt files, this can happen if a write did not finish correctly
                if e.message == "Seek before start of file":
                    print "Deleting corrupt file %s", key
                    s3.delete([key])
                elif e.response['Error']['Code'] == 'NoSuchKey':
                    print '{}: for key {}'.format(e.response['Error']['Code'],
                                                  key)
                else:
                    util.logger.error(e)
                continue
            for row in data.itertuples(index=True):
                row = row.__dict__
                del row['Index']
                key_parts = KeyParts(key, sep)
                uuid_key = "{}{}{}".format(row[c.PRIMARY_KEY], key_parts.event,
                                           row[c.TERTIARY_KEY])
                df_size = len(row)

                debug_print(
                    "\t\t\tSize on S3 in MB: {} Size as Dataframe: {} Ratio: {}"
                    .format(size_in_megabytes, df_size, compression_ratio))

                #a dictionary is the fastest way to create a unique set.
                if uuid_key in df:
                    debug_print(
                        "\t\t\tFound duplication in key '{}'.  Keeping the first occurrence."
                        .format(key))
                else:
                    df[uuid_key] = row

                current_dataframe_size = len(df)
                #break conditions
                #1. Memory limit exceeded
                #2. Time window exceeded
                #3. Target excretion size hit
                projected_compressed_file_size_in_mb = (
                    compression_ratio * current_dataframe_size) / 1048576.0
                memory_used = mutil.get_memory_usage()
                debug_print(
                    "\t\t\t{} seconds have elapsed.  {} kilobytes of memory have been used. The projected compressed file size is {} MB.  We are targetting an excretion file size of {} MB."
                    .format(util.elapsed(context), memory_used / 1024,
                            projected_compressed_file_size_in_mb,
                            target_excretion_size))
                if util.elapsed(context) > timeout or memory_used > context[
                        c.
                        KEY_AMOEBA_MEMORY_FLUSH_TRIGGER] or projected_compressed_file_size_in_mb > target_excretion_size:
                    break
            if util.elapsed(context) > timeout or memory_used > context[
                    c.
                    KEY_AMOEBA_MEMORY_FLUSH_TRIGGER] or projected_compressed_file_size_in_mb > target_excretion_size:
                print "Elapsed", util.elapsed(context), "Start:", context[
                    c.
                    KEY_START_TIME], "Timeout:", timeout, "Has timed out:", util.elapsed(
                        context
                    ) > timeout, "Mem Used:", memory_used, "Max Memory %:", context[
                        c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER]
                break
        if len(keys_ingested) > 0 and util.time_remaining(context) > 45:
            values = df.values()
            #since the schema hashes of the event columns are all the same, we can infer that the first row's keys match the column headers of the remaining rows
            columns = values[0].keys()
            dataset = pd.DataFrame(values, columns=columns)
            excret(s3, bucket, path, dataset, keys_ingested, sep,
                   m_schema.object_encoding(columns))
            del dataset
        elif util.time_remaining(context) <= 45:
            return
        del data
        del df
        del keys_ingested

        if util.elapsed(
                context) > timeout or mutil.get_process_memory_usage_bytes(
                ) >= c.ONE_GB_IN_BYTES:
            print "\tThe elapsed time threshold of {} seconds has been hit or the memory threshold of {} megabytes has been hit. Time: {}s, Memory: {}MB".format(
                timeout, c.ONE_GB_IN_BYTES / 1048576.0, util.elapsed(context),
                mutil.get_process_memory_usage_megabytes())
            return
    print "I've consumed everything I can in bucket '{}'".format(bucket)
    return
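The de-duplication in this example relies on a plain dict keyed by a composite of the row's primary key, event name, and tertiary key, keeping the first occurrence and discarding later duplicates. The standalone sketch below illustrates the same pattern with made-up rows and a simplified composite key.

rows = [
    {"uuid": "a", "event": "login", "value": 1},
    {"uuid": "a", "event": "login", "value": 2},   #duplicate composite key, dropped
    {"uuid": "b", "event": "login", "value": 3},
]

unique = {}
for row in rows:
    composite_key = "{}{}".format(row["uuid"], row["event"])
    if composite_key not in unique:
        unique[composite_key] = row   #keep the first occurrence only

print(len(unique))   # 2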