def ingest(self):
    debug_print("Ingesting directory {}".format(self.directory))
    debug_print("Ingesting the files \n{}".format(self.files))
    is_lambda = self.context[c.KEY_LAMBDA_FUNCTION] is not None
    timeout = self.__calculate_aggregate_window_timeout(self.context[c.KEY_MAX_LAMBDA_TIME])
    target_excretion_size = self.context[c.KEY_TARGET_AGGREGATION_FILE_SIZE_IN_MB]
    compression_ratio = self.context[c.KEY_CSV_PARQUET_COMPRESSION_RATIO]
    sep = self.context[c.KEY_SEPERATOR_PARTITION]
    memory_trigger = self.context[c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER]
    memory_used = mutil.get_memory_usage()
    main_filename, main_file_data, main_file_size_mb = self.__get_main_aggregate_file(self.directory, sep, target_excretion_size)
    main_file_data = self.__append(None, main_file_data)
    keys_ingested = []
    for file in self.files:
        debug_print("\tProcessing file {}".format(file))
        key_parts = KeyParts(file, sep)
        duration = datetime.datetime.utcnow() - key_parts.filename_timestamp
        if duration.total_seconds() < 300:
            debug_print("The file '{}' is {}s old. It is too new and will be processed later to allow for S3 propagation.".format(file, duration.total_seconds()))
            continue
        keys_ingested.append(file)
        data = self.__open(file, main_file_data)
        if data is None:
            continue
        size_in_megabytes = self.__size(file)
        main_file_data = self.__append(main_file_data, data)
        del data
        gc.collect()
        current_dataframe_size = sys.getsizeof(main_file_data)
        # Break conditions:
        #   1. Memory limit exceeded
        #   2. Time window exceeded
        #   3. Target excretion size hit
        main_file_size_mb += size_in_megabytes
        memory_used = mutil.get_memory_usage()
        debug_print("\t\tSize on S3: {}MB Size of new dataset: {}bytes Estimated Compression Ratio: {} Memory Used: {}% Projected Compression Size: {}MB Target Excretion Size: {}MB".format(size_in_megabytes, current_dataframe_size, compression_ratio, memory_used, main_file_size_mb, target_excretion_size))
        if util.elapsed(self.context) > timeout or memory_used > memory_trigger or main_file_size_mb > target_excretion_size:
            print "Elapsed", util.elapsed(self.context), "Start:", self.starttime, "Timeout:", timeout, "Has timed out:", util.elapsed(self.context) > timeout, "Mem Used %:", memory_used, "Max Memory %:", memory_trigger
            break
    # Only save the files if we have a reasonable amount of time remaining before the lambda times out.
    debug_print("Time remaining: {}s".format(util.time_remaining(self.context)))
    debug_print("There were {} keys ingested. The keys ingested are: \n {}".format(len(keys_ingested), keys_ingested))
    if len(keys_ingested) > 0 and util.time_remaining(self.context) > c.SAVE_WINDOW_IN_SECONDS and not main_file_data.empty:
        main_file_data = self.__convert_to_submission_df(main_file_data)
        gc.collect()
        self.__excret(self.directory, main_filename, main_file_data, sep)
        self.__delete_keys(keys_ingested)
    elif util.time_remaining(self.context) <= c.SAVE_WINDOW_IN_SECONDS:
        print "Time has run out! We have less than {} seconds remaining before this lambda times out. Abandoning the S3 commit to avoid file corruption.".format(c.SAVE_WINDOW_IN_SECONDS)
        print "Aggregation window (Max Lambda Execution Time * {}): {} seconds".format(c.RATIO_OF_MAX_LAMBDA_TIME, timeout)
        print "S3 save window: {} seconds".format(c.SAVE_WINDOW_IN_SECONDS)
        print "Lambda time remaining: {} seconds".format(util.time_remaining(self.context))
    remaining_files = list(set(self.files) - set(keys_ingested))
    if len(remaining_files) > 0:
        debug_print("Re-adding the {} paths to SQS to attempt again. The paths are \n{}".format(len(remaining_files), remaining_files))
        self.__add_to_sqs(remaining_files)
    print "I've consumed everything I can in bucket '{}'".format(self.directory)
    return
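# Illustration only (not part of the pipeline): a minimal sketch of how the
# three stop guards in Amoeba.ingest() combine. The helper name and every
# number below are made-up placeholders, not values taken from the real
# configuration constants referenced in the code above.
def _example_ingest_stop_check():
    elapsed_seconds = 250            # stand-in for util.elapsed(self.context)
    timeout_seconds = 240            # stand-in for the aggregate window timeout
    memory_used_pct = 60             # stand-in for mutil.get_memory_usage()
    memory_trigger_pct = 75          # stand-in for the memory flush trigger
    main_file_size_mb = 96           # running total of ingested S3 object sizes
    target_excretion_size_mb = 128   # stand-in for the target aggregation file size
    # Returns True here: 250s elapsed exceeds the 240s aggregation window.
    return (elapsed_seconds > timeout_seconds                # time window exceeded
            or memory_used_pct > memory_trigger_pct          # memory limit exceeded
            or main_file_size_mb > target_excretion_size_mb) # target excretion size hit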
def ingest(event, lambdacontext):
    debug_print("Initial memory size: {} bytes and {}%".format(mutil.get_memory_object(), mutil.get_memory_usage()))
    amoeba = Amoeba(event, lambdacontext)
    amoeba.ingest()
    del amoeba
    gc.collect()
    debug_print("Final memory size: {} bytes and {}%".format(mutil.get_memory_object(), mutil.get_memory_usage()))
def process(context):
    print mutil.get_memory_object()
    write_initial_stats(context)
    process_bytes = mutil.get_process_memory_usage_bytes()
    if c.KEY_SQS_QUEUE_URL not in context or context[c.KEY_SQS_QUEUE_URL] is None:
        context[c.KEY_SQS].set_queue_url(lowest_load_queue=False)
    # Execute at least once.
    messages_to_process = None
    inflight_messages = 0
    elapsed = 0
    metric_sets = dict({})
    context[c.KEY_AGGREGATOR] = Aggregator(context, metric_sets)
    messages = []
    last_queue_size_check = context[c.KEY_FREQUENCY_TO_CHECK_TO_SPAWN_ANOTHER]
    growth_rate = last_check = 0
    last_message_count = None
    timeout = calculate_aggregate_window_timeout(context)
    value = datetime.datetime.fromtimestamp(context[c.KEY_START_TIME])
    message_processing_time = 0
    print "[{}]Using SQS queue URL '{}'".format(context[c.KEY_REQUEST_ID], context[c.KEY_SQS].queue_url)
    print "[{}]Started the consumer at {}. The aggregation window is {} seconds.".format(context[c.KEY_REQUEST_ID], value.strftime('%Y-%m-%d %H:%M:%S'), timeout)
    while elapsed < timeout:
        if elapsed > last_check:
            last_check = elapsed + context[c.KEY_FREQUENCY_TO_CHECK_SQS_STATE]
            response = context[c.KEY_SQS].get_queue_attributes()
            inflight_messages = int(response['Attributes']['ApproximateNumberOfMessagesNotVisible'])
            messages_to_process = int(response['Attributes']['ApproximateNumberOfMessages'])
            if last_message_count is None:
                last_message_count = messages_to_process
            else:
                growth_rate = last_message_count if last_message_count == 0 else float(messages_to_process - last_message_count) / last_message_count
                last_message_count = messages_to_process
                grow_if_threshold_hit(context, growth_rate, context[c.KEY_GROWTH_RATE_BEFORE_ADDING_LAMBDAS])
        # If the queue is growing slowly and is above 30,000 messages, launch a new consumer.
        if elapsed > last_queue_size_check:
            last_queue_size_check = elapsed + context[c.KEY_FREQUENCY_TO_CHECK_TO_SPAWN_ANOTHER]
            print "[{}]\nThere are approximately {} messages that require processing.\n" \
                  "There are {} in-flight messages.\n" \
                  "{} seconds have elapsed and there are {} seconds remaining before timeout.\n" \
                  "The queue growth rate is {}.\n" \
                  "{} message(s) were processed.".format(context[c.KEY_REQUEST_ID], messages_to_process, inflight_messages, round(elapsed, 2), util.time_remaining(context), growth_rate, len(messages))
            if messages_to_process > context[c.KEY_THRESHOLD_BEFORE_SPAWN_NEW_CONSUMER] and inflight_messages <= context[c.KEY_MAX_INFLIGHT_MESSAGES]:
                print "The queue size is greater than {}. Launching another consumer.".format(context[c.KEY_THRESHOLD_BEFORE_SPAWN_NEW_CONSUMER])
                add_consumer(context)
        if last_message_count == 0:
            print "[{}]No more messages to process.".format(context[c.KEY_REQUEST_ID])
            break
        messages = context[c.KEY_SQS].read_queue()
        if len(messages) > 0:
            start = time.time()
            context[c.KEY_AGGREGATOR].append_default_metrics_and_partition(messages)
            message_processing_time = round(((time.time() - start) + message_processing_time) / 2, 4)
        else:
            if len(metric_sets) > 1:
                print "[{}]No more messages to process.".format(context[c.KEY_REQUEST_ID])
                break
            else:
                print "[{}]No metric sets to process. Exiting.".format(context[c.KEY_REQUEST_ID])
                return
        # Start throttling message processing when the SQS in-flight count reaches 80% (16,000).
        # A single queue is only allowed a maximum of 20,000 in-flight messages.
        usage = mutil.get_memory_usage()
        if inflight_messages > 16000:
            print "[{}]Stopping aggregation. There are too many messages in flight. Currently there are {} messages in flight.".format(context[c.KEY_REQUEST_ID], inflight_messages)
            break
        if usage > context[c.KEY_MEMORY_FLUSH_TRIGGER]:
            print "[{}]Stopping aggregation. Memory safe level threshold exceeded. The lambda is currently at {}%.".format(context[c.KEY_REQUEST_ID], usage)
            break
        if util.elapsed(context) + message_processing_time > timeout:
            print "[{}]Stopping aggregation. The elapsed time plus the projected message processing time exceeds the timeout window. Messages are taking {} seconds to process. There are {} seconds left before timeout and {} seconds for aggregation.".format(context[c.KEY_REQUEST_ID], message_processing_time, util.time_remaining(context), timeout)
            break
        elapsed = util.elapsed(context)
    util.debug_print("[{}]Lambda has completed the aggregation phase. Elapsed time was {} seconds and we have {} seconds remaining. There are {} in-flight messages and {} remaining messages to process.".format(context[c.KEY_REQUEST_ID], elapsed, util.time_remaining(context), inflight_messages, messages_to_process))
    context[c.KEY_THREAD_POOL].wait()
    bytes_consumed = mutil.get_process_memory_usage_bytes()
    memory_usage = str(mutil.get_memory_usage())
    print mutil.get_memory_object()
    tables = metric_sets[c.KEY_TABLES]
    del metric_sets[c.KEY_TABLES]
    flush_and_delete(context, metric_sets)
    context[c.KEY_THREAD_POOL].wait()
    update_glue_crawler_datastores(context, tables)
    print mutil.get_memory_object()
    print "[{}]Elapsed time {} seconds.".format(context[c.KEY_REQUEST_ID], util.elapsed(context))
    print "[{}]Message processing averaged {} seconds per message.".format(context[c.KEY_REQUEST_ID], message_processing_time)
    print "[{}]The process consumed {} KB of memory.".format(context[c.KEY_REQUEST_ID], bytes_consumed / 1024)
    print "[{}]The memory utilization was at {}%.".format(context[c.KEY_REQUEST_ID], memory_usage)
    print "[{}]The process used {} KB for converting messages to parquet format.".format(context[c.KEY_REQUEST_ID], (bytes_consumed - process_bytes) / 1024)
    print "[{}]The save process took {} seconds.".format(context[c.KEY_REQUEST_ID], context[c.CW_ATTR_SAVE_DURATION])
    print "[{}]Processed {} uncompressed bytes.".format(context[c.KEY_REQUEST_ID], context[c.KEY_AGGREGATOR].bytes_uncompressed)
    print "[{}]Processed {} metrics.".format(context[c.KEY_REQUEST_ID], context[c.KEY_AGGREGATOR].rows)
    print "[{}]Processed {} messages.".format(context[c.KEY_REQUEST_ID], context[c.KEY_AGGREGATOR].messages)
    print "[{}]Average metrics per minute: {}.".format(context[c.KEY_REQUEST_ID], round(context[c.KEY_AGGREGATOR].rows / util.elasped_time_in_min(context), 2))
    print "[{}]Average messages per minute: {}.".format(context[c.KEY_REQUEST_ID], round(context[c.KEY_AGGREGATOR].messages / util.elasped_time_in_min(context), 2))
    print "[{}]Average uncompressed bytes per minute: {}.".format(context[c.KEY_REQUEST_ID], round(context[c.KEY_AGGREGATOR].bytes_uncompressed / util.elasped_time_in_min(context), 2))
    print "[{}]There are approximately {} messages that require processing.".format(context[c.KEY_REQUEST_ID], messages_to_process if messages_to_process else 0)
    print "[{}]There are {} in-flight messages.".format(context[c.KEY_REQUEST_ID], inflight_messages)
    print "[{}]There were {} seconds remaining before timeout.".format(context[c.KEY_REQUEST_ID], util.time_remaining(context))
    del tables
    del metric_sets
    gc.collect()
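# Illustration only (not part of the pipeline): the queue growth rate and the
# consumer spawn check computed in process() above, run on made-up message
# counts. The helper name and the two threshold values are hypothetical
# stand-ins for c.KEY_THRESHOLD_BEFORE_SPAWN_NEW_CONSUMER and
# c.KEY_MAX_INFLIGHT_MESSAGES; the real values come from the context.
def _example_growth_rate_check():
    last_message_count = 20000    # ApproximateNumberOfMessages at the previous poll
    messages_to_process = 26000   # ApproximateNumberOfMessages at the current poll
    inflight_messages = 4000      # ApproximateNumberOfMessagesNotVisible
    growth_rate = float(messages_to_process - last_message_count) / last_message_count
    print growth_rate             # 0.3: the backlog grew by 30% between polls
    spawn_threshold = 30000       # hypothetical spawn threshold
    max_inflight = 16000          # hypothetical in-flight ceiling
    # False here: the backlog has not yet crossed the spawn threshold.
    return messages_to_process > spawn_threshold and inflight_messages <= max_inflight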
def ingest(event, lambdacontext):
    starttime = time.time()
    gc.collect()
    root = event.get("root", None)
    print "Initial memory size:", mutil.get_memory_object()
    print "Started amoeba with root {}".format(root)
    context = event.get("context", {})
    context[c.KEY_LAMBDA_FUNCTION] = lambdacontext.function_name if hasattr(lambdacontext, 'function_name') else None
    context[c.KEY_START_TIME] = starttime
    is_lambda = context[c.KEY_LAMBDA_FUNCTION] is not None
    bucket = os.environ[c.ENV_S3_STORAGE]
    crawler = Crawler(context, bucket)
    roots = crawler.crawl_from_relative(root)
    s3_fs = s3fs.S3FileSystem()
    s3 = S3(context, bucket)
    timeout = calculate_aggregate_window_timeout(context[c.KEY_MAX_LAMBDA_TIME])
    target_excretion_size = context[c.KEY_TARGET_AGGREGATION_FILE_SIZE_IN_MB]
    compression_ratio = context[c.KEY_CSV_PARQUET_COMPRESSION_RATIO]
    sep = context[c.KEY_SEPERATOR_PARTITION]
    memory_used = mutil.get_memory_usage()
    projected_compressed_file_size_in_mb = 0
    print "Hunting for {} seconds in bucket '{}'".format(timeout, bucket)
    for path in roots:
        # The Glue crawler does not work well when a single key in S3 contains varying data schemas.
        files = roots[path]
        if len(files) == 1:
            continue
        debug_print("\t\tIngesting path {}".format(path))
        df = {}
        keys_ingested = []
        data = None
        for file in files:
            debug_print("\t\t\t{}".format(file))
            key = "{}/{}".format(path, file)
            try:
                size_in_megabytes = s3.size_in_megabytes(key)
            except ClientError as e:
                if str(e.response['Error']['Code']) == '404':
                    continue
                else:
                    print "Error: ", e.response['Error']['Code'], key
                    raise e
            if size_in_megabytes > target_excretion_size:
                debug_print("Skipping file '{}'. It has reached the targeted file size.".format(key))
                continue
            size_in_bytes = size_in_megabytes * 1024 * 1024
            try:
                data = reader.read(s3_fs, bucket, key)
                keys_ingested.append(key)
            except ClientError as e:
                print e.response['Error']['Code'], "key=>", key
                # Handle corrupt files; this can happen if a write did not finish correctly.
                if e.message == "Seek before start of file":
                    print "Deleting corrupt file %s" % key
                    s3.delete([key])
                elif e.response['Error']['Code'] == 'NoSuchKey':
                    print '{}: for key {}'.format(e.response['Error']['Code'], key)
                else:
                    util.logger.error(e)
                continue
            for row in data.itertuples(index=True):
                row = row.__dict__
                del row['Index']
                key_parts = KeyParts(key, sep)
                uuid_key = "{}{}{}".format(row[c.PRIMARY_KEY], key_parts.event, row[c.TERTIARY_KEY])
                df_size = len(row)
                debug_print("\t\t\tSize on S3 in MB: {} Size as Dataframe: {} Ratio: {}".format(size_in_megabytes, df_size, compression_ratio))
                # A dictionary is the fastest way to build a unique set.
                if uuid_key in df:
                    debug_print("\t\t\tFound duplication in key '{}'. Keeping the first occurrence.".format(key))
                else:
                    df[uuid_key] = row
                current_dataframe_size = len(df)
                # Break conditions:
                #   1. Memory limit exceeded
                #   2. Time window exceeded
                #   3. Target excretion size hit
                projected_compressed_file_size_in_mb = (compression_ratio * current_dataframe_size) / 1048576.0
                memory_used = mutil.get_memory_usage()
                debug_print("\t\t\t{} seconds have elapsed. {} kilobytes of memory have been used. The projected compressed file size is {} MB. We are targeting an excretion file size of {} MB.".format(util.elapsed(context), memory_used / 1024, projected_compressed_file_size_in_mb, target_excretion_size))
                if util.elapsed(context) > timeout or memory_used > context[c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER] or projected_compressed_file_size_in_mb > target_excretion_size:
                    break
            if util.elapsed(context) > timeout or memory_used > context[c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER] or projected_compressed_file_size_in_mb > target_excretion_size:
                print "Elapsed", util.elapsed(context), "Start:", context[c.KEY_START_TIME], "Timeout:", timeout, "Has timed out:", util.elapsed(context) > timeout, "Mem Used:", memory_used, "Max Memory %:", context[c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER]
                break
        if len(keys_ingested) > 0 and util.time_remaining(context) > 45:
            values = df.values()
            # Since the schema hashes of the event columns are all the same, values[0].keys() has the same column headers as the rest.
            columns = values[0].keys()
            set = pd.DataFrame(values, columns=columns)
            excret(s3, bucket, path, set, keys_ingested, sep, m_schema.object_encoding(columns))
            del set
        elif util.time_remaining(context) <= 45:
            return
        del data
        del df
        del keys_ingested
        if util.elapsed(context) > timeout or mutil.get_process_memory_usage_bytes() >= c.ONE_GB_IN_BYTES:
            print "\tThe elapsed time threshold of {} seconds has been hit or the memory threshold of {} megabytes has been hit. Time: {}s, Memory: {}MB".format(timeout, c.ONE_GB_IN_BYTES / 1048576.0, util.elapsed(context), mutil.get_process_memory_usage_megabytes())
            return
    print "I've consumed everything I can in bucket '{}'".format(bucket)
    return
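# Illustration only (not part of the pipeline): the projected-parquet-size
# formula used in the amoeba ingest loop above. The helper name, the ratio, and
# the size estimate are made-up placeholders; in the real code the estimate
# comes from the running dataframe dictionary (len(df)).
def _example_projected_size_mb():
    compression_ratio = 0.2            # stand-in for c.KEY_CSV_PARQUET_COMPRESSION_RATIO
    current_dataframe_size = 52428800  # stand-in for the in-memory size estimate
    projected_mb = (compression_ratio * current_dataframe_size) / 1048576.0
    print projected_mb                 # 10.0 MB projected compressed file size
    return projected_mb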