def handoff_event_to_emitter(context, bucket, key, events):
    # Note: the incoming bucket argument is replaced with the project configuration bucket.
    bucket = os.environ["ProjectConfigurationBucket"]
    lmdclient = Lambda(context)
    s3client = S3(context, bucket)
    parts = KeyParts(key, context[c.KEY_SEPERATOR_PARTITION])
    key = "deployment/share/emitted_event_payloads/{}/{}/{}/{}".format(
        parts.source,
        parts.event,
        parts.datetime,
        parts.filename.replace(parts.extension, 'json'))
    payload = {
        'emitted': {
            'key': key,
            'bucket': bucket,
            'type': parts.event,
            'source': parts.source,
            'buildid': parts.buildid,
            'filename': parts.filename.replace(parts.extension, 'json'),
            'datetime': parts.datetime,
            'datetimeformat': util.partition_date_format(),
            'sensitivitylevel': parts.sensitivity_level
        }
    }
    # Create a temporary file for the event emitter to read.
    expires = datetime.datetime.utcnow() + datetime.timedelta(minutes=30)
    s3client.put_object(key, events.to_json(orient='records'), expires)
    resp = lmdclient.invoke(os.environ[c.ENV_EVENT_EMITTER], payload)
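
# --- Sketch (not part of the original module) ---------------------------------
# A minimal illustration of the handoff above using boto3 directly, with a
# hypothetical bucket/key/payload in place of the project's S3 and Lambda
# wrappers. Note that boto3's Expires argument only sets the object's Expires
# HTTP header; removal of the temporary payload object would normally be
# handled by a bucket lifecycle rule.
import datetime
import json

import boto3


def handoff_sketch(bucket, key, records, emitter_function_name):
    # Stage the serialized events where the emitter lambda can read them.
    boto3.client('s3').put_object(
        Bucket=bucket,
        Key=key,
        Body=json.dumps(records),
        Expires=datetime.datetime.utcnow() + datetime.timedelta(minutes=30))
    # Asynchronously invoke the emitter with a pointer to the staged object.
    boto3.client('lambda').invoke(
        FunctionName=emitter_function_name,
        InvocationType='Event',
        Payload=json.dumps({'emitted': {'bucket': bucket, 'key': key}}))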
def ingest(self):
    debug_print("Ingesting directory {}".format(self.directory))
    debug_print("Ingesting the files \n{}".format(self.files))
    is_lambda = self.context[c.KEY_LAMBDA_FUNCTION] is not None
    timeout = self.__calculate_aggregate_window_timeout(self.context[c.KEY_MAX_LAMBDA_TIME])
    target_excretion_size = self.context[c.KEY_TARGET_AGGREGATION_FILE_SIZE_IN_MB]
    compression_ratio = self.context[c.KEY_CSV_PARQUET_COMPRESSION_RATIO]
    sep = self.context[c.KEY_SEPERATOR_PARTITION]
    memory_trigger = self.context[c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER]
    memory_used = mutil.get_memory_usage()
    main_filename, main_file_data, main_file_size_mb = self.__get_main_aggregate_file(self.directory, sep, target_excretion_size)
    main_file_data = self.__append(None, main_file_data)
    keys_ingested = []
    for file in self.files:
        debug_print("\tProcessing file {}".format(file))
        key_parts = KeyParts(file, sep)
        duration = datetime.datetime.utcnow() - key_parts.filename_timestamp
        if duration.total_seconds() < 300:
            debug_print("The file '{}' is {}s old. It is too new and will be processed later to allow for S3 propagation.".format(file, duration.total_seconds()))
            continue
        keys_ingested.append(file)
        data = self.__open(file, main_file_data)
        if data is None:
            continue
        size_in_megabytes = self.__size(file)
        main_file_data = self.__append(main_file_data, data)
        del data
        gc.collect()
        current_dataframe_size = sys.getsizeof(main_file_data)
        # Break conditions:
        #   1. Memory limit exceeded
        #   2. Time window exceeded
        #   3. Target excretion size hit
        main_file_size_mb += size_in_megabytes
        memory_used = mutil.get_memory_usage()
        debug_print("\t\tSize on S3: {}MB Size of new dataset: {} bytes Estimated Compression Ratio: {} Memory Used: {}% Projected Compression Size: {}MB Target Excretion Size: {}MB".format(
            size_in_megabytes, current_dataframe_size, compression_ratio, memory_used, main_file_size_mb, target_excretion_size))
        if util.elapsed(self.context) > timeout or memory_used > memory_trigger or main_file_size_mb > target_excretion_size:
            print "Elapsed", util.elapsed(self.context), "Start:", self.starttime, "Timeout:", timeout, "Has timed out:", util.elapsed(self.context) > timeout, "Mem Used %:", memory_used, "Max Memory %:", memory_trigger
            break

    # Only save the files if we have a reasonable amount of time remaining before the lambda timeout.
    debug_print("Time remaining: {}s".format(util.time_remaining(self.context)))
    debug_print("There were {} keys ingested. The keys ingested are: \n{}".format(len(keys_ingested), keys_ingested))
    if len(keys_ingested) > 0 and util.time_remaining(self.context) > c.SAVE_WINDOW_IN_SECONDS and not main_file_data.empty:
        main_file_data = self.__convert_to_submission_df(main_file_data)
        gc.collect()
        self.__excret(self.directory, main_filename, main_file_data, sep)
        self.__delete_keys(keys_ingested)
    elif util.time_remaining(self.context) <= c.SAVE_WINDOW_IN_SECONDS:
        print "Time has run out! We have less than {} seconds remaining before this lambda times out. Abandoning the S3 commit to avoid file corruption.".format(c.SAVE_WINDOW_IN_SECONDS)
        print "Aggregation window (Max Lambda Execution Time * {}): {} seconds".format(c.RATIO_OF_MAX_LAMBDA_TIME, timeout)
        print "S3 save window: {} seconds".format(c.SAVE_WINDOW_IN_SECONDS)
        print "Lambda time remaining: {} seconds".format(util.time_remaining(self.context))
        remaining_files = list(set(self.files) - set(keys_ingested))
        if len(remaining_files) > 0:
            debug_print("Re-adding the {} paths to SQS to attempt again.\nThe paths are \n{}".format(len(remaining_files), remaining_files))
            self.__add_to_sqs(remaining_files)
    print "I've consumed everything I can in directory '{}'".format(self.directory)
    return
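
# --- Sketch (not part of the original module) ---------------------------------
# The aggregation loop above stops on any of three conditions: the aggregation
# window has elapsed, memory usage has crossed the flush trigger, or the
# aggregate has reached the target file size. A standalone restatement of that
# decision, with illustrative values:
def should_stop_aggregating(elapsed_seconds, timeout_seconds,
                            memory_used_percent, memory_trigger_percent,
                            aggregate_size_mb, target_size_mb):
    """Return True when the ingest loop should flush and exit."""
    return (elapsed_seconds > timeout_seconds
            or memory_used_percent > memory_trigger_percent
            or aggregate_size_mb > target_size_mb)


# Example: 240s elapsed of a 300s window, 82% memory used with an 80% trigger.
assert should_stop_aggregating(240, 300, 82, 80, 64, 128)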
def write(bucket, key, data, sep, object_encoding, append=False):
    if data.empty:
        raise RuntimeError(
            "[{}]An attempt to write an empty dataset has occurred. The requested dataset was: {}".format(error.Error.empty_dataframe(), data))

    sensitivity_type = KeyParts(key, sep).sensitivity_level.lower()
    s3 = s3fsmap[sensitivity_type]
    s3_open = s3.open

    size_before_dup_drop = len(data)
    data.drop_duplicates(inplace=True)
    size_after_dup_drop = len(data)
    if size_before_dup_drop - size_after_dup_drop > 0:
        print "{} duplicates have been dropped".format(size_before_dup_drop - size_after_dup_drop)

    util.debug_print("Using object encoding {}".format(object_encoding))
    path = '{}{}'.format(bucket, key)
    pwrite(path, data, open_with=s3_open, compression='GZIP', append=append, has_nulls=True, object_encoding=object_encoding)
    return path
def write(bucket, key, data, sep, object_encoding):
    if data.empty:
        raise RuntimeError(
            "[{}]An attempt to write an empty dataset has occurred. The requested dataset was: {}".format(error.Error.empty_dataframe(), data))

    sensitivity_type = KeyParts(key, sep).sensitivity_level.lower()
    s3 = s3fsmap[sensitivity_type]
    s3_open = s3.open
    path = '{}{}'.format(bucket, key)
    pwrite(path, data, open_with=s3_open, compression='GZIP', append=False, has_nulls=True, object_encoding=object_encoding)
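
# --- Sketch (not part of the original modules) --------------------------------
# Both write() variants delegate to pwrite, assumed here to be fastparquet's
# write function. fastparquet accepts object_encoding either as a single string
# or as a per-column mapping, which is presumably what m_schema.object_encoding
# produces. A local-disk sketch with hypothetical column names:
import pandas as pd
from fastparquet import write as pwrite


def write_local_sketch(path, data):
    # Encode string columns as UTF-8 and a nested payload column as JSON.
    encodings = {'event_name': 'utf8', 'payload': 'json'}
    pwrite(path, data.drop_duplicates(), compression='GZIP',
           has_nulls=True, object_encoding=encodings)


# write_local_sketch('/tmp/events.parquet',
#                    pd.DataFrame({'event_name': ['sessionstart'], 'payload': [{'a': 1}]}))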
def crawl_from_relative(self, prefix):
    # Combine only files for the past two days. Older files should already be fully aggregated.
    start = datetime.datetime.utcnow() - datetime.timedelta(days=2)
    crawl_paths = {}
    for page in self.__s3.list(prefix):
        if "Contents" in page:
            for obj in page["Contents"]:
                key = obj['Key']
                parts = KeyParts(key, self.__sep)
                event_date = datetime.datetime(parts.year, parts.month, parts.day, parts.hour)
                if event_date >= start:
                    path = self.__sep.join(parts.path.split(self.__sep)[:-1])
                    if path not in crawl_paths:
                        crawl_paths[path] = []
                    crawl_paths[path].append(parts.filename)
    # Assign an amoeba generator per identified path.
    return crawl_paths
def crawl(self, prefix, lambda_pool, func, depth=10):
    # Combine only files for the recent window. Older files should already be fully aggregated.
    print "Locating paths to crawl on bucket '{}' with prefix '{}'".format(self.__bucket, prefix)
    crawled_paths = {}
    idx = 0
    count = 0
    for page in self.__s3.list(prefix=prefix):
        if "Contents" in page:
            for obj in page["Contents"]:
                key = obj['Key']
                parts = KeyParts(key, self.__sep)
                path = self.__sep.join(parts.path.split(self.__sep)[:depth])
                if path not in crawled_paths:
                    crawled_paths[path] = True
                    func(self.__context, path, lambda_pool[idx])
                    count += 1
                    idx += 1
                    if idx >= len(lambda_pool):
                        idx = 0
    print "Path scouting complete on bucket '{}'".format(self.__bucket)
    return count
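
# --- Sketch (not part of the original module) ---------------------------------
# crawl() deduplicates S3 key prefixes to a fixed depth and hands each new
# prefix to the next worker in lambda_pool, round-robin. The same dispatch
# pattern in isolation, with a stub callback and hypothetical prefix names:
def dispatch_round_robin(paths, pool, func, depth=10, sep='/'):
    seen = {}
    idx = 0
    count = 0
    for path in paths:
        truncated = sep.join(path.split(sep)[:depth])
        if truncated in seen:
            continue
        seen[truncated] = True
        func(truncated, pool[idx])
        count += 1
        idx = (idx + 1) % len(pool)
    return count


# Example: three keys collapse to two depth-2 prefixes spread over two workers.
example_keys = ['table=a/p=1/f1.parquet', 'table=a/p=1/f2.parquet', 'table=b/p=2/f3.parquet']
assert dispatch_round_robin(example_keys, ['worker-0', 'worker-1'],
                            lambda prefix, worker: None, depth=2) == 2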
def ingest(event, lambdacontext):
    starttime = time.time()
    gc.collect()
    root = event.get("root", None)
    print "Initial memory size:", mutil.get_memory_object()
    print "Started amoeba with root {}".format(root)
    context = event.get("context", {})
    context[c.KEY_LAMBDA_FUNCTION] = lambdacontext.function_name if hasattr(lambdacontext, 'function_name') else None
    context[c.KEY_START_TIME] = starttime
    is_lambda = context[c.KEY_LAMBDA_FUNCTION] is not None
    bucket = os.environ[c.ENV_S3_STORAGE]
    crawler = Crawler(context, bucket)
    roots = crawler.crawl_from_relative(root)
    s3_fs = s3fs.S3FileSystem()
    s3 = S3(context, bucket)
    timeout = calculate_aggregate_window_timeout(context[c.KEY_MAX_LAMBDA_TIME])
    target_excretion_size = context[c.KEY_TARGET_AGGREGATION_FILE_SIZE_IN_MB]
    compression_ratio = context[c.KEY_CSV_PARQUET_COMPRESSION_RATIO]
    sep = context[c.KEY_SEPERATOR_PARTITION]
    memory_used = mutil.get_memory_usage()
    projected_compressed_file_size_in_mb = 0
    print "Hunting for {} seconds in bucket '{}'".format(timeout, bucket)
    for path in roots:
        # The GLUE Crawler does not work well when a single key in S3 contains varying data schemas.
        files = roots[path]
        if len(files) == 1:
            continue
        debug_print("\t\tIngesting path {}".format(path))
        df = {}
        keys_ingested = []
        data = None
        for file in files:
            debug_print("\t\t\t{}".format(file))
            key = "{}/{}".format(path, file)
            try:
                size_in_megabytes = s3.size_in_megabytes(key)
            except ClientError as e:
                if str(e.response['Error']['Code']) == '404':
                    continue
                else:
                    print "Error: ", e.response['Error']['Code'], key
                    raise e
            if size_in_megabytes > target_excretion_size:
                debug_print("Skipping file '{}'. It has reached the targeted file size.".format(key))
                continue
            size_in_bytes = size_in_megabytes * 1024 * 1024
            try:
                data = reader.read(s3_fs, bucket, key)
                keys_ingested.append(key)
            except ClientError as e:
                print e.response['Error']['Code'], "key=>", key
                # Handle corrupt files; this can happen if a write did not finish correctly.
                if e.message == "Seek before start of file":
                    print "Deleting corrupt file %s" % key
                    s3.delete([key])
                elif e.response['Error']['Code'] == 'NoSuchKey':
                    print '{}: for key {}'.format(e.response['Error']['Code'], key)
                else:
                    util.logger.error(e)
                continue
            for row in data.itertuples(index=True):
                row = row.__dict__
                del row['Index']
                key_parts = KeyParts(key, sep)
                uuid_key = "{}{}{}".format(row[c.PRIMARY_KEY], key_parts.event, row[c.TERTIARY_KEY])
                df_size = len(row)
                debug_print("\t\t\tSize on S3 in MB: {} Size as Dataframe: {} Ratio: {}".format(size_in_megabytes, df_size, compression_ratio))
                # A dictionary is the fastest way to build a unique set.
                if uuid_key in df:
                    debug_print("\t\t\tFound duplication in key '{}'. Keeping the first occurrence.".format(key))
                else:
                    df[uuid_key] = row
                current_dataframe_size = len(df)
                # Break conditions:
                #   1. Memory limit exceeded
                #   2. Time window exceeded
                #   3. Target excretion size hit
                projected_compressed_file_size_in_mb = (compression_ratio * current_dataframe_size) / 1048576.0
                memory_used = mutil.get_memory_usage()
                debug_print("\t\t\t{} seconds have elapsed. {} kilobytes of memory have been used. The projected compressed file size is {} MB. We are targeting an excretion file size of {} MB.".format(
                    util.elapsed(context), memory_used / 1024, projected_compressed_file_size_in_mb, target_excretion_size))
                if util.elapsed(context) > timeout or memory_used > context[c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER] or projected_compressed_file_size_in_mb > target_excretion_size:
                    break
            if util.elapsed(context) > timeout or memory_used > context[c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER] or projected_compressed_file_size_in_mb > target_excretion_size:
                print "Elapsed", util.elapsed(context), "Start:", context[c.KEY_START_TIME], "Timeout:", timeout, "Has timed out:", util.elapsed(context) > timeout, "Mem Used:", memory_used, "Max Memory %:", context[c.KEY_AMOEBA_MEMORY_FLUSH_TRIGGER]
                break
        if len(keys_ingested) > 0 and util.time_remaining(context) > 45:
            values = df.values()
            # Since the schema hashes of the event columns are all the same, we can infer that values[0].keys() has the same column headers as the rest.
            columns = values[0].keys()
            dataset = pd.DataFrame(values, columns=columns)
            excret(s3, bucket, path, dataset, keys_ingested, sep, m_schema.object_encoding(columns))
            del dataset
        elif util.time_remaining(context) <= 45:
            return
        del data
        del df
        del keys_ingested
        if util.elapsed(context) > timeout or mutil.get_process_memory_usage_bytes() >= c.ONE_GB_IN_BYTES:
            print "\tThe elapsed time threshold of {} seconds has been hit or the memory threshold of {} megabytes has been hit. Time: {}s, Memory: {}MB".format(
                timeout, c.ONE_GB_IN_BYTES / 1048576.0, util.elapsed(context), mutil.get_process_memory_usage_megabytes())
            return
    print "I've consumed everything I can in bucket '{}'".format(bucket)
    return
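
# --- Sketch (not part of the original module) ---------------------------------
# The row loop above deduplicates events by keying a dict on
# primary key + event name + tertiary key, keeping the first occurrence, and
# projects the compressed output size as
# (compression ratio * number of unique rows) / 2**20. Restated with
# hypothetical column names:
def dedupe_and_project(rows, event_name, compression_ratio):
    unique = {}
    for row in rows:
        uuid_key = "{}{}{}".format(row['uid'], event_name, row['seqno'])
        unique.setdefault(uuid_key, row)  # keep the first occurrence
    projected_mb = (compression_ratio * len(unique)) / 1048576.0
    return unique, projected_mb


sample_rows = [{'uid': 'p1', 'seqno': 1}, {'uid': 'p1', 'seqno': 1}, {'uid': 'p2', 'seqno': 7}]
unique_rows, projected_mb = dedupe_and_project(sample_rows, 'sessionstart', compression_ratio=0.1)
assert len(unique_rows) == 2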