def lambda_handler(event, context):
    """Flag whether the file being processed is the dataset's manifest file.

    The file name (last path segment of ``event['body']['key']``) is matched
    against ``event['body']['manifest_details']['regex_pattern']`` with
    ``re.match`` and the outcome is written back into the event.

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- The input event, with ``event['body']['is_manifest_file']``
            set to the string "True" or "False"
    """
    # Stays None until the Octagon client is built so that the error handler
    # does not trip over an unbound name when the failure happens earlier
    # (e.g. a missing key in the event).
    octagon_client = None
    try:
        logger.info("Fetching event data from previous step")
        body = event['body']
        # Several of these values are not used below; the lookups are kept so
        # that a payload missing any of them still fails with KeyError.
        team = body['team']
        pipeline = body['pipeline']
        stage = body['pipeline_stage']
        dataset = body['dataset']
        peh_id = body['peh_id']
        env = body['env']
        manifest_flag = body['manifest_enabled']
        manifest_file_pattern = body['manifest_details']['regex_pattern']
        manifest_file_timeout = body['manifest_details']['manifest_timeout']
        manifest_datafile_timeout = body['manifest_details'][
            'manifest_data_timeout']
        input_file_name = body['key'].split('/')[-1]

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (
            octagon.OctagonClient()
            .with_run_lambda(True)
            .with_configuration_instance(env)
            .build()
        )
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        # Check if the file being processed is the manifest file.
        # The flag is stored as a string, not a bool.
        match = re.match(manifest_file_pattern, input_file_name)
        body['is_manifest_file'] = "True" if match else "False"

        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # stage and component are always bound once octagon_client is set.
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: {}".format(
                    stage, component, repr(e)))
        raise
    return event
def lambda_handler(event, context):
    """Start a pipeline execution and attach the dataset's manifest settings.

    Reads the transform-table item keyed by ``"<team>-<dataset>"`` and copies
    its ``manifest_enabled`` and ``manifest_details`` entries into the event.

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- The input event, with ``manifest_enabled``,
            ``manifest_details`` and ``peh_id`` added under ``body``
    """
    # Stays None until the Octagon client is built so that the error handler
    # does not raise on an unbound name when the failure happens earlier.
    octagon_client = None
    try:
        logger.info('Fetching event data from previous step')
        body = event['body']
        # bucket and keys_to_process are not used below; the lookups are kept
        # so that a payload missing them still fails with KeyError.
        bucket = body['bucket']
        keys_to_process = body['keysToProcess']
        team = body['team']
        pipeline = body['pipeline']
        stage = body['pipeline_stage']
        dataset = body['dataset']
        ddb_key = team + "-" + dataset

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (
            octagon.OctagonClient()
            .with_run_lambda(True)
            .with_configuration_instance(body['env'])
            .build()
        )
        peh_id = octagon_client.start_pipeline_execution(
            pipeline_name='{}-{}-stage-{}'.format(
                team, pipeline, stage[-1].lower()),
            dataset_name='{}-{}'.format(team, dataset),
            comment=event)

        dynamo_interface = DynamoInterface(DynamoConfiguration())
        response = dynamo_interface.get_transform_table_item(ddb_key)
        logger.info("Querying DynamoDB to check for manifest details")
        body["manifest_enabled"] = response["manifest_enabled"]
        body["manifest_details"] = response["manifest_details"]

        body['peh_id'] = peh_id
        remove_content_tmp()
        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # stage and component are always bound once octagon_client is set.
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: {}".format(
                    stage, component, repr(e)))
        remove_content_tmp()
        raise
    return event
def lambda_handler(event, context):
    """Start a pipeline execution and call the user's custom transform.

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        The value returned by the user transform's ``transform_object``, with
        a ``peh_id`` entry added
    """
    # Stays None until the Octagon client is built so that the error handler
    # does not raise on an unbound name when the failure happens earlier.
    octagon_client = None
    try:
        logger.info('Fetching event data from previous step')
        body = event['body']
        bucket = body['bucket']
        team = body['team']
        pipeline = body['pipeline']
        # Every 'A' in the incoming stage name is replaced by 'B' before the
        # stage is used for the execution name and the transform lookup.
        stage = body['pipeline_stage'].replace('A', 'B')
        dataset = body['dataset']

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (
            octagon.OctagonClient()
            .with_run_lambda(True)
            .with_configuration_instance(body['env'])
            .build()
        )
        peh_id = octagon_client.start_pipeline_execution(
            pipeline_name='{}-{}-stage-{}'.format(
                team, pipeline, stage[-1].lower()),
            dataset_name='{}-{}'.format(team, dataset),
            comment=event
        )

        # Call custom transform created by user and process the file
        logger.info('Calling user custom processing code')
        transform_handler = TransformHandler().stage_transform(
            team, dataset, stage)
        # The whole event body is handed to the transform in the position
        # where other stages pass the key(s).
        response = transform_handler().transform_object(
            bucket, body, team, dataset)
        response['peh_id'] = peh_id
        remove_content_tmp()
        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # stage and component are always bound once octagon_client is set.
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: {}".format(
                    stage, component, repr(e)))
        remove_content_tmp()
        raise
    return response
def lambda_handler(event, context):
    """Attach the dataset's manifest settings to the event.

    Reads the transform-table item keyed by ``"<team>-<dataset>"`` and copies
    its ``manifest_enabled`` and ``manifest_details`` entries into the event.

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- The input event, with ``manifest_enabled`` and
            ``manifest_details`` added under ``body``
    """
    # Stays None until the Octagon client is built so that the error handler
    # does not raise on an unbound name when the failure happens earlier.
    octagon_client = None
    try:
        logger.info("Fetching event data from previous step")
        body = event['body']
        team = body['team']
        # pipeline is not used below; the lookup is kept so that a payload
        # missing it still fails with KeyError.
        pipeline = body['pipeline']
        stage = body['pipeline_stage']
        dataset = body['dataset']
        peh_id = body['peh_id']
        env = body['env']
        ddb_key = team + "-" + dataset

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (
            octagon.OctagonClient()
            .with_run_lambda(True)
            .with_configuration_instance(env)
            .build()
        )
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        logger.info('Initializing DynamoDB config and Interface')
        dynamo_interface = DynamoInterface(DynamoConfiguration())
        response = dynamo_interface.get_transform_table_item(ddb_key)
        body["manifest_enabled"] = response["manifest_enabled"]
        body["manifest_details"] = response["manifest_details"]

        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # stage and component are always bound once octagon_client is set.
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: {}".format(
                    stage, component, repr(e)))
        raise
    return event
def lambda_handler(event, context):
    """Call the user's custom job waiter and report a failed job to Octagon.

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        The value returned by the user transform's ``check_job_status``, with
        a ``peh_id`` entry copied from the event
    """
    # Stays None until the Octagon client is built so that the error handler
    # does not raise on an unbound name when the failure happens earlier.
    octagon_client = None
    try:
        logger.info('Fetching event data from previous step')
        body = event['body']
        bucket = body['bucket']
        keys_to_process = body['key']
        team = body['team']
        stage = body['pipeline_stage']
        dataset = body['dataset']
        job_details = body['job']['jobDetails']
        processed_keys_path = body['job']['processedKeysPath']

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (
            octagon.OctagonClient()
            .with_run_lambda(True)
            .with_configuration_instance(body['env'])
            .build()
        )

        logger.info('Checking Job Status with user custom code')
        transform_handler = TransformHandler().stage_transform(
            team, dataset, stage)
        response = transform_handler().check_job_status(
            bucket, keys_to_process, processed_keys_path, job_details)
        response['peh_id'] = body['peh_id']

        # The status is read from the event's jobDetails after the user code
        # has run; a FAILED job closes the execution but the handler still
        # returns normally.
        if body['job']['jobDetails']['jobStatus'] == 'FAILED':
            peh.PipelineExecutionHistoryAPI(
                octagon_client).retrieve_pipeline_execution(
                    response['peh_id'])
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: Check Job Logs".format(
                    stage, component))
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # stage and component are always bound once octagon_client is set.
        if octagon_client is not None:
            peh.PipelineExecutionHistoryAPI(
                octagon_client).retrieve_pipeline_execution(
                    event['body']['peh_id'])
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: {}".format(
                    stage, component, repr(e)))
        raise
    return response
def lambda_handler(event, context):
    """Call the user's custom transform and track it in the manifests table.

    On success the manifests control table row identified by
    ``event['body']['manifest_ddb_key']`` is set to "PROCESSING"; on failure
    it is set to "FAILED".

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        The value returned by the user transform's ``transform_object``
    """
    # Both stay None until they are read/built so that the error handler does
    # not raise on an unbound name when the failure happens earlier.
    octagon_client = None
    ddb_key = None
    try:
        logger.info('Fetching event data from previous step')
        body = event['body']
        bucket = body['bucket']
        key = body['key']
        team = body['team']
        stage = body['pipeline_stage']
        dataset = body['dataset']
        ddb_key = body['manifest_ddb_key']

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (
            octagon.OctagonClient()
            .with_run_lambda(True)
            .with_configuration_instance(body['env'])
            .build()
        )
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(body['peh_id'])

        # Call custom transform created by user and process the file
        logger.info('Calling user custom processing code')
        transform_handler = TransformHandler().stage_transform(
            team, dataset, stage)
        response = transform_handler().transform_object(
            bucket, key, team, dataset)
        remove_content_tmp()
        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
        # dynamo_interface is not defined in this function; it is expected to
        # be available at module level.
        dynamo_interface.update_manifests_control_table_stagea(
            ddb_key, "PROCESSING", response[0])
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # stage and component are always bound once octagon_client is set.
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: {}".format(
                    stage, component, repr(e)))
        remove_content_tmp()
        if ddb_key is not None:
            dynamo_interface.update_manifests_control_table_stagea(
                ddb_key, "FAILED")
        raise
    return response
def lambda_handler(event, context):
    """Start a pipeline execution and store the object's metadata.

    Arguments:
        event -- JSON document describing the object; it is decoded with
            ``json.loads``
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- ``{'statusCode': 200, 'body': <decoded metadata>}`` where
            the metadata carries the new ``peh_id``
    """
    # Stays None until the Octagon client is built so that the error handler
    # does not raise on an unbound name when the failure happens earlier
    # (e.g. the event is not valid JSON).
    octagon_client = None
    try:
        logger.info('Fetching event data from previous step')
        object_metadata = json.loads(event)
        stage = object_metadata['pipeline_stage']

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (
            octagon.OctagonClient()
            .with_run_lambda(True)
            .with_configuration_instance(object_metadata['env'])
            .build()
        )
        object_metadata['peh_id'] = octagon_client.start_pipeline_execution(
            pipeline_name='{}-{}-stage-{}'.format(
                object_metadata['team'],
                object_metadata['pipeline'],
                stage[-1].lower()),
            dataset_name='{}-{}'.format(
                object_metadata['team'], object_metadata['dataset']),
            comment=event)
        # Add business metadata (e.g. object_metadata['project'] = 'xyz')

        logger.info('Initializing DynamoDB config and Interface')
        dynamo_interface = DynamoInterface(DynamoConfiguration())

        logger.info('Storing metadata to DynamoDB')
        dynamo_interface.update_object_metadata_catalog(object_metadata)

        logger.info(
            'Passing arguments to the next function of the state machine')
        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # stage and component are always bound once octagon_client is set.
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: {}".format(
                    stage, component, repr(e)))
        raise
    return {'statusCode': 200, 'body': object_metadata}
def lambda_handler(event, context):
    """Start the dataset's post-stage Glue crawler.

    The crawler name is ``sdlf-<team>-<dataset>-post-stage-crawler``. A
    crawler that is already running is logged and not treated as an error.

    Arguments:
        event {dict} -- Dictionary with details on Bucket and Keys
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {int} -- 200 on success
    """
    # Stays None until the Octagon client is built so that the error handler
    # does not raise on an unbound name when the failure happens earlier.
    octagon_client = None
    try:
        logger.info('Fetching event data from previous step')
        body = event['body']
        team = body['team']
        stage = body['pipeline_stage']
        dataset = body['dataset']

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (
            octagon.OctagonClient()
            .with_run_lambda(True)
            .with_configuration_instance(body['env'])
            .build()
        )
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(
                body['job']['peh_id'])

        crawler_name = '-'.join(['sdlf', team, dataset, 'post-stage-crawler'])
        logger.info('Starting Crawler {}'.format(crawler_name))
        try:
            client.start_crawler(Name=crawler_name)
        except client.exceptions.CrawlerRunningException:
            logger.info('Crawler is already running')

        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # stage and component are always bound once octagon_client is set.
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: {}".format(
                    stage, component, repr(e)))
        raise
    return 200
def lambda_handler(event, context):
    """Process the manifest file and load its entries into DynamoDB.

    Each non-blank line of the manifest is reduced to its last path segment
    and written to the manifests control table; the manifest itself is then
    copied to ``pre-stage/<team>/manifests/<dataset>/`` in the stage bucket.

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {list} -- Single-element list with the S3 key of the copied manifest
    """
    s3_interface = S3Interface()
    stage_bucket = S3Configuration().stage_bucket
    dynamo_interface = DynamoInterface(DynamoConfiguration())
    # Stays None until the Octagon client is built so that the error handler
    # does not raise on an unbound name when the failure happens earlier.
    octagon_client = None
    try:
        logger.info("Fetching event data from previous step")
        body = event['body']
        team = body['team']
        # pipeline is not used below; the lookup is kept so that a payload
        # missing it still fails with KeyError.
        pipeline = body['pipeline']
        stage = body['pipeline_stage']
        dataset = body['dataset']
        peh_id = body['peh_id']
        env = body['env']
        bucket = body['bucket']
        manifest_file_key = body['key']
        manifest_file_name = manifest_file_key.split("/")[-1]

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (
            octagon.OctagonClient()
            .with_run_lambda(True)
            .with_configuration_instance(env)
            .build()
        )
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        # Download the manifest file to local
        local_path = s3_interface.download_object(bucket, manifest_file_key)

        # Process the manifest file. Blank lines are skipped: they would
        # otherwise create a control-table row with an empty data file name.
        with open(local_path, "r") as raw_file:
            file_names = [
                line.strip().split("/")[-1]
                for line in raw_file
                if line.strip()
            ]

        # Load data into manifests control table
        for data_file_name in file_names:
            item = {
                "dataset_name":
                    team + "-" + dataset + "-" + manifest_file_name,
                "datafile_name": manifest_file_name + "-" + data_file_name,
            }
            dynamo_interface.put_item_in_manifests_control_table(item)

        # Set s3 path for Copy
        s3_path = 'pre-stage/{}/manifests/{}/{}'.format(
            team, dataset, manifest_file_name)
        kms_key = KMSConfiguration(team).get_kms_arn

        # Copy Manifest File to team/manifest/dataset location
        s3_interface.copy_object(
            bucket, manifest_file_key, stage_bucket, s3_path,
            kms_key=kms_key)

        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
        processed_keys = [s3_path]
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # stage and component are always bound once octagon_client is set.
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: {}".format(
                    stage, component, repr(e)))
        raise
    return processed_keys
def lambda_handler(event, context):
    """Catalog the processed objects and close out the manifest.

    Stores metadata for every object under the job's processed-keys path,
    marks the manifest's control-table rows "COMPLETED", moves the manifest
    file (``keysToProcess[0]``) to
    ``post-stage/<team>/manifests/<dataset>/`` and ends the pipeline
    execution successfully. On failure the control-table rows collected so
    far are marked "FAILED".

    Arguments:
        event {dict} -- Dictionary with details on Bucket and Keys
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {int} -- 200 on success
    """
    # Safe defaults so the error handler never touches an unbound name when
    # the failure happens before these are populated.
    octagon_client = None
    ddb_keys = []
    try:
        logger.info('Fetching event data from previous step')
        body = event['body']
        # One interface instance is reused for every S3 call below.
        s3_interface = S3Interface()
        bucket = body['bucket']
        processed_keys_path = body['job']['processedKeysPath']
        processed_keys = s3_interface.list_objects(
            bucket, processed_keys_path)
        team = body['team']
        pipeline = body['pipeline']
        stage = body['pipeline_stage']
        dataset = body['dataset']
        peh_id = body['job']['peh_id']
        keys_to_process = body['keysToProcess']
        manifest_key = keys_to_process[0]
        s3_path = "post-stage/{}/manifests/{}/{}".format(
            team, dataset, manifest_key.split("/")[-1])

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (
            octagon.OctagonClient()
            .with_run_lambda(True)
            .with_configuration_instance(body['env'])
            .build()
        )
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        logger.info('Initializing DynamoDB config and Interface')
        dynamo_interface = DynamoInterface(DynamoConfiguration())

        logger.info('Storing metadata to DynamoDB')
        for key in processed_keys:
            object_metadata = {
                'bucket': bucket,
                'key': key,
                'size': s3_interface.get_size(bucket, key),
                'last_modified_date':
                    s3_interface.get_last_modified(bucket, key),
                'org': body['org'],
                'app': body['app'],
                'env': body['env'],
                'team': team,
                'pipeline': pipeline,
                'dataset': dataset,
                'stage': 'stage',
                'pipeline_stage': stage,
                'peh_id': peh_id,
            }
            dynamo_interface.update_object_metadata_catalog(object_metadata)

        logger.info("Updating manifests control table")
        items = get_manifest_data(bucket, team, dataset, manifest_key)
        ddb_keys = get_ddb_keys(items)
        for ddb_key in ddb_keys:
            dynamo_interface.update_manifests_control_table_stageb(
                ddb_key, "COMPLETED")

        logger.info("Move manifest file to post stage")
        kms_key = KMSConfiguration(team).get_kms_arn
        s3_interface.copy_object(
            bucket, manifest_key, bucket, s3_path, kms_key=kms_key)

        logger.info("Removing manifest file from pre-stage")
        s3_interface.delete_objects(bucket, manifest_key)

        # Only uncomment if a queue for the next stage exists
        # logger.info('Sending messages to next SQS queue if it exists')
        # sqs_config = SQSConfiguration(team, dataset, ''.join([stage[:-1], chr(ord(stage[-1]) + 1)]))
        # sqs_interface = SQSInterface(sqs_config.get_stage_queue_name)
        # sqs_interface.send_batch_messages_to_fifo_queue(processed_keys, 10, '{}-{}'.format(team, dataset))

        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
        octagon_client.end_pipeline_execution_success()
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # stage and component are always bound once octagon_client is set.
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: {}".format(
                    stage, component, repr(e)))
        # ddb_keys is only non-empty after dynamo_interface has been created.
        for ddb_key in ddb_keys:
            dynamo_interface.update_manifests_control_table_stageb(
                ddb_key, "FAILED", None, "Failed in Post Update")
        raise
    return 200
def lambda_handler(event, context):
    """Catalog the processed objects and queue them for the next stage.

    Stores metadata for every key in ``event['body']['processedKeys']``,
    looks up the workload-management priority for the dataset, sends the keys
    to the next stage's queue and ends the pipeline execution successfully.

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {int} -- 200 on success
    """
    # Stays None until the Octagon client is built so that the error handler
    # does not raise on an unbound name when the failure happens earlier.
    octagon_client = None
    try:
        logger.info('Fetching event data from previous step')
        body = event['body']
        processed_keys = body['processedKeys']
        team = body['team']
        pipeline = body['pipeline']
        stage = body['pipeline_stage']
        dataset = body['dataset']
        peh_id = body['peh_id']

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (
            octagon.OctagonClient()
            .with_run_lambda(True)
            .with_configuration_instance(body['env'])
            .build()
        )
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        logger.info('Initializing DynamoDB config and Interface')
        dynamo_interface = DynamoInterface(DynamoConfiguration())

        logger.info('Storing metadata to DynamoDB')
        bucket = S3Configuration().stage_bucket
        # One interface instance is reused for every key.
        s3_interface = S3Interface()
        for key in processed_keys:
            object_metadata = {
                'bucket': bucket,
                'key': key,
                'size': s3_interface.get_size(bucket, key),
                'last_modified_date':
                    s3_interface.get_last_modified(bucket, key),
                'org': body['org'],
                'app': body['app'],
                'env': body['env'],
                'team': team,
                'pipeline': pipeline,
                'dataset': dataset,
                'stage': 'stage',
                'pipeline_stage': stage,
                'peh_id': peh_id,
            }
            dynamo_interface.update_object_metadata_catalog(object_metadata)

        # Workload management: the priority is looked up under
        # "<team>-<dataset>-<parent folder of the first processed key>".
        wlm_ddb_table = dynamo_interface.wlm_control_table
        item = dynamo_interface.get_item(
            wlm_ddb_table, {
                "name": "{}-{}-{}".format(
                    team, dataset, processed_keys[0].split("/")[-2])
            })
        priority = item.get('priority')
        logger.info(priority)

        logger.info('Sending messages to next SQS queue if it exists')
        # The next stage is the current one with its last character advanced
        # by one (e.g. a trailing 'A' becomes 'B').
        next_stage = ''.join([stage[:-1], chr(ord(stage[-1]) + 1)])
        sqs_config = SQSConfiguration(team, dataset, next_stage, priority)
        sqs_interface = SQSInterface(sqs_config.get_stage_queue_name_wlm)
        sqs_interface.send_batch_messages_to_fifo_queue(
            processed_keys, 10, '{}-{}'.format(team, dataset))

        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
        octagon_client.end_pipeline_execution_success()
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # stage and component are always bound once octagon_client is set.
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: {}".format(
                    stage, component, repr(e)))
        raise
    return 200
def lambda_handler(event, context):
    """Compile Data to a CSV with Topic Model Output.

    Reads the ``doc-topics`` and ``topic-terms`` CSV files from the
    ``.tar.gz`` found under ``post-stage/<team>/<dataset>/``, joins them with
    the pre-stage metadata CSVs on ``docname`` and uploads four CSV files
    (training data, compiled topics, doc topics, topic terms) to post-stage.

    Arguments:
        event {dict} -- Dictionary with details on Bucket and Keys
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {int} -- 200 on success
    """
    # Stays None until the Octagon client is built so that the error handler
    # does not raise on an unbound name when the failure happens earlier.
    octagon_client = None
    try:
        # Get Information about the Step Function
        logger.info('Fetching event data from previous step')
        body = event['body']
        team = body['team']
        stage = body['pipeline_stage']
        dataset = body['dataset']
        bucket = body['bucket']

        # Start Connection to Octagon Client
        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (
            octagon.OctagonClient()
            .with_run_lambda(True)
            .with_configuration_instance(body['env'])
            .build()
        )
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(
                body['job']['peh_id'])

        logger.info('Starting to Compile Results')

        # Get the s3 location of the zipped topic model output. When several
        # keys contain ".tar.gz" the last one listed wins.
        key = "post-stage/{}/{}/".format(team, dataset)
        listing = client.list_objects_v2(Bucket=bucket, Prefix=key)
        for s3_obj in listing["Contents"]:
            if ".tar.gz" in s3_obj["Key"]:
                key = s3_obj["Key"]

        # Extract the Topic Model Data from the zipped file
        archive = client.get_object(Bucket=bucket, Key=key)
        archive_bytes = io.BytesIO(archive['Body'].read())

        # Read in both the Doc-Topics and Topic-Terms csv files:
        # doc-topics.csv (The topics for each abstract document)
        # topic-terms.csv (The terms associated to each topic)
        with tarfile.open(fileobj=archive_bytes) as tarf:
            csv_files = [
                member.name for member in tarf.getmembers()
                if member.name.endswith('.csv')
            ]
            for csv_name in csv_files:
                if "doc-topics" in csv_name:
                    doc_topics = pd.read_csv(
                        io.BytesIO(tarf.extractfile(csv_name).read()),
                        encoding='utf8')
                if "topic-terms" in csv_name:
                    topic_terms = pd.read_csv(
                        io.BytesIO(tarf.extractfile(csv_name).read()),
                        encoding='utf8')

        # Group All of the Topics as a List for Each Abstract Docname
        doc_topics_grouped = doc_topics.groupby("docname")["topic"].apply(
            list).reset_index(name='topic_list')

        # Group All of the Terms Associated to each of the Topics Found.
        # Terms are looked up by topic value rather than by row position, so
        # the result does not depend on topics being numbered 0..n-1.
        terms_by_topic = topic_terms.groupby("topic")["term"].apply(
            list).to_dict()

        # For Each Abstract Add a Column with the Associated Topic Terms
        main_list = []
        for topics in doc_topics_grouped['topic_list']:
            labels = []
            for topic in topics:
                labels.extend(terms_by_topic[topic])
            main_list.append(labels)
        doc_topics_grouped['term_list'] = main_list

        # Pull All the PreStage Metadata we Have for Each Abstract Document:
        # List csv Files in the Pre-stage Bucket
        key = "pre-stage/{}/{}/medical_data".format(team, dataset)
        response = client.list_objects_v2(Bucket=bucket, Prefix=key)

        # Combine All the Metadata into one Large Pandas DataFrame.
        # DataFrame.append was removed in pandas 2.0, so the frames are
        # collected and concatenated once.
        frames = []
        for contents in response['Contents']:
            if contents['Size'] > 0:
                obj = client.get_object(Bucket=bucket, Key=contents["Key"])
                frames.append(
                    pd.read_csv(io.BytesIO(obj['Body'].read()),
                                encoding='utf8'))
        metadata = pd.concat(frames, ignore_index=True)

        # IMPORTANT: Now we can merge the Topics and Terms
        # we found for each document with the existing Metadata
        doc_topics_final = pd.merge(metadata, doc_topics_grouped,
                                    on='docname')

        # Training data csv (topics and text only): the topics of each
        # document are joined with '|' into a single label string.
        label_list = [
            '|'.join(str(topic) for topic in topics)
            for topics in doc_topics_final["topic_list"]
        ]
        training_data = pd.DataFrame(
            list(zip(label_list, doc_topics_final["abstract"])),
            columns=['Labels', 'Abstracts'])

        # Get KMS Key to Encrypt Data
        kms_key = KMSConfiguration(team).get_kms_arn

        # Write Our DataFrames with Output to S3 Post-Stage.
        # Each entry: (frame, file name, S3 key, to_csv keyword arguments).
        outputs = [
            (training_data, "training_data.csv",
             "post-stage/{}/{}/multilabel_classification/{}".format(
                 team, dataset, "training_data.csv"),
             {'index': False, 'header': False}),
            (doc_topics_final, "compile_topics_data.csv",
             "post-stage/{}/{}/{}".format(
                 team, dataset, "compile_topics_data.csv"),
             {}),
            (doc_topics, "doc_topics.csv",
             "post-stage/{}/{}/topic_data/{}".format(
                 team, dataset, "doc_topics.csv"),
             {}),
            (topic_terms, "topic_terms.csv",
             "post-stage/{}/{}/topic_data/{}".format(
                 team, dataset, "topic_terms.csv"),
             {}),
        ]
        for frame, output_path, s3_path_key, csv_kwargs in outputs:
            frame.to_csv('/tmp/' + output_path, **csv_kwargs)
            # s3_interface is not defined in this function; it is expected
            # to be available at module level.
            s3_interface.upload_object('/tmp/' + output_path, bucket,
                                       s3_path_key, kms_key=kms_key)

        # Update Pipeline Execution in Octagon
        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # stage and component are always bound once octagon_client is set.
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: {}".format(
                    stage, component, repr(e)))
        raise
    return 200
def lambda_handler(event, context):
    """Catalog the processed objects and trigger dependent datasets.

    Stores metadata for, and tags, every object under the job's
    processed-keys path. It then looks up which datasets depend on the table
    that was just written (``dest_db.dest_table`` from the event) and, for
    each dependency entry whose usage is not "validate" and whose table
    matches, sends a message describing the dataset to the stage queue.

    Arguments:
        event {dict} -- Dictionary with details on Bucket and Keys
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {int} -- 200 on success
    """

    def replace_decimals(obj):
        """Recursively convert ``decimal.Decimal`` values to int or float.

        Lists and dicts are converted in place and returned; sets are
        rebuilt. Whole-number decimals become int, others float.
        """
        if isinstance(obj, list):
            for i, element in enumerate(obj):
                obj[i] = replace_decimals(element)
            return obj
        if isinstance(obj, dict):
            for k, v in obj.items():
                obj[k] = replace_decimals(v)
            return obj
        if isinstance(obj, set):
            return set(replace_decimals(i) for i in obj)
        if isinstance(obj, decimal.Decimal):
            return int(obj) if obj % 1 == 0 else float(obj)
        return obj

    def get_table_partitions(db, tbl):
        """Return the Glue partition keys of ``db.tbl`` (currently unused)."""
        glue_response = glue_client.get_table(DatabaseName=db, Name=tbl)
        logger.debug('Glue get_table response: {}'.format(glue_response))
        return glue_response['Table']['PartitionKeys']

    # Stays None until the Octagon client is built so that the error handler
    # does not raise on an unbound name when the failure happens earlier.
    octagon_client = None
    try:
        logger.info('Fetching event data from previous step')
        body = event['body']
        # One interface instance is reused for every S3 call below.
        s3_interface = S3Interface()
        bucket = body['bucket']
        processed_keys_path = body['job']['processedKeysPath']
        processed_keys = s3_interface.list_objects(
            bucket, processed_keys_path)
        team = body['team']
        pipeline = body['pipeline']
        stage = body['pipeline_stage']
        dataset1 = body['dataset']
        peh_id = body['job']['peh_id']

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (
            octagon.OctagonClient()
            .with_run_lambda(True)
            .with_configuration_instance(body['env'])
            .build()
        )
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        logger.info('Initializing DynamoDB config and Interface')
        dynamo_interface = DynamoInterface(DynamoConfiguration())

        logger.info(
            'Storing metadata to DynamoDB and tagging resulting S3 Objects')
        tag_keys = ['org', 'app', 'env', 'team', 'dataset']
        for key in processed_keys:
            object_metadata = {
                'bucket': bucket,
                'key': key,
                'size': s3_interface.get_size(bucket, key),
                'last_modified_date':
                    s3_interface.get_last_modified(bucket, key),
                'org': body['org'],
                'app': body['app'],
                'env': body['env'],
                'team': team,
                'pipeline': pipeline,
                'dataset': dataset1,
                'stage': 'stage',
                'pipeline_stage': stage,
                'peh_id': peh_id,
            }
            dynamo_interface.update_object_metadata_catalog(object_metadata)
            tag_dict = {tag: object_metadata[tag] for tag in tag_keys}
            s3_interface.tag_object(bucket, key, tag_dict)

        # Only uncomment if a queue for the next stage exists
        # logger.info('Sending messages to next SQS queue if it exists')
        # sqs_config = SQSConfiguration(team, dataset, ''.join([stage[:-1], chr(ord(stage[-1]) + 1)]))
        # sqs_interface = SQSInterface(sqs_config.get_stage_queue_name)
        # sqs_interface.send_batch_messages_to_fifo_queue(processed_keys, 10, '{}-{}'.format(team, dataset))

        prestage_table = body['dest_table']['name']
        prestage_db = body['dest_db']
        dest_part_name = body['dest_table']['part_name']
        dest_part_value = body['dest_table']['part_value']
        processOutput = {}
        # Compared by value: the original identity test against '' only
        # worked through string interning and raises a SyntaxWarning.
        if dest_part_name != '' and dest_part_value != '':
            processOutput['partitions'] = [
                {"name": dest_part_name, "value": dest_part_value}
            ]
        processOutput['processed_keys'] = processed_keys

        # The names of both dependency tables come from SSM parameters.
        ssmresponse = ssmcli.get_parameter(
            Name=f'/SDLF/DDB/{team}/{pipeline}/DependenciesByTable')
        ddb_dependencies_by_table = ssmresponse['Parameter']['Value']
        ddb_table = dynamodb.Table(ddb_dependencies_by_table)
        ssmresponse = ssmcli.get_parameter(
            Name=f'/SDLF/DDB/{team}/{pipeline}/Dependencies')
        ddb_dependencies = ssmresponse['Parameter']['Value']

        consulta = f'{prestage_db.lower()}.{prestage_table.lower()}'
        logger.info(consulta)
        response = ddb_table.get_item(Key={'table_name': consulta})
        logger.info(f'Response {response}')

        if 'Item' in response:
            list_transforms = response['Item']['list_transforms']
            num_of_transforms = len(list_transforms)
            logger.debug(f'Response {response}')
            logger.info(f'This table triggers {num_of_transforms} datasets')
            next_stage = 'B'
            ddb_steps = dynamodb.Table(ddb_dependencies)
            table_partitions = processOutput.get('partitions', [])

            for dataset in list_transforms:
                logger.info(dataset)
                response = ddb_steps.get_item(Key={'dataset': dataset})
                logger.info(f'Response {response}')
                item = response['Item']
                dest_db, dest_table = item['dataset'].split('.')[:2]
                dependencies = item['dependencies']
                date_substitutions = replace_decimals(
                    item.get('date_substitutions', []))
                logger.info(f'Dependencies: {dependencies}')
                partition = item.get('partitionColumn', '')
                partition_mask = item.get('partitionPythonMask', None)

                # Only the first dependency entry that matches the trigger
                # table is acted on; the loop stops there.
                for table in dependencies:
                    table_db, table_name = table['TableName'].split('.')[:2]
                    table_partition_format = table.get('DateExpression', None)
                    relativedelta_attributes = replace_decimals(
                        table.get('relativedelta_attributes', None))
                    usage = table.get('Usage', 'validate').lower()

                    if usage == 'validate':
                        # NOTE(review): this branch compares names
                        # case-sensitively while the branch below lowercases
                        # both sides — confirm that is intended.
                        if (prestage_db == table_db
                                and prestage_table == table_name):
                            logger.info(
                                f'This table does not update/overwrite {dataset} dataset'
                            )
                            break
                        logger.debug(
                            f'Table {table_db}.{table_name} is not the trigger table'
                        )
                        continue

                    if not (prestage_db.lower() == table_db.lower()
                            and prestage_table.lower() == table_name.lower()):
                        continue

                    partition_value_formatted = ''
                    # If dest table has partitions and source table has
                    # partitions
                    logger.debug(
                        f'Partition: {partition}, table_partitions: {table_partitions}'
                    )
                    if table_partitions and table_partition_format is not None:
                        table_partition_value = table_partitions[0]['value']
                        value = datetime.strptime(
                            table_partition_value, table_partition_format)
                        # A dependency without relativedelta_attributes
                        # means "no shift" instead of a TypeError.
                        target_value = value - relativedelta(
                            **(relativedelta_attributes or {}))
                        partition_value_formatted = target_value.strftime(
                            partition_mask)
                    logger.info(
                        f'This table {usage.upper()} dataset {dest_table} '
                        f' Partition {partition} = {partition_value_formatted}'
                    )

                    stage_b_message = {
                        'prev_stage_processed_keys': processed_keys,
                        'team': team,
                        'pipeline': pipeline,
                        'pipeline_stage': ''.join([stage[:-1], next_stage]),
                        'dataset': dataset1,
                        'org': body['org'],
                        'app': body['app'],
                        'env': body['env'],
                        # usage is table['Usage'].lower() here: the default
                        # 'validate' never reaches this point.
                        'behaviour': usage,
                        'dest_db': dest_db,
                        'dest_table': {
                            'name': dest_table,
                            'part_name': partition,
                            'part_value': partition_value_formatted,
                        },
                        'steps': item['steps'],
                        'date_substitutions': date_substitutions,
                    }
                    logger.info(
                        'Sending messages to next SQS queue if it exists')
                    logger.info(stage_b_message)
                    # The queue is resolved from the current stage value.
                    sqs_config = SQSConfiguration(team, pipeline, stage)
                    sqs_interface = SQSInterface(
                        sqs_config.get_stage_queue_name)
                    sqs_interface.send_message_to_fifo_queue(
                        json.dumps(stage_b_message),
                        '{}-{}'.format(team, pipeline))
                    break
        else:
            logger.info('This table triggers 0 datasets')

        octagon_client.update_pipeline_execution(
            status=f'{stage} {component} Processing', component=component)
        octagon_client.end_pipeline_execution_success()
    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # stage and component are always bound once octagon_client is set.
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment=f'{stage} {component} Error: {repr(e)}')
        raise
    return 200
def lambda_handler(event, context):
    """Load Datafile metadata in manifests control table
       Check if manifest file is available within the threshold

    Lists the objects under ``pre-stage/<team>/manifests/<dataset>/`` in the
    stage bucket, keeps the file names matching the configured manifest regex
    and looks up the incoming data file in the manifests control table for
    each of them. When an entry is found its Stage A status is set to
    "STARTED" and its key is stored in the event; otherwise the function
    sleeps for 5 minutes and flags the manifest as not loaded so the caller
    can retry.

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Dictionary with outcome of the process; ``body`` gains
                  ``manifest_interval``, ``manifest_file_loaded`` ("True" /
                  "False") and, when found, ``manifest_ddb_key``

    Raises:
        Exception -- "Manifest Threshold Breached" when the minutes elapsed
                     since ``manifest_interval`` reach ``manifest_timeout``;
                     any other error is logged, reported to Octagon (when the
                     client is available) and re-raised
    """
    s3_interface = S3Interface()
    stage_bucket = S3Configuration().stage_bucket
    dynamo_config = DynamoConfiguration()
    dynamo_interface = DynamoInterface(dynamo_config)
    current_timestamp = dt.datetime.utcnow().timestamp()

    # Bound up front so the error handler can tell whether the Octagon client
    # was ever built; otherwise an early failure (e.g. a missing event key)
    # would surface as UnboundLocalError and hide the real exception.
    octagon_client = None

    try:
        logger.info("Fetching event data from previous step")
        team = event['body']['team']
        pipeline = event['body']['pipeline']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']
        peh_id = event['body']['peh_id']
        env = event['body']['env']
        bucket = event['body']['bucket']
        input_file_key = event['body']['key']
        input_file_name = input_file_key.split("/")[-1]
        manifest_file_pattern = event['body']['manifest_details'][
            'regex_pattern']
        manifest_timeout = int(
            event['body']['manifest_details']['manifest_timeout'])

        # The first invocation stamps the start of the wait; later invocations
        # receive that stamp back through the event.
        if 'manifest_interval' in event['body']:
            manifest_interval = event['body']['manifest_interval']
        else:
            manifest_interval = current_timestamp

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(env).build())
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)

        ### List S3 Objects for the manifest file in the manifest prefix
        ### For this to work the manifest should have been loaded into DynamoDB
        manifest_key = "pre-stage/{}/manifests/{}/".format(team, dataset)
        processed_manifest_keys = s3_interface.list_objects(
            stage_bucket, manifest_key)
        items = []

        if not processed_manifest_keys:
            logger.info(
                "Manifest File has not been loaded, sleeping for 5 mins")
            time.sleep(300)
            manifest_file_loaded = "False"
        else:
            manifest_file_names = (manifest_file_key.split("/")[-1]
                                   for manifest_file_key in
                                   processed_manifest_keys)
            matched_keys = [
                manifest_file_name
                for manifest_file_name in manifest_file_names
                if re.match(manifest_file_pattern, manifest_file_name)
            ]

            ### Query Manifests Control table
            dataset_name = team + "-" + dataset
            for manifest_file_name in matched_keys:
                try:
                    items.append(
                        dynamo_interface.
                        get_item_from_manifests_control_table(
                            dataset_name, manifest_file_name,
                            input_file_name))
                except KeyError:
                    # No entry for this manifest / data file pair yet; the
                    # outcome is decided below from whether any item was found.
                    logger.info(
                        "Manifest File has not been loaded, sleeping for 5 mins"
                    )

            ### Update Manifests Control table
            if not items:
                logger.info(
                    "Manifest File has not been loaded, sleeping for 5 mins")
                time.sleep(300)
                manifest_file_loaded = "False"
            else:
                ddb_key = {
                    'dataset_name': items[0]['dataset_name'],
                    'datafile_name': items[0]['datafile_name']
                }
                dynamo_interface.update_manifests_control_table_stagea(
                    ddb_key, "STARTED")
                manifest_file_loaded = "True"
                event['body']['manifest_ddb_key'] = ddb_key

        ### Check if Manifest threshold has exceeded
        # On the first invocation both stamps are equal, so refresh "now" to
        # account for any time spent sleeping above.
        if current_timestamp == manifest_interval:
            current_timestamp = dt.datetime.utcnow().timestamp()

        elapsed_minutes = int((current_timestamp - manifest_interval) / 60)
        if elapsed_minutes >= manifest_timeout:
            logger.error("Manifest Threshold Breached")
            raise Exception("Manifest Threshold Breached")

        event['body']['manifest_interval'] = manifest_interval
        event['body']['manifest_file_loaded'] = manifest_file_loaded

    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # stage and component are always bound once the client exists.
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: {}".format(
                    stage, component, repr(e)))
        raise
    return event
def lambda_handler(event, context):
    """Checks dependent datasets status

    Starts a pipeline execution in Octagon when the event carries no
    ``peh_id`` (otherwise retrieves the existing one), then queries the status
    of every dataset returned by ``get_dependent_datasets`` for the dependent
    stage and the current date. The overall status is "SUCCEEDED" only when
    every dependent dataset reports "COMPLETED".

    Arguments:
        event {dict} -- Dictionary with details on datasets dependency
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- Dictionary with details on datasets dependency; echoes the
                  input fields, increments ``retry_count`` by one and adds
                  ``dependent_datasets_status`` and ``peh_id``

    Raises:
        Exception -- any error is logged, reported to Octagon (when the client
                     is available) and re-raised unchanged
    """
    # Bound up front so the error handler can tell whether the Octagon client
    # was ever built; otherwise an early failure (e.g. a missing event key)
    # would surface as UnboundLocalError and hide the real exception.
    octagon_client = None

    try:
        logger.info("Dataset dependency Lambda")
        bucket = event['body']['bucket']
        team = event['body']['team']
        pipeline = event['body']['pipeline']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']
        env = event['body']['env']
        dependent_stage = event['body']['dependent_stage']
        retry_count = event['body']["retry_count"]

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(env).build())

        if 'peh_id' not in event['body']:
            # First attempt: open a new pipeline execution record.
            peh_id = octagon_client.start_pipeline_execution(
                pipeline_name='{}-{}-stage-{}'.format(
                    team, pipeline, stage[-1].lower()),
                dataset_name='{}-{}'.format(team, dataset),
                comment=event)
        else:
            # Retry: attach to the execution opened by the first attempt.
            peh_id = event['body']['peh_id']
            octagon.peh.PipelineExecutionHistoryAPI(
                octagon_client).retrieve_pipeline_execution(peh_id)

        logger.info("Checking dependent tables status")
        dependent_datasets = get_dependent_datasets(team, dataset)

        atomic_completed_datasets_count = sum(
            1 for each_dataset in dependent_datasets
            if get_dynamodb_peh_status(
                env, dependent_datasets[each_dataset], dependent_stage,
                get_current_date()) == "COMPLETED")

        if len(dependent_datasets) == atomic_completed_datasets_count:
            dependent_datasets_status = "SUCCEEDED"
        else:
            dependent_datasets_status = "FAILED"

        octagon_client.update_pipeline_execution(
            status="{} {} Dependent Datasets Status".format(
                stage, component),
            component=component)

    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # stage and component are always bound once the client exists.
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: {}".format(
                    stage, component, repr(e)))
        raise

    return {
        "body": {
            "bucket": bucket,
            "team": team,
            "pipeline": pipeline,
            "pipeline_stage": stage,
            "dataset": dataset,
            "env": env,
            "dependent_stage": dependent_stage,
            "retry_count": retry_count + 1,
            "dependent_datasets_status": dependent_datasets_status,
            "peh_id": peh_id
        }
    }
def lambda_handler(event, context):
    """Write Metadata JSON Files for Data Source

    Creates the ``datasource_metadata`` directory key for the dataset, reads
    the compiled topics CSV from the post-stage prefix and splits its rows
    into batches of 10,000, producing one payload per batch.

    Arguments:
        event {dict} -- Dictionary with details on Bucket and Keys
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- ``{"jobList": [...]}`` where every entry carries the
                  ``start`` / ``end`` row bounds (as strings), the source
                  ``key``, the ``bucket``, the ``directory_key`` to write to,
                  the ``team`` and the ``dataset``

    Raises:
        Exception -- any error is logged, reported to Octagon (when the client
                     is available) and re-raised unchanged
    """
    batch_size = 10000

    # Bound up front so the error handler can tell whether the Octagon client
    # was ever built. Here the client is created only after all S3 / pandas
    # work, so without this guard any earlier failure would surface as
    # UnboundLocalError and hide the real exception.
    octagon_client = None

    try:
        logger.info('Fetching event data from previous step')
        team = event['body']['team']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']
        bucket = event['body']['bucket']

        # This Stage will Add Metadata Directory
        # (NOTE: We can use metadata to filter queries in Amazon Kendra):
        # Add a Metadata Directory for a s3 location to write json files
        directory_key = "pre-stage/{}/{}/datasource_metadata/".format(
            team, dataset)
        s3client.put_object(Bucket=bucket, Key=directory_key)

        # Get KMS Key to Encrypt Data
        # NOTE(review): the value is not used anywhere in this handler; the
        # lookup is kept only in case it has side effects - confirm and drop.
        kms_key = KMSConfiguration(team).get_kms_arn

        # Read in our compiled metadata and topic data in a DataFrame
        key = "post-stage/{}/{}/compile_topics_data.csv".format(team, dataset)
        obj = s3client.get_object(Bucket=bucket, Key=key)
        metadata = pd.read_csv(io.BytesIO(obj['Body'].read()), encoding='utf8')

        # Add A Dictionary to Pass JSON Structure Parameters for Each
        # Lambda Invocation (one for every 10,000 rows so no timeouts)
        # NOTE(review): count() only counts non-null "abstract" values, so
        # this equals the row count only if that column has no gaps - confirm.
        rows = int(metadata["abstract"].count())

        # Ceiling division: an exact multiple of the batch size must not get
        # an extra, empty trailing job. At least one job is always emitted, so
        # an empty dataset still yields a single (0, 0) payload.
        invocations = max(1, -(-rows // batch_size))

        job_list = []
        for i in range(invocations):
            # Set Start and End Rows for each Lambda
            start = i * batch_size
            end = min(start + batch_size, rows)

            # Send a Payload with the s3 path to write and the start/end row count
            job_list.append({
                "start": str(start),
                "end": str(end),
                "key": key,
                "bucket": bucket,
                "directory_key": directory_key,
                "team": team,
                "dataset": dataset
            })

        jobs = {"jobList": job_list}

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(event['body']['env']).build())
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(
                event['body']['job']['peh_id'])

        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)

    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # stage and component are always bound once the client exists.
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: {}".format(
                    stage, component, repr(e)))
        raise
    return jobs
def lambda_handler(event, context):
    """Checks if the data files of a manifest have finished Stage A

    Looks up every key to process in the manifests control table and inspects
    its ``stage_a_status``:

    * any "FAILED" file marks all files as "FAILED" for Stage B and raises;
    * any file that is neither "COMPLETED" nor "FAILED" makes the function
      sleep for 5 minutes and report ``data_file_wait = "True"``, unless the
      data file timeout is reached, in which case all files are marked
      "FAILED" for Stage B and an exception is raised;
    * otherwise all files are marked "STARTED" for Stage B and
      ``data_file_wait`` is "False".

    Arguments:
        event {dict} -- Dictionary with details on previous processing step
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {dict} -- The incoming event with ``manifest_interval`` and
                  ``data_file_wait`` added to ``body``

    Raises:
        Exception -- when a data file is missing from the control table,
                     failed in Stage A, or the wait threshold is breached;
                     every error is logged, reported to Octagon (when the
                     client is available) and re-raised
    """
    # Bound up front so the error handler can tell whether the Octagon client
    # was ever built; otherwise an early failure (e.g. a missing event key)
    # would surface as UnboundLocalError and hide the real exception.
    octagon_client = None

    try:
        logger.info('Fetching event data from previous step')
        bucket = event['body']['bucket']
        keys_to_process = event['body']['keysToProcess']
        team = event['body']['team']
        pipeline = event['body']['pipeline']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']
        peh_id = event['body']['peh_id']
        manifest_data_timeout = int(
            event['body']['manifest_details']['manifest_data_timeout'])
        current_timestamp = dt.datetime.utcnow().timestamp()

        # The first invocation stamps the start of the wait; later invocations
        # receive that stamp back through the event.
        if 'manifest_interval' in event['body']:
            manifest_interval = event['body']['manifest_interval']
        else:
            manifest_interval = current_timestamp

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(event['body']['env']).build())
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        ### Set max_items_process in datasets table so that the statemachine only processes 1 manifest file at a time
        ddb_keys = get_ddb_keys(keys_to_process, bucket, team, dataset)

        dynamo_config = DynamoConfiguration()
        dynamo_interface = DynamoInterface(dynamo_config)

        def update_all_stage_b(*status_args):
            """Apply the same Stage B status update to every data file."""
            for ddb_key in ddb_keys:
                update_key = dynamo_interface.manifest_keys(
                    ddb_key["dataset_name"], ddb_key["manifest_file_name"],
                    ddb_key["datafile_name"])
                dynamo_interface.update_manifests_control_table_stageb(
                    update_key, *status_args)

        ### Query Manifest Control Table to get the status
        items = []
        logger.info(
            "Querying DynamoDB to check data in manifests control table for Stage A status"
        )
        for ddb_key in ddb_keys:
            try:
                items.append(
                    dynamo_interface.get_item_from_manifests_control_table(
                        ddb_key["dataset_name"],
                        ddb_key["manifest_file_name"],
                        ddb_key["datafile_name"]))
            except KeyError as err:
                logger.error(
                    "The manifest file has not been processed in Stage A")
                raise Exception(
                    "Manifest File has not been processed in Stage A"
                ) from err

        ### Check stage a status for data files
        logger.info(
            "Checking to see if all the files have been processed in Stage A")
        status_message_list = []
        failed_status_message_list = []

        for item in items:
            # A file without a recorded status has not started Stage A yet.
            if "stage_a_status" in item:
                stage_a_status = item["stage_a_status"]
            else:
                stage_a_status = "NOT STARTED"

            datafile_label = item["datafile_name"].split("-")[-1]
            if stage_a_status == "FAILED":
                failed_status_message_list.append(
                    "Data Files Failed in Stage A {}".format(datafile_label))
            elif stage_a_status != "COMPLETED":
                status_message_list.append(
                    "Waiting for Data File {}".format(datafile_label))

        if failed_status_message_list:
            logger.error("Data File Failure in Stage A, Processing will stop")
            logger.error("The following files have failed in Stage A")
            for message in failed_status_message_list:
                logger.error(message)

            ### Update manifest control table, mark all files as failed in Stage B
            update_all_stage_b("FAILED", None, "Datafile Failed in Stage A")
            raise Exception("Data File Failure in Stage A")

        if status_message_list:
            logger.info("Waiting for Data Files to be processed in Stage A")
            for message in status_message_list:
                logger.info(message)
            logger.info("Will sleep for 5 mins")
            time.sleep(300)
            data_file_wait = "True"

            # On the first invocation both stamps are equal, so refresh "now"
            # to account for the sleep above.
            if manifest_interval == current_timestamp:
                current_timestamp = dt.datetime.utcnow().timestamp()

            elapsed_minutes = int(
                (current_timestamp - manifest_interval) / 60)
            if elapsed_minutes >= manifest_data_timeout:
                logger.error("Data File Threshold Breached")
                logger.error("Stage B Processing Will Stop Now")
                for message in status_message_list:
                    logger.error(message)

                ### Update manifest control table, mark all files as failed in Stage B
                update_all_stage_b("FAILED", None,
                                   "Datafile threshold Breached")
                raise Exception("Data File Threshold Breached")
        else:
            logger.info("All files processed in Stage A")
            data_file_wait = "False"
            update_all_stage_b("STARTED")

        event["body"]["manifest_interval"] = manifest_interval
        event["body"]["data_file_wait"] = data_file_wait

        remove_content_tmp()
        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)

    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        try:
            # stage and component are always bound once the client exists.
            if octagon_client is not None:
                octagon_client.end_pipeline_execution_failed(
                    component=component,
                    issue_comment="{} {} Error: {}".format(
                        stage, component, repr(e)))
        finally:
            # Clean up even if reporting the failure to Octagon itself fails.
            remove_content_tmp()
        raise
    return event
def lambda_handler(event, context):
    """Updates the S3 objects metadata catalog

    Lists the objects under the processed keys path, writes one entry per
    object to the object metadata catalog (tagged with stage "post-stage")
    and closes the pipeline execution as successful.

    Arguments:
        event {dict} -- Dictionary with details on Bucket and Keys
        context {dict} -- Dictionary with details on Lambda context

    Returns:
        {list} -- Table names added to the result path
                  (``["compile_topics_data_csv"]``)

    Raises:
        Exception -- any error is logged, reported to Octagon (when the client
                     is available) and re-raised unchanged
    """
    # Bound up front so the error handler can tell whether the Octagon client
    # was ever built; otherwise an early failure (e.g. a missing event key)
    # would surface as UnboundLocalError and hide the real exception.
    octagon_client = None

    try:
        logger.info('Fetching event data from previous step')
        bucket = event['body']['bucket']
        processed_keys_path = event['body']['job']['processedKeysPath']
        processed_keys = S3Interface().list_objects(bucket,
                                                    processed_keys_path)
        team = event['body']['team']
        pipeline = event['body']['pipeline']
        stage = event['body']['pipeline_stage']
        dataset = event['body']['dataset']

        logger.info('Initializing Octagon client')
        component = context.function_name.split('-')[-2].title()
        octagon_client = (octagon.OctagonClient().with_run_lambda(
            True).with_configuration_instance(event['body']['env']).build())
        peh_id = event['body']['job']['peh_id']
        peh.PipelineExecutionHistoryAPI(
            octagon_client).retrieve_pipeline_execution(peh_id)

        logger.info('Initializing DynamoDB config and Interface')
        dynamo_config = DynamoConfiguration()
        dynamo_interface = DynamoInterface(dynamo_config)

        logger.info('Storing metadata to DynamoDB')
        for key in processed_keys:
            object_metadata = {
                'bucket': bucket,
                'key': key,
                'team': team,
                'pipeline': pipeline,
                'dataset': dataset,
                'peh_id': peh_id,
                'stage': 'post-stage'
            }
            dynamo_interface.update_object_metadata_catalog(object_metadata)

        # Add Tables to Result Path to Enable Deequ Job
        table_path = "compile_topics_data_csv"
        tables = [table_path]

        # Only uncomment if using Kendra and index and data source ALREADY created
        # Data Sync Job
        # kendra_client = boto3.client('kendra')
        # response = kendra_client.start_data_source_sync_job(
        #     Id='ENTER_DATASOURCE_ID',
        #     IndexId='ENTER_INDEX_ID'
        # )

        # Only uncomment if a queue for the next stage exists
        # logger.info('Sending messages to next SQS queue if it exists')
        # sqs_config = SQSConfiguration(team, dataset, ''.join([stage[:-1], chr(ord(stage[-1]) + 1)]))
        # sqs_interface = SQSInterface(sqs_config.get_stage_queue_name)
        # sqs_interface.send_batch_messages_to_fifo_queue(processed_keys, 10, '{}-{}'.format(team, dataset))

        octagon_client.update_pipeline_execution(
            status="{} {} Processing".format(stage, component),
            component=component)
        octagon_client.end_pipeline_execution_success()

    except Exception as e:
        logger.error("Fatal error", exc_info=True)
        # stage and component are always bound once the client exists.
        if octagon_client is not None:
            octagon_client.end_pipeline_execution_failed(
                component=component,
                issue_comment="{} {} Error: {}".format(
                    stage, component, repr(e)))
        raise
    return tables