def ingest_into_native_bigquery_storage(data, context):
    """This is the primary function invoked whenever the Cloud Function is
    triggered. It parses the Pub/Sub notification that triggered it by
    extracting the location of the file in Google Cloud Storage (GCS). It
    subsequently downloads the contents of this file from GCS, sanitizes &
    augments the events within it & finally writes them into native BigQuery
    storage.
    """

    # Source required datasets & tables:
    bigquery_asset_list = [
        # (dataset, table_name, table_schema, table_partition_column)
        ('logs', f'native_events_{os.environ["ENVIRONMENT"]}', 'logs', 'event_ds'),
        ('logs', f'native_events_debug_{os.environ["ENVIRONMENT"]}', 'logs', 'event_ds'),
        ('logs', f'dataflow_backfill_{os.environ["ENVIRONMENT"]}', 'logs', 'event_ds'),
        ('native', f'events_improbable_{os.environ["ENVIRONMENT"]}', 'improbable', 'event_timestamp')
    ]

    try:
        table_logs, table_debug, _, table_function = source_bigquery_assets(
            client_bq, bigquery_asset_list)
    except Exception:
        table_logs, table_debug, _, table_function = generate_bigquery_assets(
            client_bq, bigquery_asset_list)

    # Parse payload:
    payload = json.loads(base64.b64decode(data['data']).decode('utf-8'))
    bucket_name, object_location = payload['bucket'], payload['name']
    gspath = f'gs://{bucket_name}/{object_location}'

    # Write log to events_logs_function:
    malformed, failed_insertion = False, False
    errors = client_bq.insert_rows(
        table_logs,
        format_event_list(['parse_initiated'], str, os.environ['FUNCTION_NAME'], gspath))
    if errors:
        print(f'Errors while inserting logs: {str(errors)}')
        failed_insertion = True

    # Get file from GCS:
    bucket = client_gcs.get_bucket(bucket_name)
    try:
        data = bucket.get_blob(object_location).download_as_string().decode('utf8')
    except UnicodeDecodeError:
        print('Automatic decompressive transcoding failed, unzipping content..')
        data = gunzip_bytes_obj(
            bucket.get_blob(object_location).download_as_string()).decode('utf-8')
    except Exception:
        raise Exception(
            f'Could not retrieve file gs://{bucket_name}/{object_location} from GCS!')
    # We use generators in order to save memory usage, allowing the Cloud
    # Function to use the smallest capacity template:
    for chunk in generator_chunk(generator_split(data, '\n'), 1000):
        events_batch_function, events_batch_debug = [], []
        for event_tuple in generator_load_json(chunk):
            if event_tuple[0]:
                for event in event_tuple[1]:
                    d = dict()

                    # Sanitize:
                    d['analytics_environment'] = get_dict_value(event, 'analyticsEnvironment', 'analytics_environment')
                    d['event_environment'] = get_dict_value(event, 'eventEnvironment', 'event_environment')
                    d['event_source'] = get_dict_value(event, 'eventSource', 'event_source')
                    d['session_id'] = get_dict_value(event, 'sessionId', 'session_id')
                    d['version_id'] = get_dict_value(event, 'versionId', 'version_id')
                    d['batch_id'] = get_dict_value(event, 'batchId', 'batch_id')
                    d['event_id'] = get_dict_value(event, 'eventId', 'event_id')
                    d['event_index'] = get_dict_value(event, 'eventIndex', 'event_index')
                    d['event_class'] = get_dict_value(event, 'eventClass', 'event_class')
                    d['event_type'] = get_dict_value(event, 'eventType', 'event_type')
                    d['player_id'] = get_dict_value(event, 'playerId', 'player_id')
                    d['event_timestamp'] = cast_to_unix_timestamp(
                        get_dict_value(event, 'eventTimestamp', 'event_timestamp'),
                        ['%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d %H:%M:%S %Z'])
                    # This value was set by our endpoint, so we already know it is in unixtime:
                    d['received_timestamp'] = get_dict_value(event, 'receivedTimestamp', 'received_timestamp')

                    # Augment:
                    d['inserted_timestamp'] = time.time()
                    d['job_name'] = os.environ['FUNCTION_NAME']

                    # Sanitize:
                    d['event_attributes'] = get_dict_value(event, 'eventAttributes', 'event_attributes')

                    events_batch_function.append(d)
            else:
                events_batch_debug.append(event_tuple[1])

        if len(events_batch_function) > 0:
            # Write JSON to events_function:
            errors = client_bq.insert_rows(table_function, events_batch_function)
            if errors:
                print(f'Errors while inserting events: {str(errors)}')
                failed_insertion = True

        if len(events_batch_debug) > 0:
            # Write non-JSON to events_debug_function:
            errors = client_bq.insert_rows(
                table_debug,
                format_event_list(events_batch_debug, str, os.environ['FUNCTION_NAME'], gspath))
            if errors:
                print(f'Errors while inserting debug event: {str(errors)}')
                failed_insertion = True
            malformed = True

    # We only `raise` now because further iterations of the execution loop could have still succeeded:
    if failed_insertion and malformed:
        raise Exception(
            f'Failed to insert records into BigQuery, inspect logs! Non-JSON data present in gs://{bucket_name}/{object_location}')
    if failed_insertion:
        raise Exception('Failed to insert records into BigQuery, inspect logs!')
    if malformed:
        raise Exception(f'Non-JSON data present in gs://{bucket_name}/{object_location}')

    return 200
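# The helpers `generator_split`, `generator_chunk` & `generator_load_json` are
# defined elsewhere in this codebase; the sketch below only illustrates the
# behaviour the chunked loop above relies on (lazy line splitting, fixed-size
# batching & per-line JSON parsing). It is an assumption-laden stand-in, not
# the canonical implementation.


def generator_split(text, delimiter):
    # Lazily yield substrings of `text` separated by `delimiter`, avoiding the
    # full in-memory copy that str.split() would create.
    start = 0
    while True:
        index = text.find(delimiter, start)
        if index == -1:
            yield text[start:]
            return
        yield text[start:index]
        start = index + len(delimiter)


def generator_chunk(iterable, chunk_size):
    # Group any iterable into lists of at most `chunk_size` items.
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == chunk_size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk


def generator_load_json(lines):
    # Yield (True, [events]) for lines that parse as JSON & (False, raw_line)
    # otherwise, so malformed records can be routed to the debug table.
    for line in lines:
        if not line.strip():
            continue
        try:
            parsed = json.loads(line)
            yield True, parsed if isinstance(parsed, list) else [parsed]
        except ValueError:
            yield False, line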
def run():
    client_bq = bigquery.Client.from_service_account_json(
        args.local_sa_key, location=args.location)

    bigquery_asset_list = [
        # (dataset, table_name, table_partition_column)
        ('logs', 'events_logs_function_native', 'event_ds'),
        ('logs', 'events_debug_function_native', 'event_ds'),
        ('logs', 'events_logs_dataflow_backfill', 'event_ds'),
        ('events', 'events_function_native', 'event_timestamp')
    ]

    try:
        source_bigquery_assets(client_bq, bigquery_asset_list)
    except Exception:
        generate_bigquery_assets(client_bq, bigquery_asset_list)

    # https://github.com/apache/beam/blob/master/sdks/python/apache_beam/options/pipeline_options.py
    po = PipelineOptions()
    job_name = 'p1-gcs-to-bq-{method}-backfill-{environment_name}-{event_category}-{event_ds_start}-to-{event_ds_stop}-{event_time}-{ts}'.format(
        method=method,
        environment_name=environment_name,
        event_category=args.event_category.replace('_', '-'),
        event_ds_start=args.event_ds_start,
        event_ds_stop=args.event_ds_stop,
        event_time=time_part_name,
        ts=str(int(time.time())))

    # https://cloud.google.com/dataflow/docs/guides/specifying-exec-params
    pipeline_options = po.from_dictionary({
        'project': args.gcp,
        'staging_location': 'gs://{bucket_name}/data_type=dataflow/batch/staging/{job_name}/'.format(
            bucket_name=args.bucket_name, job_name=job_name),
        'temp_location': 'gs://{bucket_name}/data_type=dataflow/batch/temp/{job_name}/'.format(
            bucket_name=args.bucket_name, job_name=job_name),
        'runner': args.execution_environment,  # {DirectRunner, DataflowRunner}
        'setup_file': args.setup_file,
        'service_account_email': 'dataflow-batch@{gcp_project_id}.iam.gserviceaccount.com'.format(
            gcp_project_id=args.gcp),
        'job_name': job_name,
        'region': args.gcp_region
    })
    pipeline_options.view_as(SetupOptions).save_main_session = True

    p1 = beam.Pipeline(options=pipeline_options)

    fileListGcs = (p1
                   | 'CreateGcsIterators' >> beam.Create(list(generate_gcs_file_list(
                       args.bucket_name, environment_list, category_list,
                       args.event_ds_start, args.event_ds_stop, time_part_list,
                       args.scale_test_name)))
                   | 'GetGcsFileList' >> beam.ParDo(GetGcsFileList())
                   | 'GcsListPairWithOne' >> beam.Map(lambda x: (x, 1)))

    fileListBq = (p1
                  | 'ParseBqFileList' >> beam.io.Read(beam.io.BigQuerySource(  # "What is already in BQ?"
                      query=generate_backfill_query(
                          args.gcp,
                          method,
                          (safe_convert_list_to_sql_tuple(environment_list), environment_name),
                          (safe_convert_list_to_sql_tuple(category_list), category_name),
                          args.event_ds_start,
                          args.event_ds_stop,
                          (safe_convert_list_to_sql_tuple(time_part_list), time_part_name),
                          args.scale_test_name),
                      use_standard_sql=True))
                  | 'BqListPairWithOne' >> beam.Map(lambda x: (x['gspath'], 1)))

    parseList = ({'fileListGcs': fileListGcs, 'fileListBq': fileListBq}
                 | 'CoGroupByKey' >> beam.CoGroupByKey()
                 | 'UnionMinusIntersect' >> beam.Filter(
                     lambda x: (len(x[1]['fileListGcs']) == 1 and len(x[1]['fileListBq']) == 0))
                 | 'ExtractKeysParseList' >> beam.Map(lambda x: x[0]))

    # Write to BigQuery:
    logsList = (parseList
                | 'AddParseInitiatedInfo' >> beam.Map(lambda gspath: {
                    'job_name': job_name,
                    'processed_timestamp': time.time(),
                    'batch_id': hashlib.md5(gspath.encode('utf-8')).hexdigest(),
                    'analytics_environment': parse_gspath(gspath, 'analytics_environment='),
                    'event_category': parse_gspath(gspath, 'event_category='),
                    'event_ds': parse_gspath(gspath, 'event_ds='),
                    'event_time': parse_gspath(gspath, 'event_time='),
                    'event': 'parse_initiated',
                    'gspath': gspath})
                | 'WriteParseInitiated' >> beam.io.WriteToBigQuery(
                    table='events_logs_dataflow_backfill',
                    dataset='logs',
                    project=args.gcp,
                    method='FILE_LOADS',
                    create_disposition=beam.io.gcp.bigquery.BigQueryDisposition.CREATE_IF_NEEDED,
                    write_disposition=beam.io.gcp.bigquery.BigQueryDisposition.WRITE_APPEND,
                    insert_retry_strategy=beam.io.gcp.bigquery_tools.RetryStrategy.RETRY_ON_TRANSIENT_ERROR,
                    schema='job_name:STRING,processed_timestamp:TIMESTAMP,batch_id:STRING,analytics_environment:STRING,event_category:STRING,event_ds:DATE,event_time:STRING,event:STRING,gspath:STRING'))

    # Write to Pub/Sub:
    PDone = (parseList
             | 'DumpParseListPubSub' >> beam.io.WriteToText(
                 'gs://{bucket_name}/data_type=dataflow/batch/output/{job_name}/parselist'.format(
                     bucket_name=args.bucket_name, job_name=job_name))
             | 'WriteToPubSub' >> beam.ParDo(WriteToPubSub(), job_name, args.topic, args.gcp, args.bucket_name))

    p1.run().wait_until_finish()

    return job_name
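# `parse_gspath` is defined elsewhere in this codebase; the sketch below is a
# minimal illustration of the behaviour assumed by 'AddParseInitiatedInfo'
# above: pull the value that follows a `key=` segment out of a Hive-partitioned
# GCS path, e.g.
# gs://<bucket>/data_type=json/analytics_environment=<env>/event_category=<category>/event_ds=<date>/event_time=<part>/<file>.
# The exact path layout is an assumption here.
def parse_gspath(gspath, key):
    # Return the path segment that follows `key`, or None when the key is absent.
    if key not in gspath:
        return None
    return gspath.split(key)[1].split('/')[0]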