def test_invalid_event(data):
    with pytest.raises(UnparseableEvent):
        parse(data)
def process_fastly_log(data, context):
    storage_client = storage.Client()
    bigquery_client = bigquery.Client()

    # Derive an identifier for this log file and the default date partition
    # used for the unprocessed-output path.
    identifier = os.path.basename(data["name"]).split("-", 3)[-1].rstrip(".log.gz")
    default_partition = datetime.datetime.utcnow().strftime("%Y%m%d")

    bob_logs_log_blob = storage_client.bucket(data["bucket"]).get_blob(data["name"])
    if bob_logs_log_blob is None:
        return  # This has already been processed?

    unprocessed_lines = 0
    simple_lines = 0
    download_lines = 0

    with ExitStack() as stack:
        input_file_obj = stack.enter_context(NamedTemporaryFile())
        bob_logs_log_blob.download_to_file(input_file_obj)
        input_file_obj.flush()

        input_file = stack.enter_context(gzip.open(input_file_obj.name, "rb"))
        unprocessed_file = stack.enter_context(NamedTemporaryFile())
        simple_results_file = stack.enter_context(NamedTemporaryFile())
        download_results_file = stack.enter_context(NamedTemporaryFile())

        # Route each log line into the matching NDJSON results file; anything
        # unparseable goes to the unprocessed file.
        for line in input_file:
            try:
                res = parse(line.decode())
                if res is not None:
                    if res.__class__.__name__ == Simple.__name__:
                        simple_results_file.write(
                            json.dumps(_cattr.unstructure(res)).encode() + b"\n"
                        )
                        simple_lines += 1
                    elif res.__class__.__name__ == Download.__name__:
                        download_results_file.write(
                            json.dumps(_cattr.unstructure(res)).encode() + b"\n"
                        )
                        download_lines += 1
                    else:
                        unprocessed_file.write(line)
                        unprocessed_lines += 1
                else:
                    unprocessed_file.write(line)
                    unprocessed_lines += 1
            except Exception:
                unprocessed_file.write(line)
                unprocessed_lines += 1

        total = unprocessed_lines + simple_lines + download_lines
        print(
            f"Processed gs://{data['bucket']}/{data['name']}: {total} lines, {simple_lines} simple_requests, {download_lines} file_downloads, {unprocessed_lines} unprocessed"
        )

        # Load the NDJSON results into BigQuery.
        dataset_ref = bigquery_client.dataset(DATASET)
        job_config = bigquery.LoadJobConfig()
        job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
        job_config.ignore_unknown_values = True

        if download_lines > 0:
            load_job = bigquery_client.load_table_from_file(
                download_results_file,
                dataset_ref.table(DOWNLOAD_TABLE),
                job_id_prefix="linehaul_file_downloads",
                location="US",
                job_config=job_config,
                rewind=True,
            )
            load_job.result()
            print(f"Loaded {load_job.output_rows} rows into {DATASET}:{DOWNLOAD_TABLE}")

        if simple_lines > 0:
            load_job = bigquery_client.load_table_from_file(
                simple_results_file,
                dataset_ref.table(SIMPLE_TABLE),
                job_id_prefix="linehaul_file_downloads",
                location="US",
                job_config=job_config,
                rewind=True,
            )
            load_job.result()
            print(f"Loaded {load_job.output_rows} rows into {DATASET}:{SIMPLE_TABLE}")

        bucket = storage_client.bucket(os.environ.get("RESULT_BUCKET"))

        if unprocessed_lines > 0:
            blob = bucket.blob(f"unprocessed/{default_partition}/{identifier}.txt")
            try:
                blob.upload_from_file(unprocessed_file, rewind=True)
            except Exception:
                # Be opportunistic about unprocessed files...
                pass

    bob_logs_log_blob.delete()
def test_download_parsing(event_data, expected):
    if inspect.isclass(expected) and issubclass(expected, Exception):
        with pytest.raises(expected):
            parse(event_data)
    else:
        assert parse(event_data) == expected
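# The two tests above receive their inputs (`data`, `event_data`, `expected`)
# as arguments, which in pytest normally come from fixtures or from
# @pytest.mark.parametrize. The sketch below is a hypothetical, self-contained
# illustration of that wiring: toy_parse() and the sample cases are stand-ins,
# not the project's real parser or test data.
import inspect

import pytest


def toy_parse(line):
    # Stand-in for the project's parse(); raises on anything that is not "ok".
    if line != "ok":
        raise ValueError(line)
    return {"kind": "ok"}


# Each case is either (input, expected value) or (input, expected exception
# class), mirroring how test_download_parsing treats its `expected` argument.
@pytest.mark.parametrize(
    ("event_data", "expected"),
    [
        ("ok", {"kind": "ok"}),
        ("garbage", ValueError),
    ],
)
def test_toy_parsing(event_data, expected):
    if inspect.isclass(expected) and issubclass(expected, Exception):
        with pytest.raises(expected):
            toy_parse(event_data)
    else:
        assert toy_parse(event_data) == expected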
def process_fastly_log(data, context):
    storage_client = storage.Client()

    file_name = os.path.basename(data["name"]).rstrip(".log.gz")

    print(f"Beginning processing for gs://{data['bucket']}/{data['name']}")

    bob_logs_log_blob = storage_client.bucket(data["bucket"]).get_blob(data["name"])
    if bob_logs_log_blob is None:
        return  # This has already been processed?

    unprocessed_lines = 0
    simple_lines = 0
    download_lines = 0

    with ExitStack() as stack:
        input_file_obj = stack.enter_context(NamedTemporaryFile())
        bob_logs_log_blob.download_to_file(input_file_obj)
        input_file_obj.flush()

        input_file = stack.enter_context(gzip.open(input_file_obj.name, "rb"))
        unprocessed_file = stack.enter_context(NamedTemporaryFile())
        simple_results_file = stack.enter_context(NamedTemporaryFile())
        download_results_file = stack.enter_context(NamedTemporaryFile())

        # Track the earliest event timestamp seen; it selects the output
        # partition for the result files.
        min_timestamp = arrow.utcnow()

        for line in input_file:
            try:
                res = parse(line.decode())
                min_timestamp = min(min_timestamp, res.timestamp)
                if res is not None:
                    if res.__class__.__name__ == Simple.__name__:
                        simple_results_file.write(
                            json.dumps(_cattr.unstructure(res)).encode() + b"\n"
                        )
                        simple_lines += 1
                    elif res.__class__.__name__ == Download.__name__:
                        download_results_file.write(
                            json.dumps(_cattr.unstructure(res)).encode() + b"\n"
                        )
                        download_lines += 1
                    else:
                        unprocessed_file.write(line)
                        unprocessed_lines += 1
                else:
                    unprocessed_file.write(line)
                    unprocessed_lines += 1
            except Exception:
                unprocessed_file.write(line)
                unprocessed_lines += 1

        total = unprocessed_lines + simple_lines + download_lines
        print(
            f"Processed gs://{data['bucket']}/{data['name']}: {total} lines, {simple_lines} simple_requests, {download_lines} file_downloads, {unprocessed_lines} unprocessed"
        )

        bucket = storage_client.bucket(RESULT_BUCKET)
        partition = min_timestamp.strftime("%Y%m%d")

        if simple_lines > 0:
            blob = bucket.blob(f"processed/{partition}/simple-{file_name}.json")
            blob.upload_from_file(simple_results_file, rewind=True)

        if download_lines > 0:
            blob = bucket.blob(f"processed/{partition}/downloads-{file_name}.json")
            blob.upload_from_file(download_results_file, rewind=True)

        if unprocessed_lines > 0:
            blob = bucket.blob(f"unprocessed/{partition}/{file_name}.txt")
            try:
                blob.upload_from_file(unprocessed_file, rewind=True)
            except Exception:
                # Be opportunistic about unprocessed files...
                pass

    # Remove the log file we processed
    try:
        bob_logs_log_blob.delete()
    except exceptions.NotFound:
        # Sometimes we try to delete twice
        pass
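# Both versions of process_fastly_log above follow the signature of a
# background Cloud Function triggered by a Cloud Storage event: `data` is the
# event payload (which carries at least the finalized object's "bucket" and
# "name") and `context` is event metadata that the code never uses. The sketch
# below shows one way to invoke the function locally; the bucket and object
# names are placeholders, and actually running it would require Google Cloud
# credentials plus an existing gzipped Fastly log at that path.
if __name__ == "__main__":
    fake_event = {
        "bucket": "example-fastly-logs",  # placeholder bucket name
        "name": "fastly/2024-01-01T00:00:00.000-abc123.log.gz",  # placeholder object
    }
    # No event metadata is needed for a local run, so pass None for context.
    process_fastly_log(fake_event, None)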