Example #1
import pytest

# `parse` and `UnparseableEvent` come from the project's parser module.
def test_invalid_event(data):
    with pytest.raises(UnparseableEvent):
        parse(data)
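A minimal sketch of supplying `data` via parametrization; the invalid log lines below are hypothetical placeholders, not fixtures from the project:

# Hypothetical malformed inputs; real tests would use captured log lines.
@pytest.mark.parametrize("data", ["", "not|a|real|log|line", "garbage"])
def test_invalid_event(data):
    with pytest.raises(UnparseableEvent):
        parse(data)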
Example #2
import datetime
import gzip
import json
import os
from contextlib import ExitStack
from tempfile import NamedTemporaryFile

from google.cloud import bigquery, storage

# `parse`, `Simple`, `Download`, `_cattr`, and the DATASET / *_TABLE constants
# are module-level names from the surrounding project.
def process_fastly_log(data, context):
    storage_client = storage.Client()
    bigquery_client = bigquery.Client()
    # str.rstrip() strips a set of characters, not a suffix; use removesuffix().
    identifier = os.path.basename(data["name"]).split("-", 3)[-1].removesuffix(".log.gz")
    default_partition = datetime.datetime.utcnow().strftime("%Y%m%d")

    bob_logs_log_blob = storage_client.bucket(data["bucket"]).get_blob(data["name"])
    if bob_logs_log_blob is None:
        return  # This has already been processed?

    unprocessed_lines = 0
    simple_lines = 0
    download_lines = 0

    with ExitStack() as stack:
        input_file_obj = stack.enter_context(NamedTemporaryFile())
        bob_logs_log_blob.download_to_file(input_file_obj)
        input_file_obj.flush()

        input_file = stack.enter_context(gzip.open(input_file_obj.name, "rb"))
        unprocessed_file = stack.enter_context(NamedTemporaryFile())
        simple_results_file = stack.enter_context(NamedTemporaryFile())
        download_results_file = stack.enter_context(NamedTemporaryFile())

        for line in input_file:
            try:
                res = parse(line.decode())
                if res is not None:
                    if res.__class__.__name__ == Simple.__name__:
                        simple_results_file.write(
                            json.dumps(_cattr.unstructure(res)).encode() + b"\n"
                        )
                        simple_lines += 1
                    elif res.__class__.__name__ == Download.__name__:
                        download_results_file.write(
                            json.dumps(_cattr.unstructure(res)).encode() + b"\n"
                        )
                        download_lines += 1
                    else:
                        unprocessed_file.write(line)
                        unprocessed_lines += 1
                else:
                    unprocessed_file.write(line)
                    unprocessed_lines += 1
            except Exception:
                unprocessed_file.write(line)
                unprocessed_lines += 1

        total = unprocessed_lines + simple_lines + download_lines
        print(
            f"Processed gs://{data['bucket']}/{data['name']}: {total} lines, {simple_lines} simple_requests, {download_lines} file_downloads, {unprocessed_lines} unprocessed"
        )

        dataset_ref = bigquery_client.dataset(DATASET)

        job_config = bigquery.LoadJobConfig()
        job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
        job_config.ignore_unknown_values = True

        if download_lines > 0:
            load_job = bigquery_client.load_table_from_file(
                download_results_file,
                dataset_ref.table(DOWNLOAD_TABLE),
                job_id_prefix="linehaul_file_downloads",
                location="US",
                job_config=job_config,
                rewind=True,
            )
            load_job.result()
            print(f"Loaded {load_job.output_rows} rows into {DATASET}:{DOWNLOAD_TABLE}")

        if simple_lines > 0:
            load_job = bigquery_client.load_table_from_file(
                simple_results_file,
                dataset_ref.table(SIMPLE_TABLE),
                job_id_prefix="linehaul_simple_requests",
                location="US",
                job_config=job_config,
                rewind=True,
            )
            load_job.result()
            print(f"Loaded {load_job.output_rows} rows into {DATASET}:{SIMPLE_TABLE}")

        bucket = storage_client.bucket(os.environ.get("RESULT_BUCKET"))
        if unprocessed_lines > 0:
            blob = bucket.blob(f"unprocessed/{default_partition}/{identifier}.txt")
            try:
                blob.upload_from_file(unprocessed_file, rewind=True)
            except Exception:
                # Be opportunistic about unprocessed files...
                pass

        bob_logs_log_blob.delete()
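A minimal sketch of invoking the function locally with a hand-built payload mimicking a google.storage.object.finalize event; the bucket and object names are hypothetical:

# Hypothetical GCS event payload; real invocations come from the trigger.
event = {
    "bucket": "example-fastly-logs",
    "name": "fastly-2021-01-01T00:00:00-abcd.log.gz",
}
process_fastly_log(event, None)  # `context` is accepted but unused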
Example #3
import inspect

import pytest

# `parse` comes from the project's parser module; the (event_data, expected)
# pairs are supplied by pytest parametrization.
def test_download_parsing(event_data, expected):
    if inspect.isclass(expected) and issubclass(expected, Exception):
        with pytest.raises(expected):
            parse(event_data)
    else:
        assert parse(event_data) == expected
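The inspect.isclass/issubclass guard lets a single parametrized test cover both successful parses and expected failures. A generic, self-contained illustration of the pattern, using int() as a stand-in for parse():

import inspect
import pytest

@pytest.mark.parametrize("raw, expected", [
    ("42", 42),            # success case: compare the parsed value
    ("nope", ValueError),  # failure case: name the exception class
])
def test_parsing_pattern(raw, expected):
    if inspect.isclass(expected) and issubclass(expected, Exception):
        with pytest.raises(expected):
            int(raw)
    else:
        assert int(raw) == expected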
Example #4
import gzip
import json
import os
from contextlib import ExitStack
from tempfile import NamedTemporaryFile

import arrow
from google.api_core import exceptions
from google.cloud import storage

# `parse`, `Simple`, `Download`, `_cattr`, and RESULT_BUCKET are module-level
# names from the surrounding project.
def process_fastly_log(data, context):
    storage_client = storage.Client()
    # str.rstrip() strips a set of characters, not a suffix; use removesuffix().
    file_name = os.path.basename(data["name"]).removesuffix(".log.gz")

    print(f"Beginning processing for gs://{data['bucket']}/{data['name']}")

    bob_logs_log_blob = storage_client.bucket(data["bucket"]).get_blob(
        data["name"])
    if bob_logs_log_blob is None:
        return  # This has already been processed?

    unprocessed_lines = 0
    simple_lines = 0
    download_lines = 0

    with ExitStack() as stack:
        input_file_obj = stack.enter_context(NamedTemporaryFile())
        bob_logs_log_blob.download_to_file(input_file_obj)
        input_file_obj.flush()

        input_file = stack.enter_context(gzip.open(input_file_obj.name, "rb"))
        unprocessed_file = stack.enter_context(NamedTemporaryFile())
        simple_results_file = stack.enter_context(NamedTemporaryFile())
        download_results_file = stack.enter_context(NamedTemporaryFile())

        min_timestamp = arrow.utcnow()
        for line in input_file:
            try:
                res = parse(line.decode())
                if res is not None:
                    # parse() can return None, so only read .timestamp here.
                    min_timestamp = min(min_timestamp, res.timestamp)
                    if res.__class__.__name__ == Simple.__name__:
                        simple_results_file.write(
                            json.dumps(_cattr.unstructure(res)).encode() +
                            b"\n")
                        simple_lines += 1
                    elif res.__class__.__name__ == Download.__name__:
                        download_results_file.write(
                            json.dumps(_cattr.unstructure(res)).encode() +
                            b"\n")
                        download_lines += 1
                    else:
                        unprocessed_file.write(line)
                        unprocessed_lines += 1
                else:
                    unprocessed_file.write(line)
                    unprocessed_lines += 1
            except Exception:
                unprocessed_file.write(line)
                unprocessed_lines += 1

        total = unprocessed_lines + simple_lines + download_lines
        print(
            f"Processed gs://{data['bucket']}/{data['name']}: {total} lines, {simple_lines} simple_requests, {download_lines} file_downloads, {unprocessed_lines} unprocessed"
        )

        bucket = storage_client.bucket(RESULT_BUCKET)
        partition = min_timestamp.strftime("%Y%m%d")

        if simple_lines > 0:
            blob = bucket.blob(
                f"processed/{partition}/simple-{file_name}.json")
            blob.upload_from_file(simple_results_file, rewind=True)
        if download_lines > 0:
            blob = bucket.blob(
                f"processed/{partition}/downloads-{file_name}.json")
            blob.upload_from_file(download_results_file, rewind=True)

        if unprocessed_lines > 0:
            blob = bucket.blob(f"unprocessed/{partition}/{file_name}.txt")
            try:
                blob.upload_from_file(unprocessed_file, rewind=True)
            except Exception:
                # Be opportunistic about unprocessed files...
                pass

        # Remove the log file we processed
        try:
            bob_logs_log_blob.delete()
        except exceptions.NotFound:
            # Sometimes we try to delete twice
            pass
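Unlike Example #2, this variant stages the NDJSON results in RESULT_BUCKET under processed/ and unprocessed/ prefixes instead of loading them straight into BigQuery. A sketch of listing what a run produced; the partition date is hypothetical:

# List the objects written under a given partition (hypothetical date).
storage_client = storage.Client()
for blob in storage_client.list_blobs(RESULT_BUCKET, prefix="processed/20210101/"):
    print(blob.name)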