import pytest


@pytest.fixture(scope="session")
def run_dataflow_job(utils: Utils, bucket_name: str, build_image: str) -> str:
    # Run the Beam pipeline in Dataflow, making sure GPUs are used.
    # build_image is requested only so the container image exists before launch.
    yield from utils.cloud_build_submit(
        config="run.yaml",
        substitutions={
            "_JOB_NAME": utils.hyphen_name(NAME),
            "_IMAGE": f"{NAME}:{utils.uuid}",
            "_TEMP_LOCATION": f"gs://{bucket_name}/temp",
            "_REGION": utils.region,
        },
        source="--no-source",
    )
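The config="run.yaml" argument points at a Cloud Build config that launches the
already-built container (source="--no-source" means no source upload is needed).
A minimal sketch of what such a run.yaml could look like; the step image and
pipeline flags are assumptions, not the sample's actual file:

# run.yaml (hypothetical sketch)
steps:
  - name: gcr.io/$PROJECT_ID/${_IMAGE}  # assumed: the pipeline's own container
    args:
      - --runner=DataflowRunner
      - --job_name=${_JOB_NAME}
      - --temp_location=${_TEMP_LOCATION}
      - --region=${_REGION}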
Example #2
from google.cloud import storage


def test_tensorflow_landsat(
    utils: Utils, bucket_name: str, run_dataflow_job: str
) -> None:
    # Wait until the job finishes.
    timeout = 30 * 60  # 30 minutes
    status = utils.dataflow_jobs_wait(
        job_name=utils.hyphen_name(NAME), timeout_sec=timeout
    )
    assert status == "JOB_STATE_DONE", f"Dataflow job ended in state {status}"

    # Check that output files were created and are not empty.
    storage_client = storage.Client()
    print(f">> Checking for output files in: gs://{bucket_name}/outputs/")
    output_files = list(storage_client.list_blobs(bucket_name, prefix="outputs/"))
    assert len(output_files) > 0, f"No files found in gs://{bucket_name}/outputs/"
    for output_file in output_files:
        assert output_file.size > 0, f"Output file is empty: {output_file.name}"
import json


@pytest.fixture(scope="session")
def pubsub_publisher(utils: Utils, pubsub_topic: str) -> bool:
    # Publish test messages that alternate between positive and negative reviews.
    yield from utils.pubsub_publisher(
        pubsub_topic,
        new_msg=lambda i: json.dumps(
            {
                "url": "https://beam.apache.org/",
                "review": "positive" if i % 2 == 0 else "negative",
            }
        ),
    )
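The new_msg callable can be checked standalone; this sketch simply replays the
same lambda outside the fixture to show the messages it produces:

import json

def new_msg(i: int) -> str:
    return json.dumps(
        {
            "url": "https://beam.apache.org/",
            "review": "positive" if i % 2 == 0 else "negative",
        }
    )

for i in range(3):
    print(new_msg(i))
# {"url": "https://beam.apache.org/", "review": "positive"}
# {"url": "https://beam.apache.org/", "review": "negative"}
# {"url": "https://beam.apache.org/", "review": "positive"}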
import time


def test_flex_template_streaming_beam(
    utils: Utils,
    bucket_name: str,
    pubsub_publisher: str,
    pubsub_subscription: str,
    flex_template_path: str,
    bigquery_dataset: str,
) -> None:
    bigquery_table = "output_table"
    job_id = utils.dataflow_flex_template_run(
        job_name=NAME,
        template_path=flex_template_path,
        bucket_name=bucket_name,
        parameters={
            "input_subscription": pubsub_subscription,
            "output_table": f"{bigquery_dataset}.{bigquery_table}",
        },
    )

    # Since this is a streaming job, it never finishes on its own.
    # First, let's wait until the job is running.
    utils.dataflow_jobs_wait(job_id, until_status="JOB_STATE_RUNNING")

    # Then wait 3 minutes for data to arrive and be processed, and cancel the job.
    time.sleep(3 * 60)
    utils.dataflow_jobs_cancel(job_id, drain=True)

    # Check for the output data in BigQuery.
    query = f"SELECT * FROM {bigquery_dataset.replace(':', '.')}.{bigquery_table}"
    rows = list(utils.bigquery_query(query))
    assert len(rows) > 0
    for row in rows:
        assert "score" in row
Example #5
@pytest.fixture(scope="session")
def dataflow_job_id(
    utils: Utils,
    bucket_name: str,
    flex_template_path: str,
    bigquery_dataset: str,
    pubsub_subscription: str,
) -> str:
    # Launch the Flex Template job and yield its job ID; the job is cancelled
    # during fixture teardown.
    yield from utils.dataflow_flex_template_run(
        job_name=NAME,
        template_path=flex_template_path,
        bucket_name=bucket_name,
        parameters={
            "input_subscription": pubsub_subscription,
            "output_table": f"{bigquery_dataset}.{BIGQUERY_TABLE}",
        },
    )
@pytest.fixture(scope="session")
def flex_template_path(utils: Utils, bucket_name: str, flex_template_image: str) -> str:
    # Build the Flex Template spec file into the staging bucket.
    yield from utils.dataflow_flex_template_build(bucket_name, flex_template_image)


@pytest.fixture(scope="session")
def flex_template_image(utils: Utils) -> str:
    yield from utils.cloud_build_submit(NAME)


@pytest.fixture(scope="session")
def bigquery_dataset(utils: Utils) -> str:
    yield from utils.bigquery_dataset(NAME)


@pytest.fixture(scope="session")
def pubsub_subscription(utils: Utils, pubsub_topic: str) -> str:
    yield from utils.pubsub_subscription(pubsub_topic, NAME)
Example #10
@pytest.fixture(scope="session")
def pubsub_topic(utils: Utils) -> str:
    yield from utils.pubsub_topic(NAME)
Example #11
@pytest.fixture(scope="session")
def bucket_name(utils: Utils) -> str:
    yield from utils.storage_bucket(NAME)
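All of these resource fixtures follow the same pytest yield-fixture pattern:
create the resource, yield its name to the tests, and tear it down after the
session ends. A self-contained sketch of the pattern with an illustrative
bucket fixture (names here are made up, not the samples' Utils API):

import uuid

import pytest
from google.cloud import storage


@pytest.fixture(scope="session")
def demo_bucket() -> str:
    # Setup: create a uniquely named bucket.
    client = storage.Client()
    bucket = client.create_bucket(f"demo-bucket-{uuid.uuid4().hex[:8]}")
    yield bucket.name
    # Teardown: runs once after the last test that used the fixture.
    bucket.delete(force=True)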
Example #12
def test_flex_template_streaming_beam(utils: Utils, dataflow_job_id: str) -> None:
    # Wait until the Dataflow job starts running successfully.
    # The job is cancelled as part of the fixture teardown to avoid leaking resources.
    utils.dataflow_jobs_wait(dataflow_job_id, target_states={"JOB_STATE_RUNNING"})
Example #13
def test_tensorflow_minimal(utils: Utils, run_dataflow_job: str) -> None:
    # Wait until the job finishes.
    status = utils.dataflow_jobs_wait(job_name=utils.hyphen_name(NAME))
    assert status == "JOB_STATE_DONE", f"Dataflow job ended in state {status}"
Example #14
@pytest.fixture(scope="session")
def build_image(utils: Utils) -> str:
    # Build the sample's container image with Cloud Build.
    yield from utils.cloud_build_submit(
        image_name=NAME,
        config="build.yaml",
        substitutions={"_IMAGE": f"{NAME}:{utils.uuid}"},
    )
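The build.yaml consumed here would typically be a standard Docker build step
that honors the _IMAGE substitution. A minimal sketch, assumed rather than
taken from the sample:

# build.yaml (hypothetical sketch)
steps:
  - name: gcr.io/cloud-builders/docker
    args: [build, -t, gcr.io/$PROJECT_ID/${_IMAGE}, .]
images:
  - gcr.io/$PROJECT_ID/${_IMAGE}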