示例#1
0
def test_databricks_wait_for_run(mock_submit_run, databricks_run_config):
    """Exercise ``wait_for_run_to_complete`` against a stubbed ``get_run_state``.

    Each stub reports a pending state, then a running state, then a terminal
    state. A successful terminal state must return normally; a failed one must
    raise ``DatabricksError``.
    """
    mock_submit_run.return_value = 1

    context = create_test_pipeline_execution_context()
    runner = DatabricksJobRunner(HOST, TOKEN, poll_interval_sec=0.01)
    task = databricks_run_config.pop("task")
    databricks_run_id = runner.submit_run(databricks_run_config, task)

    def stub_ending_in(terminal_state):
        # Every stub owns a fresh iterator, so each polling sequence starts
        # again from the pending state.
        transient = iter(
            (DatabricksRunLifeCycleState.Pending, DatabricksRunLifeCycleState.Running)
        )

        def fake_get_run_state(_run_id):
            life_cycle_state = next(transient, None)
            if life_cycle_state is None:
                return terminal_state
            return DatabricksRunState(life_cycle_state, None, None)

        return fake_get_run_state

    succeeded = DatabricksRunState(
        DatabricksRunLifeCycleState.Terminated,
        DatabricksRunResultState.Success,
        "Finished",
    )
    with mock.patch.object(runner.client, "get_run_state", new=stub_ending_in(succeeded)):
        runner.wait_for_run_to_complete(context.log, databricks_run_id)

    failed = DatabricksRunState(
        DatabricksRunLifeCycleState.Terminated,
        DatabricksRunResultState.Failed,
        "Failed",
    )
    with pytest.raises(DatabricksError) as exc_info:
        with mock.patch.object(runner.client, "get_run_state", new=stub_ending_in(failed)):
            runner.wait_for_run_to_complete(context.log, databricks_run_id)
    assert "Run 1 failed with result state" in str(exc_info.value)
示例#2
0
def test_construct_event_record():
    """One info message logged via the context yields one LogMessageRecord."""
    records = []

    def _collect_record(logger_message):
        # Convert each structured logger message before storing it.
        records.append(construct_event_record(logger_message))

    structured_logger = define_structured_logger(
        'some_name', _collect_record, level=DEBUG)
    context = create_test_pipeline_execution_context(
        loggers=[structured_logger],
        tags={'pipeline': 'some_pipeline'},
    )
    context.log.info('random message')

    assert len(records) == 1
    only_record = records[0]
    assert isinstance(only_record, LogMessageRecord)
示例#3
0
def test_emr_log_location_for_cluster(emr_cluster_config, mock_s3_bucket):
    """Check the reported log location, and the error when LogUri is absent."""
    context = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)

    logged_cluster_id = runner.run_job_flow(context.log, emr_cluster_config)
    location = runner.log_location_for_cluster(logged_cluster_id)
    assert location == (mock_s3_bucket.name, "elasticmapreduce/")

    # Launch a second cluster from a copy of the config with LogUri removed;
    # looking up its log location is expected to raise.
    config_without_log_uri = copy.deepcopy(emr_cluster_config)
    config_without_log_uri.pop("LogUri")
    unlogged_cluster_id = runner.run_job_flow(context.log, config_without_log_uri)
    with pytest.raises(EmrError) as exc_info:
        runner.log_location_for_cluster(unlogged_cluster_id)

    expected_message = "Log URI not specified, cannot retrieve step execution logs"
    assert expected_message in str(exc_info.value)
示例#4
0
def test_emr_log_location_for_cluster(emr_cluster_config):
    """Check the reported log location, and the error when LogUri is absent."""
    context = create_test_pipeline_execution_context()
    runner = EmrJobRunner(region=REGION)

    logged_cluster_id = runner.run_job_flow(context, emr_cluster_config)
    location = runner.log_location_for_cluster(logged_cluster_id)
    assert location == ('emr-cluster-logs', 'elasticmapreduce/')

    # Launch a second cluster from a copy of the config with LogUri removed;
    # looking up its log location is expected to raise.
    config_without_log_uri = copy.deepcopy(emr_cluster_config)
    config_without_log_uri.pop('LogUri')
    unlogged_cluster_id = runner.run_job_flow(context, config_without_log_uri)
    with pytest.raises(EmrError) as exc_info:
        runner.log_location_for_cluster(unlogged_cluster_id)

    expected_message = 'Log URI not specified, cannot retrieve step execution logs'
    assert expected_message in str(exc_info.value)
def test_structured_logger_in_context():
    """A debug call on ``context.log`` reaches the structured logger callback.

    The single captured message must carry the logger name, the DEBUG level,
    the extra keyword (``foo``) and the original message text in its meta.
    """
    captured = []

    def _capture(logger_message):
        captured.append(logger_message)

    structured_logger = define_structured_logger(
        'some_name', _capture, level=DEBUG)
    context = create_test_pipeline_execution_context(
        loggers=[structured_logger])
    context.log.debug('from_context', foo=2)

    assert len(captured) == 1
    received = captured[0]
    assert received.name == 'some_name'
    assert received.level == DEBUG
    meta = received.meta
    assert meta['foo'] == 2
    assert meta['orig_message'] == 'from_context'
示例#6
0
def test_pyspark_emr(mock_is_emr_step_complete, mock_read_events):
    """Run the do-nothing pipeline in "prod" mode against a cluster made here.

    The events of a local run of the same pipeline are fed back through
    ``mock_read_events``; the test then checks the prod-mode run succeeds and
    that ``mock_is_emr_step_complete`` was called.
    """
    local_result = execute_pipeline(
        reconstructable(define_do_nothing_pipe), mode="local")
    mock_read_events.return_value = local_result.events_by_step_key[
        "do_nothing_solid.compute"]

    cluster_spec = {
        "Instances": {
            "InstanceCount": 1,
            "KeepJobFlowAliveWhenNoSteps": True,
            "MasterInstanceType": "c3.medium",
            "Placement": {"AvailabilityZone": "us-west-1a"},
            "SlaveInstanceType": "c3.xlarge",
        },
        "JobFlowRole": "EMR_EC2_DefaultRole",
        "LogUri": "s3://mybucket/log",
        "Name": "cluster",
        "ServiceRole": "EMR_DefaultRole",
        "VisibleToAllUsers": True,
    }

    # Doing cluster setup outside of a solid here, because run_job_flow is not yet plumbed through
    # to the pyspark EMR resource.
    job_runner = EmrJobRunner(region="us-west-1")
    context = create_test_pipeline_execution_context()
    cluster_id = job_runner.run_job_flow(context.log, cluster_spec)

    launcher_config = deep_merge_dicts(
        BASE_EMR_PYSPARK_STEP_LAUNCHER_CONFIG, {"cluster_id": cluster_id})
    result = execute_pipeline(
        pipeline=reconstructable(define_do_nothing_pipe),
        mode="prod",
        run_config={
            "resources": {"pyspark_step_launcher": {"config": launcher_config}},
        },
    )
    assert result.success
    assert mock_is_emr_step_complete.called
示例#7
0
def test_is_emr_step_complete(emr_cluster_config):
    """Drive ``is_emr_step_complete`` through each step state via a mocked ``describe_step``.

    PENDING and RUNNING must report "not complete", COMPLETED must report
    "complete", and FAILED must raise ``EmrError``.
    """
    context = create_test_pipeline_execution_context()
    emr = EmrJobRunner(region=REGION, check_cluster_every=1)

    cluster_id = emr.run_job_flow(context.log, emr_cluster_config)

    step_name = "test_step"
    step_cmd = ["ls", "/"]
    step_ids = emr.add_job_flow_steps(
        context.log, cluster_id, [emr.construct_step_dict_for_command(step_name, step_cmd)]
    )

    def get_step_dict(step_id, step_state):
        # Build a describe_step-style response reporting ``step_state``.
        return {
            "Step": {
                "Id": step_id,
                "Name": step_name,
                "Config": {"Jar": "command-runner.jar", "Properties": {}, "Args": step_cmd},
                "ActionOnFailure": "CONTINUE",
                "Status": {
                    "State": step_state,
                    "StateChangeReason": {"Message": "everything is hosed"},
                    "Timeline": {"StartDateTime": _boto3_now()},
                },
            },
        }

    emr_step_id = step_ids[0]
    # One canned response per is_emr_step_complete call below, in order.
    describe_step_returns = [
        get_step_dict(emr_step_id, "PENDING"),
        get_step_dict(emr_step_id, "RUNNING"),
        get_step_dict(emr_step_id, "COMPLETED"),
        get_step_dict(emr_step_id, "FAILED"),
    ]
    with mock.patch.object(EmrJobRunner, "describe_step", side_effect=describe_step_returns):
        assert not emr.is_emr_step_complete(context.log, cluster_id, emr_step_id)
        assert not emr.is_emr_step_complete(context.log, cluster_id, emr_step_id)
        assert emr.is_emr_step_complete(context.log, cluster_id, emr_step_id)

        with pytest.raises(EmrError) as exc_info:
            emr.is_emr_step_complete(context.log, cluster_id, emr_step_id)

    # The message check lives outside the ``pytest.raises`` block: statements
    # placed after the raising call inside that block never execute.
    # NOTE(review): this assertion was unreachable before, so the expected text
    # has never been exercised - confirm it matches the EmrError message.
    assert "step failed" in str(exc_info.value)
示例#8
0
def test_pyspark_emr(mock_wait, mock_get_step_events):
    """Run the do-nothing pipeline in 'prod' mode against a cluster made here.

    Checks that the run succeeds and that both injected mocks were called.
    ``mock_wait`` and ``mock_get_step_events`` are presumably supplied by patch
    decorators outside this view.
    """
    run_job_flow_args = dict(
        Instances={
            'InstanceCount': 1,
            'KeepJobFlowAliveWhenNoSteps': True,
            'MasterInstanceType': 'c3.medium',
            'Placement': {
                'AvailabilityZone': 'us-west-1a'
            },
            'SlaveInstanceType': 'c3.xlarge',
        },
        JobFlowRole='EMR_EC2_DefaultRole',
        LogUri='s3://mybucket/log',
        Name='cluster',
        ServiceRole='EMR_DefaultRole',
        VisibleToAllUsers=True,
    )

    # Doing cluster setup outside of a solid here, because run_job_flow is not yet plumbed through
    # to the pyspark EMR resource.
    job_runner = EmrJobRunner(region='us-west-1')
    context = create_test_pipeline_execution_context()
    cluster_id = job_runner.run_job_flow(context.log, run_job_flow_args)

    pipeline_def = ExecutionTargetHandle.for_pipeline_fn(
        define_do_nothing_pipe).build_pipeline_definition()
    result = execute_pipeline(
        pipeline=pipeline_def,
        mode='prod',
        environment_dict={
            'resources': {
                'pyspark_step_launcher': {
                    'config':
                    deep_merge_dicts(BASE_EMR_PYSPARK_STEP_LAUNCHER_CONFIG,
                                     {'cluster_id': cluster_id}),
                }
            },
        },
    )
    assert result.success
    # ``called_once`` is not part of the Mock API: older Pythons auto-create a
    # truthy child mock for it (so the assertion can never fail) and newer ones
    # raise AttributeError. ``called`` is the real "was invoked" flag.
    assert mock_wait.called
    assert mock_get_step_events.called
示例#9
0
def test_pyspark_emr(mock_wait):
    """Run ``example_pipe`` in 'prod' mode against a cluster made here.

    Checks that the run succeeds and that the injected ``mock_wait`` was
    called. ``mock_wait`` is presumably supplied by a patch decorator outside
    this view.
    """
    run_job_flow_args = dict(
        Instances={
            'InstanceCount': 1,
            'KeepJobFlowAliveWhenNoSteps': True,
            'MasterInstanceType': 'c3.medium',
            'Placement': {'AvailabilityZone': 'us-west-1a'},
            'SlaveInstanceType': 'c3.xlarge',
        },
        JobFlowRole='EMR_EC2_DefaultRole',
        LogUri='s3://mybucket/log',
        Name='cluster',
        ServiceRole='EMR_DefaultRole',
        VisibleToAllUsers=True,
    )

    # Doing cluster setup outside of a solid here, because run_job_flow is not yet plumbed through
    # to the pyspark EMR resource.
    job_runner = EmrJobRunner(region='us-west-1')
    context = create_test_pipeline_execution_context()
    cluster_id = job_runner.run_job_flow(context, run_job_flow_args)

    result = execute_pipeline_with_mode(
        pipeline=example_pipe,
        mode='prod',
        environment_dict={
            'solids': {'blah': {'config': {'foo': 'a string', 'bar': 123}}},
            'resources': {
                'pyspark': {
                    'config': {
                        'pipeline_file': __file__,
                        'pipeline_fn_name': 'example_pipe',
                        'cluster_id': cluster_id,
                        'staging_bucket': 'dagster-scratch-80542c2',
                        'region_name': 'us-west-1',
                    }
                }
            },
        },
    )
    assert result.success
    # ``called_once`` is not part of the Mock API: older Pythons auto-create a
    # truthy child mock for it (so the assertion can never fail) and newer ones
    # raise AttributeError. ``called`` is the real "was invoked" flag.
    assert mock_wait.called
示例#10
0
def test_wait_for_log():
    """``wait_for_log`` returns a late-arriving gzipped log and raises if none appears."""
    bucket_name = 'log_bucket'
    s3 = boto3.resource('s3', region_name=REGION)
    s3.create_bucket(Bucket=bucket_name)  # pylint: disable=no-member

    def upload_log_later():
        # Upload after a short delay, presumably so the waiter begins polling
        # before the object exists.
        time.sleep(0.5)
        buffer = io.BytesIO()
        with gzip.GzipFile(fileobj=buffer, mode='w') as gz_file:
            gz_file.write('foo bar'.encode())

        log_object = s3.Object(bucket_name, 'some_log_file')  # pylint: disable=no-member
        log_object.put(Body=buffer.getvalue())

    uploader = threading.Thread(target=upload_log_later, args=())
    uploader.daemon = True
    uploader.start()

    context = create_test_pipeline_execution_context()
    emr = EmrJobRunner(region=REGION)

    log_text = emr.wait_for_log(
        context.log,
        log_bucket=bucket_name,
        log_key='some_log_file',
        waiter_delay=1,
        waiter_max_attempts=2,
    )
    assert log_text == 'foo bar'

    # A key that is never uploaded must surface as an EmrError.
    with pytest.raises(EmrError) as exc_info:
        emr.wait_for_log(
            context.log,
            log_bucket=bucket_name,
            log_key='does_not_exist',
            waiter_delay=1,
            waiter_max_attempts=1,
        )
    error_text = str(exc_info.value)
    assert 'EMR log file did not appear on S3 after waiting' in error_text
示例#11
0
def test_emr_create_cluster(emr_cluster_config):
    """``run_job_flow`` returns a cluster id that begins with 'j-'."""
    job_runner = EmrJobRunner(region=REGION)
    new_cluster_id = job_runner.run_job_flow(
        create_test_pipeline_execution_context(), emr_cluster_config)
    assert new_cluster_id.startswith('j-')
示例#12
0
def test_noarg_ctor():
    """A context built with no arguments has a run_id that parses as a UUID."""
    run_id = create_test_pipeline_execution_context().run_id
    # uuid.UUID raises ValueError on a malformed string, failing the test.
    assert uuid.UUID(run_id)
示例#13
0
def test_emr_wait_for_step(emr_cluster_config):
    """Exercise ``wait_for_steps_to_complete`` against a stubbed ``describe_step``.

    The stub reports PENDING, then RUNNING, then a configurable final state.
    A COMPLETED final state must return normally; a FAILED one must raise
    ``EmrError`` whose message contains 'step failed'.
    """
    context = create_test_pipeline_execution_context()
    emr = EmrJobRunner(region=REGION, check_cluster_every=1)

    cluster_id = emr.run_job_flow(context, emr_cluster_config)

    step_name = 'test_step'
    step_cmd = ['ls', '/']
    step_ids = emr.add_job_flow_steps(
        context, cluster_id,
        [emr.construct_step_dict_for_command(step_name, step_cmd)])

    def get_step_dict(step_id, step_state):
        # Build a describe_step-style response reporting ``step_state``.
        return {
            'Step': {
                'Id': step_id,
                'Name': step_name,
                'Config': {
                    'Jar': 'command-runner.jar',
                    'Properties': {},
                    'Args': step_cmd
                },
                'ActionOnFailure': 'CONTINUE',
                'Status': {
                    'State': step_state,
                    'StateChangeReason': {
                        'Message': 'everything is hosed'
                    },
                    'Timeline': {
                        'StartDateTime': _boto3_now()
                    },
                },
            },
        }

    # Mutable holder so the stub can count calls and the test can swap the
    # final state between the two phases.
    calls = {'num_calls': 0, 'final_state': 'COMPLETED'}

    def new_describe_step(_, cluster_id, step_id):
        # Patched onto the class as a plain function, so the first positional
        # argument is the runner instance; it is unused here.
        calls['num_calls'] += 1

        if calls['num_calls'] == 1:
            return get_step_dict(step_id, 'PENDING')
        elif calls['num_calls'] == 2:
            return get_step_dict(step_id, 'RUNNING')
        else:
            return get_step_dict(step_id, calls['final_state'])

    with mock.patch.object(EmrJobRunner,
                           'describe_step',
                           new=new_describe_step):
        emr.wait_for_steps_to_complete(context, cluster_id, step_ids)

    # Second phase: restart the call sequence and end in failure.
    calls['num_calls'] = 0
    calls['final_state'] = 'FAILED'
    with pytest.raises(EmrError) as exc_info:
        with mock.patch.object(EmrJobRunner,
                               'describe_step',
                               new=new_describe_step):
            emr.wait_for_steps_to_complete(context, cluster_id, step_ids)
    assert 'step failed' in str(exc_info.value)
示例#14
0
def create_sql_alchemy_context_from_engine(engine, *args, **kwargs):
    """Build a test pipeline execution context whose resources wrap ``engine``.

    ``engine`` is wrapped in a ``SqlAlchemyResource`` inside
    ``DefaultSqlAlchemyResources`` and passed as ``resources``; any extra
    positional and keyword arguments are forwarded unchanged to
    ``create_test_pipeline_execution_context``. Returns the result of
    ``check_supports_sql_alchemy_resource`` applied to the new context.
    """
    sql_alchemy_resources = DefaultSqlAlchemyResources(
        SqlAlchemyResource(engine))
    context = create_test_pipeline_execution_context(
        *args, resources=sql_alchemy_resources, **kwargs)
    return check_supports_sql_alchemy_resource(context)