예제 #1
0
def test_pd_df_load():
    """Load a pandas DataFrame into BigQuery, read it back, and check the no-parquet error.

    The test has three phases:

    1. ``bq_pipeline`` runs create -> return_df -> load -> query -> delete against a dataset
       obtained from ``get_dataset()``; the queried frame must equal ``test_df``.
    2. With ``pyarrow`` and ``fastparquet`` made unimportable, the same pipeline must raise
       ``DagsterExecutionStepExecutionError`` whose user exception names the missing
       dependency.
    3. A one-solid ``cleanup`` pipeline deletes the dataset, and the test asserts that the
       dataset no longer exists.
    """
    dataset = get_dataset()
    table = '%s.%s' % (dataset, 'df')

    test_df = pd.DataFrame({'num1': [1, 3], 'num2': [2, 4]})

    create_solid = bq_create_dataset.alias('create_solid')
    load_solid = import_df_to_bq.alias('load_solid')
    query_solid = bq_solid_for_queries(['SELECT num1, num2 FROM %s' % table]).alias('query_solid')
    delete_solid = bq_delete_dataset.alias('delete_solid')

    # The Nothing-typed 'success' input carries no value; it is only there so that this
    # solid can be wired downstream of create_solid in the pipeline below.
    @solid(
        input_defs=[InputDefinition('success', Nothing)], output_defs=[OutputDefinition(DataFrame)]
    )
    def return_df(_context):  # pylint: disable=unused-argument
        return test_df

    # Shared by both runs of bq_pipeline. 'exists_ok' lets the second run call
    # create_solid again regardless of whether the dataset is still present.
    config = {
        'solids': {
            'create_solid': {'config': {'dataset': dataset, 'exists_ok': True}},
            'load_solid': {'config': {'destination': table}},
            'delete_solid': {'config': {'dataset': dataset, 'delete_contents': True}},
        }
    }

    @pipeline(mode_defs=bq_modes())
    def bq_pipeline():
        delete_solid(query_solid(load_solid(return_df(create_solid()))))

    result = execute_pipeline(bq_pipeline, config)
    assert result.success

    values = result.result_for_solid('query_solid').output_value()
    assert values[0].to_dict() == test_df.to_dict()

    expected_error = (
        'loading data to BigQuery from pandas DataFrames requires either pyarrow or fastparquet'
        ' to be installed'
    )

    # BQ loads should throw an exception if pyarrow and fastparquet aren't available.
    # A None entry in sys.modules makes any `import` of that module fail.
    with mock.patch.dict(sys.modules, {'pyarrow': None, 'fastparquet': None}):
        with pytest.raises(DagsterExecutionStepExecutionError) as exc_info:
            # The call is expected to raise, so there is no result to bind.
            execute_pipeline(bq_pipeline, config)
        assert expected_error in str(exc_info.value.user_exception)

        # The failed run above is not expected to reach delete_solid, so remove the
        # dataset explicitly with a dedicated pipeline.
        cleanup_config = {
            'solids': {'delete_solid': {'config': {'dataset': dataset, 'delete_contents': True}}}
        }

        @pipeline(mode_defs=bq_modes())
        def cleanup():
            delete_solid()

        assert execute_pipeline(cleanup, cleanup_config).success

    assert not dataset_exists(dataset)
예제 #2
0
def test_pd_df_load():
    """Load a pandas DataFrame into BigQuery, read it back, and check the no-parquet error.

    The test has three phases:

    1. ``bq_pipeline`` runs create -> return_df -> load -> query -> delete against a dataset
       obtained from ``get_dataset()``; the queried frame must equal ``test_df``.
    2. With ``pyarrow`` and ``fastparquet`` made unimportable, the same pipeline must raise
       ``DagsterExecutionStepExecutionError`` whose user exception names the missing
       dependency.
    3. A one-solid ``cleanup`` pipeline deletes the dataset, and the test asserts that the
       dataset no longer exists.
    """
    dataset = get_dataset()
    table = "%s.%s" % (dataset, "df")

    test_df = pd.DataFrame({"num1": [1, 3], "num2": [2, 4]})

    create_solid = bq_create_dataset.alias("create_solid")
    load_solid = import_df_to_bq.alias("load_solid")
    query_solid = bq_solid_for_queries(["SELECT num1, num2 FROM %s" % table]).alias("query_solid")
    delete_solid = bq_delete_dataset.alias("delete_solid")

    # The Nothing-typed "success" input carries no value; it is only there so that this
    # solid can be wired downstream of create_solid in the pipeline below.
    @solid(
        input_defs=[InputDefinition("success", Nothing)], output_defs=[OutputDefinition(DataFrame)]
    )
    def return_df(_context):  # pylint: disable=unused-argument
        return test_df

    # Shared by both runs of bq_pipeline. "exists_ok" lets the second run call
    # create_solid again regardless of whether the dataset is still present.
    config = {
        "solids": {
            "create_solid": {"config": {"dataset": dataset, "exists_ok": True}},
            "load_solid": {"config": {"destination": table}},
            "delete_solid": {"config": {"dataset": dataset, "delete_contents": True}},
        }
    }

    @pipeline(mode_defs=bq_modes())
    def bq_pipeline():
        delete_solid(query_solid(load_solid(return_df(create_solid()))))

    result = execute_pipeline(bq_pipeline, config)
    assert result.success

    values = result.result_for_solid("query_solid").output_value()
    assert values[0].to_dict() == test_df.to_dict()

    expected_error = (
        "loading data to BigQuery from pandas DataFrames requires either pyarrow or fastparquet"
        " to be installed"
    )

    # BQ loads should throw an exception if pyarrow and fastparquet aren't available.
    # A None entry in sys.modules makes any `import` of that module fail.
    with mock.patch.dict(sys.modules, {"pyarrow": None, "fastparquet": None}):
        with pytest.raises(DagsterExecutionStepExecutionError) as exc_info:
            # The call is expected to raise, so there is no result to bind.
            execute_pipeline(bq_pipeline, config)
        assert expected_error in str(exc_info.value.user_exception)

        # The failed run above is not expected to reach delete_solid, so remove the
        # dataset explicitly with a dedicated pipeline.
        cleanup_config = {
            "solids": {"delete_solid": {"config": {"dataset": dataset, "delete_contents": True}}}
        }

        @pipeline(mode_defs=bq_modes())
        def cleanup():
            delete_solid()

        assert execute_pipeline(cleanup, cleanup_config).success

    assert not dataset_exists(dataset)
예제 #3
0
def test_pd_df_load():
    """Load a pandas DataFrame into BigQuery, read it back, and check the no-parquet error.

    The test has three phases:

    1. ``bq_circle_of_life`` runs create -> return_df -> load -> query -> delete against a
       dataset obtained from ``get_dataset()``; the queried frame must equal ``test_df``.
    2. With ``pyarrow`` and ``fastparquet`` made unimportable, the same job must raise
       ``DagsterExecutionStepExecutionError`` whose user exception names the missing
       dependency.
    3. A one-op ``cleanup_bq`` job deletes the dataset, and the test asserts that the
       dataset no longer exists.
    """
    dataset = get_dataset()
    table = "%s.%s" % (dataset, "df")

    test_df = pd.DataFrame({"num1": [1, 3], "num2": [2, 4]})

    create_op = bq_create_dataset.alias("create_op")
    load_op = import_df_to_bq.alias("load_op")
    query_op = bq_op_for_queries(["SELECT num1, num2 FROM %s" % table]).alias("query_op")
    delete_op = bq_delete_dataset.alias("delete_op")

    # The Nothing-typed "success" input carries no value; it is only there so that this
    # op can be wired downstream of create_op in the job below.
    @op(input_defs=[InputDefinition("success", Nothing)], output_defs=[OutputDefinition(DataFrame)])
    def return_df(_context):  # pylint: disable=unused-argument
        return test_df

    @job(resource_defs={"bigquery": bigquery_resource})
    def bq_circle_of_life():
        delete_op(query_op(load_op(return_df(create_op()))))

    # Both runs of bq_circle_of_life use the same configuration, so build it once.
    # "exists_ok" lets the second run call create_op again regardless of whether the
    # dataset is still present.
    run_config = {
        "ops": {
            "create_op": {"config": {"dataset": dataset, "exists_ok": True}},
            "load_op": {"config": {"destination": table}},
            "delete_op": {"config": {"dataset": dataset, "delete_contents": True}},
        }
    }

    result = bq_circle_of_life.execute_in_process(run_config=run_config)
    assert result.success

    values = result.output_for_node("query_op")
    assert values[0].to_dict() == test_df.to_dict()

    expected_error = (
        "loading data to BigQuery from pandas DataFrames requires either pyarrow or fastparquet"
        " to be installed"
    )

    # BQ loads should throw an exception if pyarrow and fastparquet aren't available.
    # A None entry in sys.modules makes any `import` of that module fail.
    with mock.patch.dict(sys.modules, {"pyarrow": None, "fastparquet": None}):
        with pytest.raises(DagsterExecutionStepExecutionError) as exc_info:
            bq_circle_of_life.execute_in_process(run_config=run_config)
        assert expected_error in str(exc_info.value.user_exception)

        # The failed run above is not expected to reach delete_op, so remove the dataset
        # explicitly with a dedicated job.
        @job(resource_defs={"bigquery": bigquery_resource})
        def cleanup_bq():
            delete_op()

        result = cleanup_bq.execute_in_process(
            run_config={
                "ops": {"delete_op": {"config": {"dataset": dataset, "delete_contents": True}}}
            }
        )
        assert result.success

    assert not dataset_exists(dataset)