# Common imports and context setup assumed by all of these snippets
# (Great Expectations 0.13-0.15-era API; on older releases use
# ge.data_context.DataContext() instead of get_context()):
import great_expectations as ge
from great_expectations.core.batch import BatchRequest
from ruamel import yaml

context = ge.get_context()

# Example 1

# Here is a BatchRequest naming a data_asset
batch_request = BatchRequest(
    datasource_name="my_azure_datasource",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="<YOUR_DATA_ASSET_NAME>",
    batch_spec_passthrough={
        "reader_method": "csv",
        "reader_options": {
            "header": True
        }
    },
)
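# (batch_spec_passthrough hands reader settings straight through to the
# execution engine; here it tells the reader to parse the matched files as
# CSV with a header row.)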

# Please note this override is only to provide good UX for docs and tests.
# In normal usage you'd set your data asset name directly in the BatchRequest above.
batch_request.data_asset_name = (
    "data/taxi_yellow_tripdata_samples/yellow_tripdata_sample_2019-01")

context.create_expectation_suite(expectation_suite_name="test_suite",
                                 overwrite_existing=True)
validator = context.get_validator(batch_request=batch_request,
                                  expectation_suite_name="test_suite")
print(validator.head())
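
# A minimal sketch of what you would typically do next with the validator;
# the column name "passenger_count" is an assumption about the taxi data,
# not part of the original snippet.
validator.expect_column_values_to_not_be_null(column="passenger_count")
validator.save_expectation_suite(discard_failed_expectations=False)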

# NOTE: The following code is only for testing and can be ignored by users.
assert isinstance(validator, ge.validator.validator.Validator)
assert [ds["name"]
        for ds in context.list_datasources()] == ["my_azure_datasource"]
assert set(
    context.get_available_data_asset_names()["my_azure_datasource"]
    ["default_inferred_data_connector_name"]) == {
        "data/taxi_yellow_tripdata_samples/yellow_tripdata_sample_2019-01",

# Example 2

available_data_asset_names = context.datasources[
    "taxi_datasource"].get_available_data_asset_names(
        data_connector_names="default_inferred_data_connector_name"
    )["default_inferred_data_connector_name"]
assert len(available_data_asset_names) == 36

# Here is a BatchRequest naming an inferred data_asset.
batch_request = BatchRequest(
    datasource_name="taxi_datasource",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="<YOUR_DATA_ASSET_NAME>",
)

# Please note this override is only to provide good UX for docs and tests.
# In normal usage you'd set your data asset name directly in the BatchRequest above.
batch_request.data_asset_name = "yellow_tripdata_sample_2019-01.csv"

context.create_expectation_suite(
    expectation_suite_name="test_suite", overwrite_existing=True
)
validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_suite"
)
print(validator.head(n_rows=10))

batch_list = context.get_batch_list(batch_request=batch_request)
assert len(batch_list) == 1
assert batch_list[0].data.dataframe.shape[0] == 10000
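
# A hedged sketch, not in the original: a BatchRequest can be narrowed with a
# data_connector_query; {"index": -1} selects the most recent matching batch.
last_batch_request = BatchRequest(
    datasource_name="taxi_datasource",
    data_connector_name="default_inferred_data_connector_name",
    data_asset_name="yellow_tripdata_sample_2019-01.csv",
    data_connector_query={"index": -1},
)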

# Example 3

from typing import List

from great_expectations.core.batch import Batch

# (Assumes datasource_config, a dict defining my_azure_datasource with a
# configured Azure data connector and a "taxi_data" asset, was built above.)
datasource_config["data_connectors"]["configured_data_connector_name"][
    "name_starts_with"] = "data/taxi_yellow_tripdata_samples/"

context.test_yaml_config(yaml.dump(datasource_config))
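# (test_yaml_config instantiates the datasource from the config and prints a
# self-check report, including a sample of the data assets it can see, before
# the datasource is persisted with add_datasource below.)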

context.add_datasource(**datasource_config)

# Here is a BatchRequest naming a configured data_asset representing an
# un-partitioned (flat) filename structure
batch_request = BatchRequest(
    datasource_name="my_azure_datasource",
    data_connector_name="configured_data_connector_name",
    data_asset_name="<YOUR_DATA_ASSET_NAME>",
)

# Please note this override is only to provide good UX for docs and tests.
# In normal usage you'd set your data asset name directly in the BatchRequest above.
batch_request.data_asset_name = "taxi_data"

context.create_expectation_suite(expectation_suite_name="test_suite",
                                 overwrite_existing=True)
validator = context.get_validator(batch_request=batch_request,
                                  expectation_suite_name="test_suite")
print(validator.head())

# NOTE: The following code is only for testing and can be ignored by users.
assert isinstance(validator, ge.validator.validator.Validator)
assert [ds["name"]
        for ds in context.list_datasources()] == ["my_azure_datasource"]
assert set(context.get_available_data_asset_names()["my_azure_datasource"]
           ["configured_data_connector_name"]) == {"taxi_data"}

batch_list: List[Batch] = context.get_batch_list(batch_request=batch_request)
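
# A minimal validation sketch, assuming the "test_suite" created above; the
# SimpleCheckpoint name "my_checkpoint" is hypothetical, not from the original.
from great_expectations.checkpoint import SimpleCheckpoint

checkpoint = SimpleCheckpoint(
    name="my_checkpoint",
    data_context=context,
    validations=[{
        "batch_request": batch_request,
        "expectation_suite_name": "test_suite",
    }],
)
checkpoint_result = checkpoint.run()
assert checkpoint_result["success"]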

# Example 4

context.add_datasource(**my_spark_datasource_config)

batch_request = BatchRequest(
    datasource_name="insert_your_datasource_name_here",
    data_connector_name="insert_your_data_connector_name_here",
    data_asset_name="yellow_tripdata",
    batch_spec_passthrough={
        "reader_method": "csv",
        "reader_options": {
            "header": True,
        },
    },
)

# For the purposes of this script, the data_asset_name includes "sample"
batch_request.data_asset_name = "yellow_tripdata_sample"

# NOTE: The following code is only for testing and can be ignored by users.
assert len(context.list_datasources()) == 1
assert context.list_datasources(
)[0]["name"] == "insert_your_datasource_name_here"
assert list(context.list_datasources()[0]["data_connectors"].keys()) == [
    "insert_your_data_connector_name_here"
]

sorted_available_data_asset_names_from_datasource = sorted(
    context.datasources["insert_your_datasource_name_here"].
    get_available_data_asset_names(
        data_connector_names="insert_your_data_connector_name_here")
    ["insert_your_data_connector_name_here"])

# Example 5

import sqlalchemy as sa  # needed for the row-count check further below

# (Assumes datasource_yaml, a YAML string defining taxi_datasource with a
# "whole_table" SQL data connector, was built above.)
context.add_datasource(**yaml.load(datasource_yaml))
available_data_asset_names = context.datasources[
    "taxi_datasource"].get_available_data_asset_names(
        data_connector_names="whole_table")["whole_table"]
assert len(available_data_asset_names) == 2
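# (The "whole_table" connector is an inferred SQL data connector that exposes
# each database table as a single, un-partitioned data asset.)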

# Here is a BatchRequest referring to an un-partitioned inferred data_asset.
batch_request = BatchRequest(
    datasource_name="taxi_datasource",
    data_connector_name="whole_table",
    data_asset_name="<YOUR_DATA_ASSET_NAME>",
)

# Please note this override is only to provide good UX for docs and tests.
# In normal usage you'd set your data asset name directly in the BatchRequest above.
batch_request.data_asset_name = "yellow_tripdata_sample_2019_01"

context.create_expectation_suite(expectation_suite_name="test_suite",
                                 overwrite_existing=True)
validator = context.get_validator(batch_request=batch_request,
                                  expectation_suite_name="test_suite")
print(validator.head(n_rows=10))

batch_list = context.get_batch_list(batch_request=batch_request)
assert len(batch_list) == 1
batch_data = batch_list[0].data
num_rows = batch_data.execution_engine.engine.execute(
    sa.select([sa.func.count()]).select_from(batch_data.selectable)).one()[0]
assert num_rows == 10000
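
# A hedged variant for SQLAlchemy 1.4+, where select() takes columns
# positionally and engine.execute() is deprecated in favor of an explicit
# connection; this is an assumption, not part of the original snippet.
with batch_data.execution_engine.engine.connect() as conn:
    num_rows_alt = conn.execute(
        sa.select(sa.func.count()).select_from(batch_data.selectable)
    ).scalar()
assert num_rows_alt == 10000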

# Here is a BatchRequest naming an inferred data_asset partitioned by day.