def test_step_resource_update_new_name():
    source = Package("data/package/datapackage.json")
    target = transform(
        source,
        steps=[
            steps.resource_update(name="data", new_name="new-name"),
        ],
    )
    assert target.get_resource("new-name").path == "data.csv"
def test_step_resource_update():
    source = Package("data/package/datapackage.json")
    target = transform(
        source,
        steps=[
            steps.resource_update(name="data", title="New title"),
        ],
    )
    assert target.get_resource("data").title == "New title"
def test_validate_package_composite_primary_key_unique_issue_215():
    source = {
        "resources": [{
            "name": "name",
            "data": [["id1", "id2"], ["a", "1"], ["a", "2"]],
            "schema": {
                "fields": [{
                    "name": "id1"
                }, {
                    "name": "id2"
                }],
                "primaryKey": ["id1", "id2"],
            },
        }],
    }
    package = Package(source)
    report = package.validate()
    assert report.valid
def test_resource_integrity_onerror_header_raise():
    data = [["name"], [1], [2], [3]]
    schema = {"fields": [{"name": "bad", "type": "integer"}]}
    package = Package({"resources": [{"data": data, "schema": schema}]}, onerror="raise")
    resource = package.resources[0]
    assert package.onerror == "raise"
    assert resource.onerror == "raise"
    with pytest.raises(FrictionlessException):
        resource.read_rows()
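# For contrast, a minimal sketch of the default behaviour (assuming
# frictionless's default of onerror="ignore"): the same schema mismatch does
# not raise, and errors are recorded on the rows instead of being thrown.
def test_resource_integrity_onerror_header_ignore_sketch():
    data = [["name"], [1], [2], [3]]
    schema = {"fields": [{"name": "bad", "type": "integer"}]}
    package = Package({"resources": [{"data": data, "schema": schema}]})
    resource = package.resources[0]
    assert package.onerror == "ignore"
    resource.read_rows()  # no exception; errors are collected, not raised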
def test_package_resources_inline():
    data = [["id", "name"], ["1", "english"], ["2", "中国人"]]
    package = Package({"resources": [{"name": "name", "data": data}]})
    resource = package.get_resource("name")
    assert len(package.resources) == 1
    assert resource.path is None
    assert resource.data == data
    assert resource.fullpath is None
    assert resource.read_rows() == [
        {
            "id": 1,
            "name": "english"
        },
        {
            "id": 2,
            "name": "中国人"
        },
    ]
def test_package_from_zip_remote():
    package = Package(BASE_URL % "data/package.zip")
    assert package.name == "testing"
    assert len(package.resources) == 2
    assert package.get_resource("data2").read_rows() == [
        {
            "parent": "A3001",
            "comment": "comment1"
        },
        {
            "parent": "A3001",
            "comment": "comment2"
        },
        {
            "parent": "A5032",
            "comment": "comment3"
        },
    ]
def test_resource_integrity_onerror_row_warn():
    data = [["name"], [1], [2], [3]]
    schema = {"fields": [{"name": "name", "type": "string"}]}
    package = Package(resources=[Resource(data=data, schema=schema)], onerror="warn")
    resource = package.resources[0]
    assert package.onerror == "warn"
    assert resource.onerror == "warn"
    with pytest.warns(UserWarning):
        resource.read_rows()
def test_package_from_zip_no_descriptor(tmpdir):
    descriptor = str(tmpdir.join("package.zip"))
    with zipfile.ZipFile(descriptor, "w") as zip:
        zip.writestr("data.txt", "foobar")
    with pytest.raises(FrictionlessException) as excinfo:
        Package(descriptor)
    error = excinfo.value.error
    assert error.code == "package-error"
    assert error.note.count("datapackage.json")
def test_step_resource_remove():
    source = Package("data/package/datapackage.json")
    target = transform(
        source,
        steps=[
            steps.resource_remove(name="data2"),
        ],
    )
    assert target.resource_names == ["data"]
def test_storage_integrity(options):

    # Export/Import
    source = Package("data/storage/integrity.json")
    storage = source.to_bigquery(force=True, **options)
    target = Package.from_bigquery(**options)

    # Assert metadata (main)
    assert target.get_resource("integrity_main").schema == {
        "fields": [
            # added required
            {"name": "id", "type": "integer"},
            {"name": "parent", "type": "integer"},
            {"name": "description", "type": "string"},
        ],
        # primary key removal
        # foreign keys removal
    }

    # Assert metadata (link)
    assert target.get_resource("integrity_link").schema == {
        "fields": [
            {"name": "main_id", "type": "integer"},
            {"name": "some_id", "type": "integer"},  # constraint removal
            {"name": "description", "type": "string"},  # constraint removal
        ],
        # primary key removal
        # foreign keys removal
    }

    # Assert data (main)
    assert target.get_resource("integrity_main").read_rows() == [
        {"id": 1, "parent": None, "description": "english"},
        {"id": 2, "parent": 1, "description": "中国人"},
    ]

    # Assert data (link)
    assert target.get_resource("integrity_link").read_rows() == [
        {"main_id": 1, "some_id": 1, "description": "note1"},
        {"main_id": 2, "some_id": 2, "description": "note2"},
    ]

    # Cleanup storage
    storage.delete_package(target.resource_names)
def test_sql_storage_postgresql_integrity_different_order_issue_957(postgresql_url):
    dialect = SqlDialect(prefix="prefix_")
    source = Package("data/storage/integrity.json")
    source.add_resource(source.remove_resource("integrity_main"))
    storage = source.to_sql(postgresql_url, dialect=dialect)
    target = Package.from_sql(postgresql_url, dialect=dialect)
    assert len(target.resources) == 2
    storage.delete_package(target.resource_names)
def test_analyze_package_detailed_numeric_values_descriptive_summary():
    package = Package("data/package-1067.json")
    analysis = package.analyze(detailed=True)
    path = "data/analysis-data.csv" if IS_UNIX else "data\\analysis-data.csv"
    assert list(analysis[path]["fieldStats"]["parent_age"].keys()) == [
        "type",
        "mean",
        "median",
        "mode",
        "variance",
        "quantiles",
        "stdev",
        "max",
        "min",
        "bounds",
        "uniqueValues",
        "outliers",
        "missingValues",
    ]
def test_describe_package_with_dialect_path_1126():
    package = Package.describe("data/country-2.csv", dialect="data/dialect.json")
    assert package.get_resource("country-2")["schema"] == {
        "fields": [
            {"type": "integer", "name": "id"},
            {"type": "integer", "name": "neighbor_id"},
            {"type": "string", "name": "name"},
            {"type": "integer", "name": "population"},
        ]
    }
def test_validate_package_schema_multiple_foreign_key_resource_violation_non_existent(
):
    descriptor = deepcopy(DESCRIPTOR_FK)
    # remove London
    del descriptor["resources"][0]["data"][1]
    descriptor["resources"].append(MULTI_FK_RESSOURCE)
    package = Package(descriptor)
    report = package.validate()
    assert report.flatten([
        "rowPosition", "fieldPosition", "code", "cells", "note"
    ]) == [
        [
            2,
            None,
            "foreign-key-error",
            ["1", "2", "1.5"],
            'for "from, to": values "1, 2" not found in the lookup table "cities" as "id, next_id"',
        ],
    ]
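# For orientation, a hedged sketch (not the real fixture) of the shape that a
# constant like MULTI_FK_RESSOURCE plausibly takes, inferred from the
# assertion above: a resource whose composite ["from", "to"] key references
# ["id", "next_id"] in the "cities" resource. Every name and value here is
# hypothetical; the actual fixtures are defined in the test module.
MULTI_FK_RESOURCE_SKETCH = {
    "name": "notes",  # hypothetical resource name
    "data": [["from", "to", "comment"], ["1", "2", "1.5"]],
    "schema": {
        "fields": [
            {"name": "from"},
            {"name": "to"},
            {"name": "comment"},
        ],
        "foreignKeys": [
            {
                "fields": ["from", "to"],
                "reference": {"resource": "cities", "fields": ["id", "next_id"]},
            }
        ],
    },
}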
def test_read_package_bad_package_id():
    dataset_id = "bad-dataset-id"
    base_url = "https://demo.ckan.org"

    _mock_json_call(
        f"{base_url}/api/3/action/package_show?id={dataset_id}",
        "data/ckan_mock_responses/ckan_error.json",
        status=404,
    )

    with pytest.raises(exceptions.FrictionlessException) as excinfo:
        Package.from_ckan(
            base_url=base_url,
            dataset_id=dataset_id,
            api_key="env:CKAN_API_KEY",
        )
    error = excinfo.value.error
    assert error.code == "storage-error"
    assert error.note.count("Not found")
def construct_dictionary_knowledge_graph(datapackage_fp, temp_dir_loc, resource_name='ids'):
    package = Package(datapackage_fp, profile='tabular-data-package')
    ids_resource = package.get_resource(resource_name)

    field_hierarchies = get_field_name_tags(ids_resource.schema)
    root_field = field_hierarchies_to_root(field_hierarchies)
    root_field_type = [
        field['type']
        for field in ids_resource.schema['fields']
        if field['name'] == root_field
    ][0]

    df_ids = assign_idx_fields(ids_resource.to_pandas(), root_field, root_field_type)
    site_data = initialise_site_data_with_ids(df_ids, field_hierarchies)

    fk_external_datapackage_refs = extract_external_foreignkey_datapackage_refs(ids_resource, primary_key_field=root_field)
    fk_external_datapackage_refs = add_resource_locs_to_external_datapackage_refs(fk_external_datapackage_refs)

    download_attribute_data_to_temp_dir(fk_external_datapackage_refs, temp_dir_loc=temp_dir_loc)
    site_data = extract_attrs_from_resource_dfs(site_data, fk_external_datapackage_refs, temp_dir_loc)
    site_data = json_nan_to_none(site_data)

    return site_data
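# A hypothetical usage sketch for construct_dictionary_knowledge_graph above;
# the datapackage path is a placeholder, and the helpers the function relies
# on (get_field_name_tags, assign_idx_fields, etc.) are assumed to be defined
# in the surrounding module.
def build_site_data_example():
    import tempfile

    datapackage_fp = 'data/ids/datapackage.json'  # placeholder path
    with tempfile.TemporaryDirectory() as temp_dir_loc:
        # attribute data is downloaded into the temp dir, then discarded
        return construct_dictionary_knowledge_graph(datapackage_fp, temp_dir_loc)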
def test_package_validation_duplicate_resource_names_issue_942():
    package = Package(
        resources=[
            Resource(name="name", path="data/table.csv"),
            Resource(name="name", path="data/table.csv"),
        ]
    )
    errors = package.metadata_errors
    assert len(errors) == 1
    assert errors[0].note == "names of the resources are not unique"
def test_package_from_path_remote():
    package = Package(BASEURL % "data/package.json")
    assert package.basepath == BASEURL % "data"
    assert package == {
        "name": "name",
        "resources": [{
            "name": "name",
            "path": "table.csv"
        }],
    }
def package(c):
    Path("build/").mkdir(parents=True, exist_ok=True)
    metadata = {
        "title": "Podnebnik",
        "description": "TODO",
    }

    for datapackage_path in discover_datapackages():
        pkg = Package(datapackage_path)
        dbname = os.path.basename(os.path.dirname(datapackage_path))
        # convert Package to sqlite db
        pkg.to_sql(f"sqlite:///build/{dbname}.db")
        # extract metadata from Package
        pkg_meta = extract_metadata(pkg, dbname)
        # merge metadata
        metadata = always_merger.merge(metadata, pkg_meta)

    with open("build/metadata.json", "w") as f:
        json.dump(metadata, f, indent=4, sort_keys=True)
def run():
    base_dir = pathlib.Path(__file__).absolute().parent.parent
    data_file = os.path.join(base_dir, "data", "datapackage.json")
    package = Package(data_file)
    resource = package.get_resource("content-pages")
    for row in resource.read_rows():
        try:
            resp = requests.get(row["url"])
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")

            # fixup relative css hrefs to be absolute
            for link in soup.find_all("link", attrs={"rel": "stylesheet"}):
                href = urljoin(row["url"], link["href"])
                link["href"] = href

            # remove js and forms
            for remove in ["script", "form"]:
                for item in soup.find_all(remove):
                    item.decompose()

            # remove meta tags apart from encoding
            for m in soup.find_all("meta"):
                try:
                    if (m is not None and m.get("charset") is None
                            and m.get("content") != "text/html; charset=utf-8"):
                        m.decompose()
                except Exception as e:
                    print(e)

            # the gov.uk page that was tested had a specific aria attr with a
            # dynamic id attached which changes and can be ignored for diffs
            labelled_ids = []
            for tag in soup():
                if tag.attrs.get("aria-labelledby"):
                    labelled_ids.append(tag.attrs.get("aria-labelledby"))
                    del tag.attrs["aria-labelledby"]
            for id in labelled_ids:
                for element in soup.find_all(id=id):
                    del element["id"]

            for tag in soup.find_all(class_="attachment embedded"):
                del tag["id"]

            html = soup.prettify()
            out = os.path.join(base_dir, "collected", f"{row['id']}.html")
            updated = content_updated(out, html)
            with open(out, "w") as html_file:
                html_file.write(html)
            if updated:
                package.update(updated=datetime.now().isoformat())
                package.to_json(data_file)

        except requests.HTTPError as e:
            print(f"Error getting {row['url']}")
            print(e)
def test_pipeline(tmpdir):

    # Write
    pipeline = DataflowsPipeline({
        "type": "dataflows",
        "steps": [
            {"type": "load", "spec": {"loadSource": "data/table.csv"}},
            {"type": "setType", "spec": {"name": "id", "type": "string"}},
            {"type": "dumpToPath", "spec": {"outPath": tmpdir}},
        ],
    })
    pipeline.run()

    # Read
    package = Package(os.path.join(tmpdir, "datapackage.json"))
    assert package.get_resource("table").read_rows() == [
        {
            "id": "1",
            "name": "english"
        },
        {
            "id": "2",
            "name": "中国人"
        },
    ]
def test_ensure_dir(tmpdir):

    # Write
    transform(
        {
            "type": "package",
            "steps": [
                {"type": "load", "spec": {"loadSource": "data/table.csv"}},
                {"type": "set_type", "spec": {"name": "id", "type": "string"}},
                {"type": "dump_to_path", "spec": {"outPath": tmpdir}},
            ],
        }
    )

    # Read
    package = Package(os.path.join(tmpdir, "datapackage.json"))
    assert package.get_resource("table").read_rows() == [
        {"id": "1", "name": "english"},
        {"id": "2", "name": "中国人"},
    ]
def test_validate_package_composite_primary_key_not_unique_issue_215():
    descriptor = {
        "resources": [{
            "name": "name",
            "data": [["id1", "id2"], ["a", "1"], ["a", "1"]],
            "schema": {
                "fields": [{
                    "name": "id1"
                }, {
                    "name": "id2"
                }],
                "primaryKey": ["id1", "id2"],
            },
        }],
    }
    package = Package(descriptor)
    report = package.validate(skip_errors=["duplicate-row"])
    assert report.flatten(["rowPosition", "fieldPosition", "code"]) == [
        [3, None, "primary-key-error"],
    ]
def test_analyze_package_detailed_non_numeric_summary():
    package = Package("data/package-1067.json")
    analysis = package.analyze(detailed=True)
    path_1 = "data/capital-valid.csv" if IS_UNIX else "data\\capital-valid.csv"
    path_2 = "data/analysis-data.csv" if IS_UNIX else "data\\analysis-data.csv"
    assert analysis[path_1]["fieldStats"]["name"]["type"] == "categorical"
    assert analysis[path_1]["fieldStats"]["name"]["values"] == {
        "Berlin",
        "London",
        "Madrid",
        "Paris",
        "Rome",
    }
    assert analysis[path_2]["fieldStats"]["school_accreditation"]["type"] == "categorical"
    assert analysis[path_2]["fieldStats"]["school_accreditation"]["values"] == {"A", "B"}
def extract_translations(c):
    Path("i18n/").mkdir(parents=True, exist_ok=True)
    catalog = Catalog()
    for datapackage_path in discover_datapackages():
        pkg = Package(datapackage_path)
        dbname = os.path.basename(os.path.dirname(datapackage_path))
        # extract catalog for translation
        collect_messages(catalog, pkg)

    with open(f"i18n/messages.pot", 'wb') as f:
        write_po(f, catalog)
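# A hedged sketch of what the collect_messages helper used above might look
# like, assuming it harvests translatable package and resource titles and
# descriptions into the Babel catalog; the real implementation lives in the
# surrounding project.
def collect_messages_sketch(catalog, pkg):
    for key in ("title", "description"):
        value = pkg.get(key)
        if value:
            catalog.add(value, locations=[(pkg.name or "package", 1)])
    for resource in pkg.resources:
        for key in ("title", "description"):
            value = resource.get(key)
            if value:
                catalog.add(value, locations=[(resource.name or "resource", 1)])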
def test_analyze_package_invalid_data():
    descriptor = {
        "name": "capitals and schools",
        "resources": [
            {
                "name": "capital-invalid",
                "path": "data/invalid.csv"
            },
        ],
    }
    package = Package(descriptor)
    analysis = package.analyze()
    assert round(analysis["data/invalid.csv"]["averageRecordSizeInBytes"]) == (
        12 if IS_UNIX else 14
    )
    assert analysis["data/invalid.csv"]["fields"] == 4
    assert analysis["data/invalid.csv"]["fieldStats"] == {}
    assert analysis["data/invalid.csv"]["rows"] == 4
    assert analysis["data/invalid.csv"]["rowsWithNullValues"] == 3
    assert analysis["data/invalid.csv"]["notNullRows"] == 1
    assert analysis["data/invalid.csv"]["variableTypes"] == {}
def test_package_from_path():
    package = Package("data/package.json")
    assert package.name == "name"
    assert package.basepath == "data"
    assert package.profile == "data-package"
    assert package.resources == [
        {
            "name": "name",
            "path": "table.csv"
        },
    ]
def test_pandas_parser_write_bug_1100():
    datapackage = Package("data/issue-1100.package.json")
    target = datapackage.resources[0].to_pandas()
    assert target.to_dict("records") == [
        {
            "timestamp": pd.Timestamp(2022, 5, 25, 10, 39, 15)
        },
        {
            "timestamp": pd.Timestamp(2022, 5, 25, 10, 39, 15)
        },
    ]
def test_package_expand_resource_dialect():
    dialect = {"delimiter": ";"}
    package = Package({"resources": [{"path": "data/table.csv", "dialect": dialect}]})
    package.expand()
    assert package == {
        "resources": [
            {
                "path": "data/table.csv",
                "dialect": {
                    "delimiter": ";",
                    "lineTerminator": "\r\n",
                    "quoteChar": '"',
                    "doubleQuote": True,
                    "skipInitialSpace": False,
                },
                "profile": "data-resource",
            }
        ],
        "profile": "data-package",
    }
def test_analyze_package_detailed_variable_types():
    package = Package("data/package-1067.json")
    analysis = package.analyze(detailed=True)
    path_1 = "data/capital-valid.csv" if IS_UNIX else "data\\capital-valid.csv"
    path_2 = "data/capital-invalid.csv" if IS_UNIX else "data\\capital-invalid.csv"
    path_3 = "data/analysis-data.csv" if IS_UNIX else "data\\analysis-data.csv"
    assert len(analysis) == 3
    assert analysis[path_1]["variableTypes"] == {
        "number": 1,
        "string": 1,
    }
    assert analysis[path_2]["variableTypes"] == {
        "integer": 1,
        "string": 1,
    }
    assert analysis[path_3]["variableTypes"] == {
        "boolean": 2,
        "integer": 2,
        "number": 2,
        "string": 5,
    }