def test_step_resource_update_new_name():
    source = Package("data/package/datapackage.json")
    target = transform(
        source,
        steps=[
            steps.resource_update(name="data", new_name="new-name"),
        ],
    )
    assert target.get_resource("new-name").path == "data.csv"

def test_step_resource_update():
    source = Package("data/package/datapackage.json")
    target = transform(
        source,
        steps=[
            steps.resource_update(name="data", title="New title"),
        ],
    )
    assert target.get_resource("data").title == "New title"

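# A minimal sketch (not part of the suite above, and assuming resource_update
# accepts several metadata options in one step): renaming a resource and
# setting its title in a single transform pass. The "renamed" name is
# hypothetical.
def example_resource_update_combined():
    source = Package("data/package/datapackage.json")
    target = transform(
        source,
        steps=[
            steps.resource_update(name="data", new_name="renamed", title="New title"),
        ],
    )
    assert target.get_resource("renamed").title == "New title"
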
def test_validate_package_composite_primary_key_unique_issue_215():
    source = {
        "resources": [
            {
                "name": "name",
                "data": [["id1", "id2"], ["a", "1"], ["a", "2"]],
                "schema": {
                    "fields": [{"name": "id1"}, {"name": "id2"}],
                    "primaryKey": ["id1", "id2"],
                },
            }
        ],
    }
    package = Package(source)
    report = package.validate()
    assert report.valid

def test_resource_integrity_onerror_header_raise():
    data = [["name"], [1], [2], [3]]
    schema = {"fields": [{"name": "bad", "type": "integer"}]}
    package = Package({"resources": [{"data": data, "schema": schema}]}, onerror="raise")
    resource = package.resources[0]
    assert package.onerror == "raise"
    assert resource.onerror == "raise"
    with pytest.raises(FrictionlessException):
        resource.read_rows()

def test_package_resources_inline():
    data = [["id", "name"], ["1", "english"], ["2", "中国人"]]
    package = Package({"resources": [{"name": "name", "data": data}]})
    resource = package.get_resource("name")
    assert len(package.resources) == 1
    assert resource.path is None
    assert resource.data == data
    assert resource.fullpath is None
    assert resource.read_rows() == [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]

def test_package_from_zip_remote():
    package = Package(BASE_URL % "data/package.zip")
    assert package.name == "testing"
    assert len(package.resources) == 2
    assert package.get_resource("data2").read_rows() == [
        {"parent": "A3001", "comment": "comment1"},
        {"parent": "A3001", "comment": "comment2"},
        {"parent": "A5032", "comment": "comment3"},
    ]

def test_resource_integrity_onerror_row_warn():
    data = [["name"], [1], [2], [3]]
    schema = {"fields": [{"name": "name", "type": "string"}]}
    package = Package(resources=[Resource(data=data, schema=schema)], onerror="warn")
    resource = package.resources[0]
    assert package.onerror == "warn"
    assert resource.onerror == "warn"
    with pytest.warns(UserWarning):
        resource.read_rows()

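# For contrast with the two onerror tests above, a hedged sketch of the
# default mode (assumed to be "ignore"): the same bad cells neither raise
# nor warn, and read_rows() simply completes.
def example_onerror_default_ignore():
    data = [["name"], [1], [2], [3]]
    schema = {"fields": [{"name": "name", "type": "string"}]}
    package = Package(resources=[Resource(data=data, schema=schema)])
    assert package.onerror == "ignore"  # assumed default
    package.resources[0].read_rows()  # completes without raising or warning
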
def test_package_from_zip_no_descriptor(tmpdir):
    descriptor = str(tmpdir.join("package.zip"))
    with zipfile.ZipFile(descriptor, "w") as zip:
        zip.writestr("data.txt", "foobar")
    with pytest.raises(FrictionlessException) as excinfo:
        Package(descriptor)
    error = excinfo.value.error
    assert error.code == "package-error"
    assert error.note.count("datapackage.json")

def test_step_resource_remove():
    source = Package("data/package/datapackage.json")
    target = transform(
        source,
        steps=[
            steps.resource_remove(name="data2"),
        ],
    )
    assert target.resource_names == ["data"]

def test_storage_integrity(options):
    # Export/Import
    source = Package("data/storage/integrity.json")
    storage = source.to_bigquery(force=True, **options)
    target = Package.from_bigquery(**options)

    # Assert metadata (main)
    assert target.get_resource("integrity_main").schema == {
        "fields": [
            # added required
            {"name": "id", "type": "integer"},
            {"name": "parent", "type": "integer"},
            {"name": "description", "type": "string"},
        ],
        # primary key removal
        # foreign keys removal
    }

    # Assert metadata (link)
    assert target.get_resource("integrity_link").schema == {
        "fields": [
            {"name": "main_id", "type": "integer"},
            {"name": "some_id", "type": "integer"},  # constraint removal
            {"name": "description", "type": "string"},  # constraint removal
        ],
        # primary key removal
        # foreign keys removal
    }

    # Assert data (main)
    assert target.get_resource("integrity_main").read_rows() == [
        {"id": 1, "parent": None, "description": "english"},
        {"id": 2, "parent": 1, "description": "中国人"},
    ]

    # Assert data (link)
    assert target.get_resource("integrity_link").read_rows() == [
        {"main_id": 1, "some_id": 1, "description": "note1"},
        {"main_id": 2, "some_id": 2, "description": "note2"},
    ]

    # Cleanup storage
    storage.delete_package(target.resource_names)

def test_sql_storage_postgresql_integrity_different_order_issue_957(postgresql_url):
    dialect = SqlDialect(prefix="prefix_")
    source = Package("data/storage/integrity.json")
    source.add_resource(source.remove_resource("integrity_main"))
    storage = source.to_sql(postgresql_url, dialect=dialect)
    target = Package.from_sql(postgresql_url, dialect=dialect)
    assert len(target.resources) == 2
    storage.delete_package(target.resource_names)

def test_analyze_package_detailed_numeric_values_descriptive_summary():
    package = Package("data/package-1067.json")
    analysis = package.analyze(detailed=True)
    path = "data/analysis-data.csv" if IS_UNIX else "data\\analysis-data.csv"
    assert list(analysis[path]["fieldStats"]["parent_age"].keys()) == [
        "type",
        "mean",
        "median",
        "mode",
        "variance",
        "quantiles",
        "stdev",
        "max",
        "min",
        "bounds",
        "uniqueValues",
        "outliers",
        "missingValues",
    ]

def test_describe_package_with_dialect_path_1126():
    package = Package.describe("data/country-2.csv", dialect="data/dialect.json")
    assert package.get_resource("country-2")["schema"] == {
        "fields": [
            {"type": "integer", "name": "id"},
            {"type": "integer", "name": "neighbor_id"},
            {"type": "string", "name": "name"},
            {"type": "integer", "name": "population"},
        ]
    }

def test_validate_package_schema_multiple_foreign_key_resource_violation_non_existent():
    descriptor = deepcopy(DESCRIPTOR_FK)
    # remove London
    del descriptor["resources"][0]["data"][1]
    descriptor["resources"].append(MULTI_FK_RESSOURCE)
    package = Package(descriptor)
    report = package.validate()
    assert report.flatten(["rowPosition", "fieldPosition", "code", "cells", "note"]) == [
        [
            2,
            None,
            "foreign-key-error",
            ["1", "2", "1.5"],
            'for "from, to": values "1, 2" not found in the lookup table "cities" as "id, next_id"',
        ],
    ]

def test_read_package_bad_package_id():
    dataset_id = "bad-dataset-id"
    base_url = "https://demo.ckan.org"
    _mock_json_call(
        f"{base_url}/api/3/action/package_show?id={dataset_id}",
        "data/ckan_mock_responses/ckan_error.json",
        status=404,
    )
    with pytest.raises(exceptions.FrictionlessException) as excinfo:
        Package.from_ckan(
            base_url=base_url,
            dataset_id=dataset_id,
            api_key="env:CKAN_API_KEY",
        )
    error = excinfo.value.error
    assert error.code == "storage-error"
    assert error.note.count("Not found")

def construct_dictionary_knowledge_graph(datapackage_fp, temp_dir_loc, resource_name='ids'):
    # Load the package and locate the resource holding the ids
    package = Package(datapackage_fp, profile='tabular-data-package')
    ids_resource = package.get_resource(resource_name)

    # Derive the field hierarchy and its root field from the schema tags
    field_hierarchies = get_field_name_tags(ids_resource.schema)
    root_field = field_hierarchies_to_root(field_hierarchies)
    root_field_type = [
        field['type']
        for field in ids_resource.schema['fields']
        if field['name'] == root_field
    ][0]

    # Index the ids table on the root field and seed the site data
    df_ids = assign_idx_fields(ids_resource.to_pandas(), root_field, root_field_type)
    site_data = initialise_site_data_with_ids(df_ids, field_hierarchies)

    # Resolve external foreign-key datapackages and pull their attribute data
    fk_external_datapackage_refs = extract_external_foreignkey_datapackage_refs(
        ids_resource, primary_key_field=root_field
    )
    fk_external_datapackage_refs = add_resource_locs_to_external_datapackage_refs(
        fk_external_datapackage_refs
    )
    download_attribute_data_to_temp_dir(fk_external_datapackage_refs, temp_dir_loc=temp_dir_loc)
    site_data = extract_attrs_from_resource_dfs(site_data, fk_external_datapackage_refs, temp_dir_loc)

    # Normalise NaNs to None so the result serialises cleanly to JSON
    site_data = json_nan_to_none(site_data)
    return site_data

def test_package_validation_duplicate_resource_names_issue_942():
    package = Package(
        resources=[
            Resource(name="name", path="data/table.csv"),
            Resource(name="name", path="data/table.csv"),
        ]
    )
    errors = package.metadata_errors
    assert len(errors) == 1
    assert errors[0].note == "names of the resources are not unique"

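# A hedged usage sketch built on the test above: metadata_errors can serve
# as an up-front guard before a package with user-supplied descriptors is
# used; the print-based handling here is illustrative only.
def example_guard_on_metadata_errors():
    package = Package(
        resources=[
            Resource(name="name", path="data/table.csv"),
            Resource(name="name", path="data/table.csv"),
        ]
    )
    if package.metadata_errors:
        for error in package.metadata_errors:
            print(error.note)  # e.g. "names of the resources are not unique"
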
def test_package_from_path_remote():
    package = Package(BASEURL % "data/package.json")
    assert package.basepath == BASEURL % "data"
    assert package == {
        "name": "name",
        "resources": [{"name": "name", "path": "table.csv"}],
    }

def package(c):
    Path("build/").mkdir(parents=True, exist_ok=True)
    metadata = {
        "title": "Podnebnik",
        "description": "TODO",
    }
    for datapackage_path in discover_datapackages():
        pkg = Package(datapackage_path)
        dbname = os.path.basename(os.path.dirname(datapackage_path))
        # convert Package to sqlite db
        pkg.to_sql(f"sqlite:///build/{dbname}.db")
        # extract metadata from Package
        pkg_meta = extract_metadata(pkg, dbname)
        # merge metadata
        metadata = always_merger.merge(metadata, pkg_meta)
    with open('build/metadata.json', 'w') as f:
        json.dump(metadata, f, indent=4, sort_keys=True)

def run():
    base_dir = pathlib.Path(__file__).absolute().parent.parent
    data_file = os.path.join(base_dir, "data", "datapackage.json")
    package = Package(data_file)
    resource = package.get_resource("content-pages")
    for row in resource.read_rows():
        try:
            resp = requests.get(row["url"])
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "html.parser")

            # fixup relative css hrefs to be absolute
            for link in soup.find_all("link", attrs={"rel": "stylesheet"}):
                href = urljoin(row["url"], link["href"])
                link["href"] = href

            # remove js and forms
            for remove in ["script", "form"]:
                for item in soup.find_all(remove):
                    item.decompose()

            # remove meta tags apart from encoding
            for m in soup.find_all("meta"):
                try:
                    if (m is not None
                            and m.get("charset") is None
                            and m.get("content") != "text/html; charset=utf-8"):
                        m.decompose()
                except Exception as e:
                    print(e)

            # gov.uk page that was tested had a specific aria attr
            # with dynamic id attached which changes and can be ignored for diffs
            labelled_ids = []
            for tag in soup():
                if tag.attrs.get("aria-labelledby"):
                    labelled_ids.append(tag.attrs.get("aria-labelledby"))
                    del tag.attrs["aria-labelledby"]
            for id in labelled_ids:
                for element in soup.find_all(id=id):
                    del element["id"]
            for tag in soup.find_all(class_="attachment embedded"):
                del tag["id"]

            html = soup.prettify()
            out = os.path.join(base_dir, "collected", f"{row['id']}.html")
            updated = content_updated(out, html)
            with open(out, "w") as html_file:
                html_file.write(html)
            if updated:
                package.update(updated=datetime.now().isoformat())
                package.to_json(data_file)
        except requests.HTTPError as e:
            print(f"Error getting {row['url']}")
            print(e)

def test_pipeline(tmpdir):
    # Write
    pipeline = DataflowsPipeline(
        {
            "type": "dataflows",
            "steps": [
                {"type": "load", "spec": {"loadSource": "data/table.csv"}},
                {"type": "setType", "spec": {"name": "id", "type": "string"}},
                {"type": "dumpToPath", "spec": {"outPath": tmpdir}},
            ],
        }
    )
    pipeline.run()

    # Read
    package = Package(os.path.join(tmpdir, "datapackage.json"))
    assert package.get_resource("table").read_rows() == [
        {"id": "1", "name": "english"},
        {"id": "2", "name": "中国人"},
    ]

def test_ensure_dir(tmpdir):
    # Write
    transform(
        {
            "type": "package",
            "steps": [
                {"type": "load", "spec": {"loadSource": "data/table.csv"}},
                {"type": "set_type", "spec": {"name": "id", "type": "string"}},
                {"type": "dump_to_path", "spec": {"outPath": tmpdir}},
            ],
        }
    )

    # Read
    package = Package(os.path.join(tmpdir, "datapackage.json"))
    assert package.get_resource("table").read_rows() == [
        {"id": "1", "name": "english"},
        {"id": "2", "name": "中国人"},
    ]

def test_validate_package_composite_primary_key_not_unique_issue_215():
    descriptor = {
        "resources": [
            {
                "name": "name",
                "data": [["id1", "id2"], ["a", "1"], ["a", "1"]],
                "schema": {
                    "fields": [{"name": "id1"}, {"name": "id2"}],
                    "primaryKey": ["id1", "id2"],
                },
            }
        ],
    }
    package = Package(descriptor)
    report = package.validate(skip_errors=["duplicate-row"])
    assert report.flatten(["rowPosition", "fieldPosition", "code"]) == [
        [3, None, "primary-key-error"],
    ]

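# report.flatten(spec), used throughout these tests, reduces each error to a
# flat list of the requested keys. A hedged sketch on a minimal invalid
# table (the inline data and field name are hypothetical):
def example_report_flatten():
    resource = Resource(
        data=[["id"], ["abc"]],
        schema={"fields": [{"name": "id", "type": "integer"}]},
    )
    report = resource.validate()
    print(report.flatten(["rowPosition", "fieldPosition", "code"]))
    # expected along the lines of: [[2, 1, "type-error"]]
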
def test_analyze_package_detailed_non_numeric_summary():
    package = Package("data/package-1067.json")
    analysis = package.analyze(detailed=True)
    path_1 = "data/capital-valid.csv" if IS_UNIX else "data\\capital-valid.csv"
    path_2 = "data/analysis-data.csv" if IS_UNIX else "data\\analysis-data.csv"
    assert analysis[path_1]["fieldStats"]["name"]["type"] == "categorical"
    assert analysis[path_1]["fieldStats"]["name"]["values"] == {
        "Berlin",
        "London",
        "Madrid",
        "Paris",
        "Rome",
    }
    assert analysis[path_2]["fieldStats"]["school_accreditation"]["type"] == "categorical"
    assert analysis[path_2]["fieldStats"]["school_accreditation"]["values"] == {
        "A",
        "B",
    }

def extract_translations(c):
    Path("i18n/").mkdir(parents=True, exist_ok=True)
    catalog = Catalog()
    for datapackage_path in discover_datapackages():
        pkg = Package(datapackage_path)
        dbname = os.path.basename(os.path.dirname(datapackage_path))
        # extract catalog for translation
        collect_messages(catalog, pkg)
    with open("i18n/messages.pot", 'wb') as f:
        write_po(f, catalog)

def test_analyze_package_invalid_data():
    descriptor = {
        "name": "capitals and schools",
        "resources": [
            {"name": "capital-invalid", "path": "data/invalid.csv"},
        ],
    }
    package = Package(descriptor)
    analysis = package.analyze()
    assert round(analysis["data/invalid.csv"]["averageRecordSizeInBytes"]) == (
        12 if IS_UNIX else 14
    )
    assert analysis["data/invalid.csv"]["fields"] == 4
    assert analysis["data/invalid.csv"]["fieldStats"] == {}
    assert analysis["data/invalid.csv"]["rows"] == 4
    assert analysis["data/invalid.csv"]["rowsWithNullValues"] == 3
    assert analysis["data/invalid.csv"]["notNullRows"] == 1
    assert analysis["data/invalid.csv"]["variableTypes"] == {}

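# A minimal sketch of the analyze API exercised above (the package path is
# hypothetical): analyze() returns a mapping keyed by resource path, and
# detailed=True additionally fills per-field statistics under "fieldStats".
def example_analyze_overview():
    package = Package("data/tables/datapackage.json")  # hypothetical path
    analysis = package.analyze(detailed=True)
    for path, stats in analysis.items():
        print(path, stats["rows"], stats["fields"], stats["variableTypes"])
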
def test_package_from_path():
    package = Package("data/package.json")
    assert package.name == "name"
    assert package.basepath == "data"
    assert package.profile == "data-package"
    assert package.resources == [
        {"name": "name", "path": "table.csv"},
    ]

def test_pandas_parser_write_bug_1100():
    datapackage = Package("data/issue-1100.package.json")
    target = datapackage.resources[0].to_pandas()
    assert target.to_dict("records") == [
        {"timestamp": pd.Timestamp(2022, 5, 25, 10, 39, 15)},
        {"timestamp": pd.Timestamp(2022, 5, 25, 10, 39, 15)},
    ]

def test_package_expand_resource_dialect():
    dialect = {"delimiter": ";"}
    package = Package({"resources": [{"path": "data/table.csv", "dialect": dialect}]})
    package.expand()
    assert package == {
        "resources": [
            {
                "path": "data/table.csv",
                "dialect": {
                    "delimiter": ";",
                    "lineTerminator": "\r\n",
                    "quoteChar": '"',
                    "doubleQuote": True,
                    "skipInitialSpace": False,
                },
                "profile": "data-resource",
            }
        ],
        "profile": "data-package",
    }

def test_analyze_package_detailed_variable_types():
    package = Package("data/package-1067.json")
    analysis = package.analyze(detailed=True)
    path_1 = "data/capital-valid.csv" if IS_UNIX else "data\\capital-valid.csv"
    path_2 = "data/capital-invalid.csv" if IS_UNIX else "data\\capital-invalid.csv"
    path_3 = "data/analysis-data.csv" if IS_UNIX else "data\\analysis-data.csv"
    assert len(analysis) == 3
    assert analysis[path_1]["variableTypes"] == {
        "number": 1,
        "string": 1,
    }
    assert analysis[path_2]["variableTypes"] == {
        "integer": 1,
        "string": 1,
    }
    assert analysis[path_3]["variableTypes"] == {
        "boolean": 2,
        "integer": 2,
        "number": 2,
        "string": 5,
    }