def _test_script_world_bank(DIRECTORY, script, rebase=False): """World Bank urban population time-series, in wide format. Demonstrate define a schema, create a method, import data, perform actions and generate a schema-compliant output. From https://databank.worldbank.org/reports.aspx?source=2&type=metadata&series=SP.URB.TOTL""" SCHEMA_NAME = "/data/urban_population.json" SCHEMA_SOURCE = SOURCE_DIRECTORY + SCHEMA_NAME SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE) SOURCE_DATA = SOURCE_DIRECTORY + "/data/working_test_world_bank_data.xlsx" method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA) method.set({"name": "urban_population_method"}) method.add_data(source=SOURCE_DATA) scripts = [] if rebase: scripts = [ "DEBLANK", "DEDUPE", "REBASE < [2]", ] # Run the test script scripts.append(script) source_data = method.get.input_data[0] method.add_actions(scripts, source_data.uuid.hex, sheet_name=source_data.sheet_name) method.transform(source_data) return True
def test_build(self): s = whyqd.Schema() details = {"name": data["name"], "title": data["title"], "description": data["description"]} s.set(schema=details) for field in data["fields"]: s.add_field(field=field) # also this https://github.com/samuelcolvin/pydantic/issues/1283#issuecomment-594041870 # foo_excludes = {idx: {"id"} for idx in range(len(my_bar.foos))} # my_bar.dict(exclude={"foos": foo_excludes}) # https://pydantic-docs.helpmanual.io/usage/exporting_models/#advanced-include-and-exclude schema_exclude = { f_idx: ( { "uuid": ..., "constraints": {"category": {c_idx: {"uuid"} for c_idx in range(len(f.constraints.category))}}, } if f.constraints else {"uuid"} ) for f_idx, f in enumerate(s.get.fields) } d = s.get.dict( by_alias=True, exclude_defaults=True, exclude_none=True, exclude={"uuid": ..., "fields": schema_exclude} ) assert d == data
def _test_script_portsmouth(DIRECTORY, script, test_filter=False): """Portsmouth ratepayer data in multiple spreadsheets. Demonstrating create method, add date, actions and perform a merge, plus filter the final result.""" SCHEMA_NAME = "/data/test_schema.json" SCHEMA_SOURCE = SOURCE_DIRECTORY + SCHEMA_NAME SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE) INPUT_DATA = SOURCE_DIRECTORY + "/data/working_test_data.xlsx" if test_filter: INPUT_DATA = SOURCE_DIRECTORY + "/data/restructured_test_data.xlsx" method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA) method.set({"name": "test_method"}) method.add_data(source=INPUT_DATA) source_data = method.get.input_data[0] method.add_actions(script, source_data.uuid.hex, sheet_name=source_data.sheet_name) method.transform(source_data) return True
def test_create(self): s = whyqd.Schema() field: whyqd.FieldModel = { "name": "test_field", "type": "string", "constraints": { "required": True, "category": [ {"name": "dog", "description": "A type of mammal"}, {"name": "cat", "description": "A different type of mammal"}, {"name": "mouse", "description": "A small type of mammal"}, ], }, } schema: whyqd.SchemaModel = { "name": "test_schema", } s.set(schema) s.add_field(field)
def test_merge(self, tmp_path): """Portsmouth ratepayer data in multiple spreadsheets. Demonstrating create method, add date, actions and perform a merge, plus filter the final result.""" DIRECTORY = str(tmp_path) + "/" CoreScript().check_path(DIRECTORY) SCHEMA_NAME = "/data/test_schema.json" SCHEMA_SOURCE = SOURCE_DIRECTORY + SCHEMA_NAME SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE) INPUT_DATA = [ SOURCE_DIRECTORY + "/data/raw_E06000044_014_0.XLSX", SOURCE_DIRECTORY + "/data/raw_E06000044_014_1.XLSX", SOURCE_DIRECTORY + "/data/raw_E06000044_014_2.XLSX", ] method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA) method.set({"name": "test_method"}) input_data = [{"path": d} for d in INPUT_DATA] method.add_data(source=input_data) # reorder input_order = [m.uuid.hex for m in method.get.input_data] input_order.reverse() method.reorder_data(order=input_order) # "MERGE < ['key_column'::'source_hex'::'sheet_name', etc.]" merge_reference = [ { "source_hex": method.get.input_data[0].uuid.hex, "key_column": "Property ref no" }, { "source_hex": method.get.input_data[1].uuid.hex, "key_column": "Property Reference Number" }, { "source_hex": method.get.input_data[2].uuid.hex, "key_column": "Property Reference Number" }, ] merge_terms = ", ".join([ f"'{m['key_column']}'::'{m['source_hex']}'" for m in merge_reference ]) merge_script = f"MERGE < [{merge_terms}]" method.merge(merge_script)
from pathlib import Path import whyqd from whyqd.parsers import CoreScript SCHEMA_NAME = "/data/test_schema.json" SOURCE_DIRECTORY = str(Path(__file__).resolve().parent) SCHEMA_SOURCE = SOURCE_DIRECTORY + SCHEMA_NAME SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE) INPUT_DATA = [ SOURCE_DIRECTORY + "/data/raw_E06000044_014_0.XLSX", SOURCE_DIRECTORY + "/data/raw_E06000044_014_1.XLSX", SOURCE_DIRECTORY + "/data/raw_E06000044_014_2.XLSX", ] class TestMethod: def test_tutorial(self, tmp_path): """Portsmouth ratepayer data in multiple spreadsheets. Demonstrating create method, add date, actions and perform a merge, plus filter the final result.""" DIRECTORY = str(tmp_path) + "/" CoreScript().check_path(DIRECTORY) method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA) method.set({"name": "test_method"}) input_data = [{"path": d} for d in INPUT_DATA] method.add_data(source=input_data, get_row_count=True) # reorder input_order = [m.uuid.hex for m in method.get.input_data] input_order.reverse() method.reorder_data(order=input_order) # "MERGE < ['key_column'::'source_hex'::'sheet_name', etc.]"
def test_tutorial(self, tmp_path): """World Bank urban population time-series, in wide format. Demonstrate define a schema, create a method, import data, perform actions and generate a schema-compliant output. From https://databank.worldbank.org/reports.aspx?source=2&type=metadata&series=SP.URB.TOTL""" DIRECTORY = str(tmp_path) + "/" CoreScript().check_path(DIRECTORY) # DEFINE SCHEMA details = { "name": "urban_population", "title": "Urban population", "description": "Urban population refers to people living in urban areas as defined by national statistical offices. It is calculated using World Bank population estimates and urban ratios from the United Nations World Urbanization Prospects. Aggregation of urban and rural population may not add up to total population because of different country coverages.", } schema = whyqd.Schema() schema.set(details) fields = [ { "name": "Indicator Code", "title": "Indicator Code", "type": "string", "description": "World Bank code reference for Indicator Name.", "constraints": { "required": True }, }, { "name": "Country Name", "title": "Country Name", "type": "string", "description": "Official country names.", "constraints": { "required": True }, }, { "name": "Country Code", "title": "Country Code", "type": "string", "description": "UN ISO 3-letter country code.", "constraints": { "required": True }, }, { "name": "Indicator Name", "title": "Indicator Name", "type": "string", "description": "Indicator described in the data series.", "constraints": { "required": True }, }, { "name": "Year", "title": "Year", "type": "year", "description": "Year of release.", "constraints": { "required": True }, }, { "name": "Values", "title": "Values", "type": "number", "description": "Value for the Year and Indicator Name.", "constraints": { "required": True }, }, ] for field in fields: schema.add_field(field) schema.save(DIRECTORY) # CREATE METHOD SCHEMA_SOURCE = DIRECTORY + "urban_population.json" SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE) method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA) method.set({"name": "urban_population_method"}) input_data = {"path": SOURCE_DATA} method.add_data(source=input_data) # Define actions schema_scripts = [ "DEBLANK", "DEDUPE", "REBASE < [2]", ] source_data = method.get.input_data[0] method.add_actions(schema_scripts, source_data.uuid.hex, sheet_name=source_data.sheet_name) # df = method.transform(source_data) source_data = method.get.input_data[0] source_columns = [c.name for c in source_data.columns] schema_scripts = [ f"PIVOT_LONGER > {source_columns[4:]}", "RENAME > 'indicator_code' < ['Indicator Code']", "RENAME > 'indicator_name' < ['Indicator Name']", "RENAME > 'country_code' < ['Country Code']", "RENAME > 'country_name' < ['Country Name']", "RENAME > 'year' < ['PIVOT_LONGER_names_idx_4']", "RENAME > 'values' < ['PIVOT_LONGER_values_idx_5']", ] method.add_actions(schema_scripts, source_data.uuid.hex, sheet_name=source_data.sheet_name) # Unambiguous deletion so they are not part of the research record for unwanted_data in method.get.input_data[1:]: method.remove_data(unwanted_data.uuid.hex, sheet_name=unwanted_data.sheet_name) method.build() assert method.validate()
def test_load(self): s = whyqd.Schema(source=source) d = s.get.dict(by_alias=True, exclude_defaults=True, exclude_none=True, exclude_unset=True) assert d == data
def test_tutorial(self, tmp_path): """World Bank Human Development Report 2007 - 2008 in Cthulhu format. Demonstrate define a schema, create a method, import data, perform actions and generate a schema-compliant output. Copies at: https://github.com/whythawk/data-wrangling-and-validation/tree/master/data/lesson-spreadsheet""" DIRECTORY = str(tmp_path) + "/" CoreScript().check_path(DIRECTORY) # DEFINE SCHEMA details = { "name": "human-development-report", "title": "UN Human Development Report 2007 - 2008", "description": """ In 1990 the first Human Development Report introduced a new approach for advancing human wellbeing. Human development – or the human development approach - is about expanding the richness of human life, rather than simply the richness of the economy in which human beings live. It is an approach that is focused on people and their opportunities and choices.""", } schema = whyqd.Schema() schema.set(details) fields = [ { "name": "year", "title": "Year", "type": "year", "description": "Year of release.", }, { "name": "country_name", "title": "Country Name", "type": "string", "description": "Official country names.", "constraints": {"required": True}, }, { "name": "indicator_name", "title": "Indicator Name", "type": "string", "description": "Indicator described in the data series.", }, { "name": "values", "title": "Values", "type": "number", "description": "Value for the Year and Indicator Name.", "constraints": {"required": True}, }, { "name": "reference", "title": "Reference", "type": "string", "description": "Reference to data source.", }, ] for field in fields: schema.add_field(field) schema.save(DIRECTORY) # CREATE METHOD SCHEMA_SOURCE = DIRECTORY + "human-development-report.json" SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE) method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA) method.set({"name": "human-development-report-method"}) input_data = {"path": SOURCE_DATA} method.add_data(source=input_data) # Define actions schema_scripts = [ "DEBLANK", "DEDUPE", "REBASE < [11]", ] source_data = method.get.input_data[0] method.add_actions(schema_scripts, source_data.uuid.hex, sheet_name=source_data.sheet_name) # Get the index source_data = method.get.input_data[0] df = method.transform(source_data) schema_scripts = [ f"DELETE_ROWS < {[int(i) for i in np.arange(144, df.index[-1]+1)]}", "RENAME_ALL > ['HDI rank', 'Country', 'Human poverty index (HPI-1) - Rank;;2008', 'Reference 1', 'Human poverty index (HPI-1) - Value (%);;2008', 'Probability at birth of not surviving to age 40 (% of cohort);;2000-05', 'Reference 2', 'Adult illiteracy rate (% aged 15 and older);;1995-2005', 'Reference 3', 'Population not using an improved water source (%);;2004', 'Reference 4', 'Children under weight for age (% under age 5);;1996-2005', 'Reference 5', 'Population below income poverty line (%) - $1 a day;;1990-2005', 'Reference 6', 'Population below income poverty line (%) - $2 a day;;1990-2005', 'Reference 7', 'Population below income poverty line (%) - National poverty line;;1990-2004', 'Reference 8', 'HPI-1 rank minus income poverty rank;;2008']", "PIVOT_CATEGORIES > ['HDI rank'] < [14,44,120]", "RENAME_NEW > 'HDI Category'::['PIVOT_CATEGORIES_idx_20_0']", "PIVOT_LONGER > = ['HDI rank', 'HDI Category', 'Human poverty index (HPI-1) - Rank;;2008', 'Human poverty index (HPI-1) - Value (%);;2008', 'Probability at birth of not surviving to age 40 (% of cohort);;2000-05', 'Adult illiteracy rate (% aged 15 and older);;1995-2005', 'Population not using an improved water source (%);;2004', 'Children under weight for age (% under age 5);;1996-2005', 'Population below income poverty line (%) - $1 a day;;1990-2005', 'Population below income poverty line (%) - $2 a day;;1990-2005', 'Population below income poverty line (%) - National poverty line;;1990-2004', 'HPI-1 rank minus income poverty rank;;2008']", "SPLIT > ';;'::['PIVOT_LONGER_names_idx_9']", "DEBLANK", "DEDUPE", ] method.add_actions(schema_scripts, source_data.uuid.hex, sheet_name=source_data.sheet_name) # Get the column list reference_columns = [c.name for c in method.get.input_data[0].columns if c.name.startswith("Reference")] schema_scripts = [ f"JOIN > 'reference' < {reference_columns}", "RENAME > 'indicator_name' < ['SPLIT_idx_11_0']", "RENAME > 'country_name' < ['Country']", "RENAME > 'year' < ['SPLIT_idx_12_1']", "RENAME > 'values' < ['PIVOT_LONGER_values_idx_10']", ] method.add_actions(schema_scripts, source_data.uuid.hex, sheet_name=source_data.sheet_name) method.build() assert method.validate()