def test_create(self, tmp_path): DIRECTORY = str(tmp_path) + "/" CoreScript().check_path(DIRECTORY) method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA) method.set({"name": "test_method"}) input_data = [{"path": d} for d in INPUT_DATA] method.add_data(source=input_data) method.save(created_by="Gavin Chait", hide_uuid=True) test_source = DIRECTORY + "test_method.json" method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA, method=CoreScript().load_json(test_source)) assert METHOD["input_data"][0]["checksum"] == method.get.input_data[0].checksum assert METHOD["input_data"][1]["checksum"] == method.get.input_data[1].checksum assert METHOD["input_data"][2]["checksum"] == method.get.input_data[2].checksum
def _test_script_world_bank(DIRECTORY, script, rebase=False): """World Bank urban population time-series, in wide format. Demonstrate define a schema, create a method, import data, perform actions and generate a schema-compliant output. From https://databank.worldbank.org/reports.aspx?source=2&type=metadata&series=SP.URB.TOTL""" SCHEMA_NAME = "/data/urban_population.json" SCHEMA_SOURCE = SOURCE_DIRECTORY + SCHEMA_NAME SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE) SOURCE_DATA = SOURCE_DIRECTORY + "/data/working_test_world_bank_data.xlsx" method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA) method.set({"name": "urban_population_method"}) method.add_data(source=SOURCE_DATA) scripts = [] if rebase: scripts = [ "DEBLANK", "DEDUPE", "REBASE < [2]", ] # Run the test script scripts.append(script) source_data = method.get.input_data[0] method.add_actions(scripts, source_data.uuid.hex, sheet_name=source_data.sheet_name) method.transform(source_data) return True
def test_merge_and_build(self, tmp_path): DIRECTORY = str(tmp_path) + "/" CoreScript().check_path(DIRECTORY) method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA) method.set({"name": "test_method"}) input_data = [{"path": d} for d in INPUT_DATA] method.add_data(source=input_data) # "MERGE < ['key_column'::'source_hex'::'sheet_name', etc.]" merge_reference = [ {"source_hex": method.get.input_data[2].uuid.hex, "key_column": "Property ref no"}, {"source_hex": method.get.input_data[1].uuid.hex, "key_column": "Property Reference Number"}, {"source_hex": method.get.input_data[0].uuid.hex, "key_column": "Property Reference Number"}, ] merge_terms = ", ".join([f"'{m['key_column']}'::'{m['source_hex']}'" for m in merge_reference]) merge_script = f"MERGE < [{merge_terms}]" method.merge(merge_script) schema_scripts = whyqd.parsers.LegacyScript().parse_legacy_method( version="1", schema=SCHEMA, source_path=METHOD_SOURCE_V1 ) source_data = method.get.working_data method.add_actions(schema_scripts, source_data.uuid.hex) method.build() citation = { "author": "Gavin Chait", "month": "feb", "year": 2020, "title": "Portsmouth City Council normalised database of commercial ratepayers", "url": "https://github.com/whythawk/whyqd/tree/master/tests/data", } method.set_citation(citation) test_citation = method.get_citation() del test_citation["input_sources"] del test_citation["restructured_data"] assert test_citation == citation assert method.save(created_by="Gavin Chait", hide_uuid=True)
def test_tutorial(self, tmp_path): """Portsmouth ratepayer data in multiple spreadsheets. Demonstrating create method, add date, actions and perform a merge, plus filter the final result.""" DIRECTORY = str(tmp_path) + "/" CoreScript().check_path(DIRECTORY) method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA) method.set({"name": "test_method"}) input_data = [{"path": d} for d in INPUT_DATA] method.add_data(source=input_data, get_row_count=True) # reorder input_order = [m.uuid.hex for m in method.get.input_data] input_order.reverse() method.reorder_data(order=input_order) # "MERGE < ['key_column'::'source_hex'::'sheet_name', etc.]" merge_reference = [ { "source_hex": method.get.input_data[0].uuid.hex, "key_column": "Property ref no" }, { "source_hex": method.get.input_data[1].uuid.hex, "key_column": "Property Reference Number" }, { "source_hex": method.get.input_data[2].uuid.hex, "key_column": "Property Reference Number" }, ] merge_terms = ", ".join([ f"'{m['key_column']}'::'{m['source_hex']}'" for m in merge_reference ]) merge_script = f"MERGE < [{merge_terms}]" method.merge(merge_script) schema_scripts = [ "NEW > 'la_code' < ['E06000044']", "RENAME > 'ba_ref' < ['Property ref no']", "ORDER > 'prop_ba_rates' < ['Current Rateable Value_x', 'Current Rateable Value_y', 'Current Rateable Value']", "ORDER > 'occupant_name' < ['Primary Liable party name_x', 'Primary Liable party name_y', 'Primary Liable party name']", "ORDER > 'postcode' < ['Full Property Address_x', 'Full Property Address_y', 'Full Property Address']", "CATEGORISE > 'occupation_state' < [+ 'Current Property Exemption Code', + 'Current Relief Type']", "ASSIGN_CATEGORY_UNIQUES > 'occupation_state'::False < 'Current Property Exemption Code'::['EPRN', 'EPRI', 'VOID', 'EPCH', 'LIQUIDATE', 'DECEASED', 'PROHIBITED', 'BANKRUPT']", "ASSIGN_CATEGORY_UNIQUES > 'occupation_state'::False < 'Current Relief Type'::['Empty Property Rate Non-Industrial', 'Empty Property Rate Industrial', 'Empty Property Rate Charitable']", "CATEGORISE > 'occupation_state_reliefs' < [+ 'Current Property Exemption Code', + 'Current Relief Type']", "ASSIGN_CATEGORY_UNIQUES > 'occupation_state_reliefs'::'small_business' < 'Current Relief Type'::['Small Business Relief England', 'Sbre Extension For 12 Months', 'Supporting Small Business Relief']", "ASSIGN_CATEGORY_UNIQUES > 'occupation_state_reliefs'::'enterprise_zone' < 'Current Property Exemption Code'::['INDUSTRIAL']", "ASSIGN_CATEGORY_UNIQUES > 'occupation_state_reliefs'::'vacancy' < 'Current Property Exemption Code'::['EPRN', 'EPRI', 'VOID', 'EPCH', 'LIQUIDATE', 'DECEASED', 'PROHIBITED', 'BANKRUPT']", "ASSIGN_CATEGORY_UNIQUES > 'occupation_state_reliefs'::'vacancy' < 'Current Relief Type'::['Empty Property Rate Non-Industrial', 'Empty Property Rate Industrial', 'Empty Property Rate Charitable']", "ASSIGN_CATEGORY_UNIQUES > 'occupation_state_reliefs'::'retail' < 'Current Relief Type'::['Retail Discount']", "ASSIGN_CATEGORY_UNIQUES > 'occupation_state_reliefs'::'exempt' < 'Current Property Exemption Code'::['C', 'LOW RV', 'LAND']", "ASSIGN_CATEGORY_UNIQUES > 'occupation_state_reliefs'::'other' < 'Current Relief Type'::['Sports Club (Registered CASC)', 'Mandatory']", "ORDER_NEW > 'occupation_state_date' < ['Current Prop Exemption Start Date' + 'Current Prop Exemption Start Date', 'Current Relief Award Start Date' + 'Current Relief Award Start Date', 'Account Start date_x' + 'Account Start date_x', 'Account Start date_y' + 'Account Start date_y']", ] source_data = method.get.working_data method.add_actions(schema_scripts, source_data.uuid.hex) filter_script = "FILTER_AFTER > 'occupation_state_date'::'2010-01-01'" method.add_actions(filter_script, source_data.uuid.hex) method.build() method.validate()
def test_method_conversion_v1(self, tmp_path): DIRECTORY = str(tmp_path) + "/" CoreScript().check_path(DIRECTORY) method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA) method.set({"name": "test_method"}) input_data = {"path": INPUT_DATA_ONESHOT} method.add_data(source=input_data) schema_scripts = whyqd.parsers.LegacyScript().parse_legacy_method( version="1", schema=SCHEMA, source_path=METHOD_SOURCE_V1 ) source_data = method.get.input_data[0] method.add_actions(schema_scripts, source_data.uuid.hex) method.transform(source_data) method.save(created_by="Gavin Chait", hide_uuid=True)
def _test_script_portsmouth(DIRECTORY, script, test_filter=False): """Portsmouth ratepayer data in multiple spreadsheets. Demonstrating create method, add date, actions and perform a merge, plus filter the final result.""" SCHEMA_NAME = "/data/test_schema.json" SCHEMA_SOURCE = SOURCE_DIRECTORY + SCHEMA_NAME SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE) INPUT_DATA = SOURCE_DIRECTORY + "/data/working_test_data.xlsx" if test_filter: INPUT_DATA = SOURCE_DIRECTORY + "/data/restructured_test_data.xlsx" method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA) method.set({"name": "test_method"}) method.add_data(source=INPUT_DATA) source_data = method.get.input_data[0] method.add_actions(script, source_data.uuid.hex, sheet_name=source_data.sheet_name) method.transform(source_data) return True
def test_merge(self, tmp_path): """Portsmouth ratepayer data in multiple spreadsheets. Demonstrating create method, add date, actions and perform a merge, plus filter the final result.""" DIRECTORY = str(tmp_path) + "/" CoreScript().check_path(DIRECTORY) SCHEMA_NAME = "/data/test_schema.json" SCHEMA_SOURCE = SOURCE_DIRECTORY + SCHEMA_NAME SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE) INPUT_DATA = [ SOURCE_DIRECTORY + "/data/raw_E06000044_014_0.XLSX", SOURCE_DIRECTORY + "/data/raw_E06000044_014_1.XLSX", SOURCE_DIRECTORY + "/data/raw_E06000044_014_2.XLSX", ] method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA) method.set({"name": "test_method"}) input_data = [{"path": d} for d in INPUT_DATA] method.add_data(source=input_data) # reorder input_order = [m.uuid.hex for m in method.get.input_data] input_order.reverse() method.reorder_data(order=input_order) # "MERGE < ['key_column'::'source_hex'::'sheet_name', etc.]" merge_reference = [ { "source_hex": method.get.input_data[0].uuid.hex, "key_column": "Property ref no" }, { "source_hex": method.get.input_data[1].uuid.hex, "key_column": "Property Reference Number" }, { "source_hex": method.get.input_data[2].uuid.hex, "key_column": "Property Reference Number" }, ] merge_terms = ", ".join([ f"'{m['key_column']}'::'{m['source_hex']}'" for m in merge_reference ]) merge_script = f"MERGE < [{merge_terms}]" method.merge(merge_script)
def test_tutorial(self, tmp_path): """World Bank urban population time-series, in wide format. Demonstrate define a schema, create a method, import data, perform actions and generate a schema-compliant output. From https://databank.worldbank.org/reports.aspx?source=2&type=metadata&series=SP.URB.TOTL""" DIRECTORY = str(tmp_path) + "/" CoreScript().check_path(DIRECTORY) # DEFINE SCHEMA details = { "name": "urban_population", "title": "Urban population", "description": "Urban population refers to people living in urban areas as defined by national statistical offices. It is calculated using World Bank population estimates and urban ratios from the United Nations World Urbanization Prospects. Aggregation of urban and rural population may not add up to total population because of different country coverages.", } schema = whyqd.Schema() schema.set(details) fields = [ { "name": "Indicator Code", "title": "Indicator Code", "type": "string", "description": "World Bank code reference for Indicator Name.", "constraints": { "required": True }, }, { "name": "Country Name", "title": "Country Name", "type": "string", "description": "Official country names.", "constraints": { "required": True }, }, { "name": "Country Code", "title": "Country Code", "type": "string", "description": "UN ISO 3-letter country code.", "constraints": { "required": True }, }, { "name": "Indicator Name", "title": "Indicator Name", "type": "string", "description": "Indicator described in the data series.", "constraints": { "required": True }, }, { "name": "Year", "title": "Year", "type": "year", "description": "Year of release.", "constraints": { "required": True }, }, { "name": "Values", "title": "Values", "type": "number", "description": "Value for the Year and Indicator Name.", "constraints": { "required": True }, }, ] for field in fields: schema.add_field(field) schema.save(DIRECTORY) # CREATE METHOD SCHEMA_SOURCE = DIRECTORY + "urban_population.json" SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE) method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA) method.set({"name": "urban_population_method"}) input_data = {"path": SOURCE_DATA} method.add_data(source=input_data) # Define actions schema_scripts = [ "DEBLANK", "DEDUPE", "REBASE < [2]", ] source_data = method.get.input_data[0] method.add_actions(schema_scripts, source_data.uuid.hex, sheet_name=source_data.sheet_name) # df = method.transform(source_data) source_data = method.get.input_data[0] source_columns = [c.name for c in source_data.columns] schema_scripts = [ f"PIVOT_LONGER > {source_columns[4:]}", "RENAME > 'indicator_code' < ['Indicator Code']", "RENAME > 'indicator_name' < ['Indicator Name']", "RENAME > 'country_code' < ['Country Code']", "RENAME > 'country_name' < ['Country Name']", "RENAME > 'year' < ['PIVOT_LONGER_names_idx_4']", "RENAME > 'values' < ['PIVOT_LONGER_values_idx_5']", ] method.add_actions(schema_scripts, source_data.uuid.hex, sheet_name=source_data.sheet_name) # Unambiguous deletion so they are not part of the research record for unwanted_data in method.get.input_data[1:]: method.remove_data(unwanted_data.uuid.hex, sheet_name=unwanted_data.sheet_name) method.build() assert method.validate()
def test_tutorial(self, tmp_path): """World Bank Human Development Report 2007 - 2008 in Cthulhu format. Demonstrate define a schema, create a method, import data, perform actions and generate a schema-compliant output. Copies at: https://github.com/whythawk/data-wrangling-and-validation/tree/master/data/lesson-spreadsheet""" DIRECTORY = str(tmp_path) + "/" CoreScript().check_path(DIRECTORY) # DEFINE SCHEMA details = { "name": "human-development-report", "title": "UN Human Development Report 2007 - 2008", "description": """ In 1990 the first Human Development Report introduced a new approach for advancing human wellbeing. Human development – or the human development approach - is about expanding the richness of human life, rather than simply the richness of the economy in which human beings live. It is an approach that is focused on people and their opportunities and choices.""", } schema = whyqd.Schema() schema.set(details) fields = [ { "name": "year", "title": "Year", "type": "year", "description": "Year of release.", }, { "name": "country_name", "title": "Country Name", "type": "string", "description": "Official country names.", "constraints": {"required": True}, }, { "name": "indicator_name", "title": "Indicator Name", "type": "string", "description": "Indicator described in the data series.", }, { "name": "values", "title": "Values", "type": "number", "description": "Value for the Year and Indicator Name.", "constraints": {"required": True}, }, { "name": "reference", "title": "Reference", "type": "string", "description": "Reference to data source.", }, ] for field in fields: schema.add_field(field) schema.save(DIRECTORY) # CREATE METHOD SCHEMA_SOURCE = DIRECTORY + "human-development-report.json" SCHEMA = whyqd.Schema(source=SCHEMA_SOURCE) method = whyqd.Method(directory=DIRECTORY, schema=SCHEMA) method.set({"name": "human-development-report-method"}) input_data = {"path": SOURCE_DATA} method.add_data(source=input_data) # Define actions schema_scripts = [ "DEBLANK", "DEDUPE", "REBASE < [11]", ] source_data = method.get.input_data[0] method.add_actions(schema_scripts, source_data.uuid.hex, sheet_name=source_data.sheet_name) # Get the index source_data = method.get.input_data[0] df = method.transform(source_data) schema_scripts = [ f"DELETE_ROWS < {[int(i) for i in np.arange(144, df.index[-1]+1)]}", "RENAME_ALL > ['HDI rank', 'Country', 'Human poverty index (HPI-1) - Rank;;2008', 'Reference 1', 'Human poverty index (HPI-1) - Value (%);;2008', 'Probability at birth of not surviving to age 40 (% of cohort);;2000-05', 'Reference 2', 'Adult illiteracy rate (% aged 15 and older);;1995-2005', 'Reference 3', 'Population not using an improved water source (%);;2004', 'Reference 4', 'Children under weight for age (% under age 5);;1996-2005', 'Reference 5', 'Population below income poverty line (%) - $1 a day;;1990-2005', 'Reference 6', 'Population below income poverty line (%) - $2 a day;;1990-2005', 'Reference 7', 'Population below income poverty line (%) - National poverty line;;1990-2004', 'Reference 8', 'HPI-1 rank minus income poverty rank;;2008']", "PIVOT_CATEGORIES > ['HDI rank'] < [14,44,120]", "RENAME_NEW > 'HDI Category'::['PIVOT_CATEGORIES_idx_20_0']", "PIVOT_LONGER > = ['HDI rank', 'HDI Category', 'Human poverty index (HPI-1) - Rank;;2008', 'Human poverty index (HPI-1) - Value (%);;2008', 'Probability at birth of not surviving to age 40 (% of cohort);;2000-05', 'Adult illiteracy rate (% aged 15 and older);;1995-2005', 'Population not using an improved water source (%);;2004', 'Children under weight for age (% under age 5);;1996-2005', 'Population below income poverty line (%) - $1 a day;;1990-2005', 'Population below income poverty line (%) - $2 a day;;1990-2005', 'Population below income poverty line (%) - National poverty line;;1990-2004', 'HPI-1 rank minus income poverty rank;;2008']", "SPLIT > ';;'::['PIVOT_LONGER_names_idx_9']", "DEBLANK", "DEDUPE", ] method.add_actions(schema_scripts, source_data.uuid.hex, sheet_name=source_data.sheet_name) # Get the column list reference_columns = [c.name for c in method.get.input_data[0].columns if c.name.startswith("Reference")] schema_scripts = [ f"JOIN > 'reference' < {reference_columns}", "RENAME > 'indicator_name' < ['SPLIT_idx_11_0']", "RENAME > 'country_name' < ['Country']", "RENAME > 'year' < ['SPLIT_idx_12_1']", "RENAME > 'values' < ['PIVOT_LONGER_values_idx_10']", ] method.add_actions(schema_scripts, source_data.uuid.hex, sheet_name=source_data.sheet_name) method.build() assert method.validate()