# Standard-library and third-party imports assumed by the validation tasks below.
# Module-level constants (valid_keys, valid_metadata_keys, terms, terms_re, the
# *_licenses sets, all_rights_reserved_terms_re, headers, ...) and helpers
# (registry, get_division_id, get_definition, csv_reader, ...) are defined
# elsewhere in this module.
import csv
import os
import re
import sys
from collections import OrderedDict
from io import StringIO
from os.path import dirname

import boundaries
import requests


def definitions(base='.'):
    """
    Check that all definition.py files are valid.
    """
    def warn(message, slug):
        if message not in seen:
            print('%-50s %s' % (slug, message))
            seen.add(message)

    seen = set()
    division_ids = set()

    for slug, config in registry(base).items():
        directory = dirname(config['file'])

        # Validate LICENSE.txt.
        license_path = os.path.join(directory, 'LICENSE.txt')
        if os.path.exists(license_path):
            with open(license_path) as f:
                license_text = f.read().rstrip('\n')
            if config.get('licence_url'):
                licence_url = config['licence_url']
                if licence_url in open_data_licenses or licence_url in some_rights_reserved_licenses:
                    if not terms.get(licence_url) and not terms_re.get(licence_url):
                        warn('No LICENSE.txt template for License URL %s' % licence_url, slug)
                    elif terms.get(licence_url) and license_text != terms[licence_url] or terms_re.get(licence_url) and not terms_re[licence_url].search(license_text):
                        print('%-50s Expected LICENSE.txt to match license-specific template' % slug)
                elif licence_url in all_rights_reserved_licenses:
                    if not all_rights_reserved_terms_re.search(license_text):
                        print('%-50s Expected LICENSE.txt to match "all rights reserved" template' % slug)
                else:
                    print('%-50s Unrecognized License URL %s' % (slug, licence_url))
            elif not all_rights_reserved_terms_re.search(license_text):
                print('%-50s Expected LICENSE.txt to match "all rights reserved" template' % slug)

        # Check for invalid keys, non-unique or empty values.
        invalid_keys = set(config.keys()) - valid_keys
        if invalid_keys:
            print('%-50s Unrecognized key: %s' % (slug, ', '.join(invalid_keys)))
        values = [value for key, value in config.items() if key != 'metadata']
        if len(values) > len(set(values)):
            print('%-50s Non-unique values' % slug)
        for key, value in config.items():
            if not value:
                print('%-50s Empty value for %s' % (slug, key))

        # Check for missing required keys.
        for key in ('domain', 'last_updated', 'name_func', 'authority', 'encoding'):
            if not config.get(key):
                print('%-50s Missing %s' % (slug, key))
        if not config.get('source_url') and config.get('data_url'):
            print('%-50s Missing source_url' % slug)
        if config.get('source_url') and not config.get('licence_url') and not config.get('data_url'):
            print('%-50s Missing licence_url or data_url' % slug)

        # Validate fields.
        for key in ('name', 'singular'):
            if config.get(key):
                print('%-50s Expected %s to be missing' % (slug, key))
        if config.get('encoding') and config['encoding'] != 'iso-8859-1':
            print('%-50s Expected encoding to be iso-8859-1 not %s' % (slug, config['encoding']))

        if slug not in ('Census divisions', 'Census subdivisions'):
            # Check for invalid keys or empty values.
            invalid_keys = set(config['metadata'].keys()) - valid_metadata_keys
            if invalid_keys:
                print('%-50s Unrecognized key: %s' % (slug, ', '.join(invalid_keys)))
            for key, value in config['metadata'].items():
                if not value:
                    print('%-50s Empty value for %s' % (slug, key))

            division_id = get_division_id(slug, config)

            # Ensure division_id is unique.
            if division_id in division_ids:
                print('%-50s Duplicate division_id %s' % (slug, division_id))
            else:
                division_ids.add(division_id)

            expected_slug, expected_config = get_definition(division_id)

            # Check for unexpected values.
            assert_match(slug, 'slug', slug, expected_slug)
            for key, value in expected_config.items():
                assert_match(slug, key, config[key], value)

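
# For illustration only: a minimal definition.py of the kind definitions()
# validates. The slug, paths, dates and URLs are invented; the keyword names
# mirror the keys checked above (domain, last_updated, name_func, authority,
# encoding, licence_url, source_url, data_url, metadata), and clean_attr is
# assumed to be the usual represent-boundaries name helper.
#
#     from datetime import date
#
#     import boundaries
#
#     boundaries.register('Example wards',
#         domain='Example, ON',
#         last_updated=date(2013, 1, 1),
#         name_func=boundaries.clean_attr('WARD_NAME'),
#         authority='City of Example',
#         encoding='iso-8859-1',
#         licence_url='http://example.com/open-data-licence',
#         source_url='http://example.com/open-data',
#         data_url='http://example.com/open-data/wards.zip',
#         metadata={'geographic_code': '3599999'},
#     )
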
def spreadsheet(base='.', private_base='../represent-canada-private-data'):
    sgc_code_to_ocdid_map = sgc_code_to_ocdid()
    ocdid_to_sgc_code_map = {v: k for k, v in sgc_code_to_ocdid_map.items()}

    records = OrderedDict()

    reader = csv_reader('https://raw.github.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_municipal_subdivisions-has_children.csv')
    reader.next()
    for row in reader:
        municipal_subdivisions[row[0].split(':')[-1]] = row[1]

    urls = {}
    reader = csv_reader('https://raw.github.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_census_subdivisions-url.csv')
    reader.next()
    for row in reader:
        urls[row[0].split(':')[-1]] = row[1]

    abbreviations = {}
    reader = csv_reader('https://raw.github.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_provinces_and_territories.csv')
    reader.next()
    for row in reader:
        abbreviations[row[2]] = row[0].split(':')[-1].upper()

    # Get scraper URLs.
    scraper_urls = {}
    for representative_set in requests.get('http://represent.opennorth.ca/representative-sets/?limit=0').json()['objects']:
        boundary_set_url = representative_set['related']['boundary_set_url']
        if boundary_set_url:
            if boundary_set_url != '/boundary-sets/census-subdivisions/':
                boundary_set = requests.get('http://represent.opennorth.ca%s' % boundary_set_url).json()
                if boundary_set.get('extra') and boundary_set['extra'].get('geographic_code'):
                    scraper_urls[boundary_set['extra']['geographic_code']] = representative_set['data_about_url'] or representative_set['data_url']
                else:
                    sys.stderr.write('%-60s No extra\n' % boundary_set_url)
        else:
            sys.stderr.write('%-60s No boundary_set_url\n' % representative_set['url'])

    # Create records for provinces and territories.
    reader = csv_reader('https://raw.github.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_provinces_and_territories.csv')
    reader.next()
    for row in reader:
        records[ocdid_to_sgc_code_map[row[0]]] = {
            'OCD': row[0],
            'Geographic code': ocdid_to_sgc_code_map[row[0]],
            'Geographic name': row[1],
            'Geographic type': '',
            'Province or territory': row[0].split(':')[-1].upper(),
            'Population': '',
            'URL': '',
            'Scraper?': scraper_urls.get(ocdid_to_sgc_code_map[row[0]], ''),
            'Shapefile?': '',
            'Contact': '',
            'Request notes': '',
            'Received via': '',
            'Last boundary': '',
            'Next boundary': '',
            'Permission to distribute': '',
            'Highrise URL': '',
            'Type of license': '',
            'License URL': '',
            'Denial notes': '',
        }

    # Create records for census subdivisions.
    reader = csv_reader('http://www12.statcan.gc.ca/census-recensement/2011/dp-pd/hlt-fst/pd-pl/FullFile.cfm?T=301&LANG=Eng&OFT=CSV&OFN=98-310-XWE2011002-301.CSV')
    reader.next()  # title
    reader.next()  # headers
    for row in reader:
        if row:
            result = re.search(r'\A(.+) \((.+)\)\Z', row[1].decode('iso-8859-1'))
            if result:
                name = result.group(1)
                province_or_territory = abbreviations[result.group(2)]
            elif row[1] == 'Canada':
                name = 'Canada'
                province_or_territory = ''
            else:
                raise Exception('Unrecognized name "%s"' % row[1])

            record = {
                'OCD': sgc_code_to_ocdid_map[row[0]],
                'Geographic code': row[0],
                'Geographic name': name,
                'Province or territory': province_or_territory,
                'Geographic type': row[2].decode('iso-8859-1'),
                'Population': row[4],
                'URL': urls.get(row[0], ''),
                'Scraper?': scraper_urls.get(row[0], ''),
                'Contact': '',
                'Request notes': '',
                'Received via': '',
                'Last boundary': '',
                'Next boundary': '',
                'Permission to distribute': '',
                'Highrise URL': '',
                'Type of license': '',
                'License URL': '',
                'Denial notes': '',
            }

            if municipal_subdivisions.get(row[0]):
                if municipal_subdivisions[row[0]] == 'N':
                    for header in ['Shapefile?'] + request_headers + receipt_headers:
                        record[header] = 'N/A'
                elif municipal_subdivisions[row[0]] == 'Y':
                    record['Shapefile?'] = 'Request'
                elif municipal_subdivisions[row[0]] == '?':
                    record['Shapefile?'] = ''
            else:
                record['Shapefile?'] = ''

            records[row[0]] = record
        else:
            break

    # Merge information from received data.
    for directory, permission_to_distribute in [(base, 'Y'), (private_base, 'N')]:
        boundaries.registry = {}

        for slug, config in registry(directory).items():
            if config.get('metadata'):
                geographic_code = config['metadata'].get('geographic_code')
                if geographic_code:
                    license_path = os.path.join(dirname(config['file']), 'LICENSE.txt')
                    license_text = ''
                    if os.path.exists(license_path):
                        with open(license_path) as f:
                            license_text = f.read().rstrip('\n').decode('utf-8')

                    record = records[geographic_code]
                    record['Shapefile?'] = 'Y'
                    record['Permission to distribute'] = permission_to_distribute
                    record['License URL'] = config.get('licence_url', '')

                    if config.get('last_updated'):
                        record['Last boundary'] = config['last_updated'].strftime('%-m/%-d/%Y')

                    if config.get('source_url'):
                        record['Contact'] = 'N/A'
                        record['Received via'] = 'online'
                    else:
                        match = all_rights_reserved_terms_re.search(license_text)
                        if match:
                            record['Contact'] = match.group(1)
                            record['Received via'] = 'email'

                    if config.get('licence_url'):
                        licence_url = config['licence_url']
                        if licence_url in open_data_licenses:
                            record['Type of license'] = 'Open'
                        elif licence_url in some_rights_reserved_licenses:
                            record['Type of license'] = 'Most rights reserved'
                        elif licence_url in all_rights_reserved_licenses:
                            record['Type of license'] = 'All rights reserved'
                        else:
                            raise Exception(licence_url)
                    elif all_rights_reserved_terms_re.search(license_text):
                        record['Type of license'] = 'All rights reserved'
                    else:
                        record['Type of license'] = 'Custom'
                elif not config['metadata'].get('ocd_division'):
                    sys.stderr.write('%-60s No geographic_code or ocd_division\n' % slug)

    reader = csv.DictReader(StringIO(requests.get('https://docs.google.com/spreadsheet/pub?key=0AtzgYYy0ZABtdGpJdVBrbWtUaEV0THNUd2JIZ1JqM2c&single=true&gid=25&output=csv').content))
    for row in reader:
        geographic_code = row['Geographic code']
        record = records[geographic_code]
        for key in row:
            a = record[key]
            b = row[key].decode('utf-8')
            if a != b:
                # Columns that are always tracked manually.
                # Scrapers for municipalities without wards are tracked manually.
                # In-progress requests are tracked manually.
                # Contacts for in-progress requests and private data are tracked manually.
                # We may have a contact to confirm the nonexistence of municipal subdivisions.
                # MFIPPA requests are tracked manually.
                # Additional details about license agreements and written consent are tracked manually.
                # We may have information for a bad shapefile from an in-progress request.
                if b and (
                    (key in ('Highrise URL', 'Request notes', 'Next boundary', 'Denial notes')) or
                    (key == 'Scraper?' and not a and record['Shapefile?'] == 'N/A') or
                    (key == 'Shapefile?' and not a and b in ('Request', 'Requested')) or
                    (key == 'Shapefile?' and a == 'Request' and b == 'Requested') or
                    (key == 'Contact' and not a and (row['Shapefile?'] == 'Requested' or record['Permission to distribute'] == 'N')) or
                    (key == 'Contact' and a == 'N/A' and record['Shapefile?'] == 'N/A') or
                    (key == 'Received via' and a == 'email' and b == 'MFIPPA') or
                    (key == 'Type of license' and '(' in b) or
                    (key in ('Received via', 'Type of license', 'Permission to distribute') and not a and row['Shapefile?'] == 'Requested')):
                    record[key] = b
                # The spreadsheet can add contacts and URLs.
                elif key != 'Population' and (key not in ('Contact', 'URL') or a):  # separators
                    sys.stderr.write(u'%-25s %s: expected "%s" got "%s"\n' % (key, geographic_code, a, b))

    writer = UnicodeWriter(sys.stdout)
    writer.writerow(headers)
    for _, record in records.items():
        writer.writerow([record[header] for header in headers])

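
# The csv_reader() helper used throughout spreadsheet() is not shown in this
# excerpt. Below is a minimal sketch of what it is assumed to do: fetch a URL
# and return a csv.reader over the decoded text (later revisions pass an
# explicit encoding, and a csv_dict_reader() counterpart presumably wraps
# csv.DictReader the same way). The real helper may stream or cache instead.
import csv
from io import StringIO

import requests


def csv_reader_sketch(url, encoding='utf-8'):
    """Download a CSV file and return a csv.reader over its rows."""
    response = requests.get(url)
    response.encoding = encoding
    return csv.reader(StringIO(response.text))
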
def definitions(base='.'):
    """
    Check that all definition.py files are valid.
    """
    def warn(message, slug):
        if message not in seen:
            print('%-60s %s' % (slug, message))
            seen.add(message)

    def assert_match(slug, field, actual, expected):
        if isinstance(expected, re._pattern_type):
            if not expected.search(actual):
                print('%-60s Expected %s to match %s not %s' % (slug, field, expected.pattern, actual))
        elif isinstance(expected, list):
            if actual not in expected:
                print('%-60s Expected %s to be %s not %s' % (slug, field, expected[-1], actual))
        elif actual != expected and expected is not None:
            print('%-60s Expected %s to be %s not %s' % (slug, field, expected, actual))

    def has_multiple_sets(division_id):
        ocd_type = division_id.rsplit('/', 1)[1].split(':')[0]
        return ocd_type in ('country', 'province', 'territory') or division_id in (
            'ocd-division/country:ca/csd:3520005',  # Toronto
        )

    response = requests.get('https://docs.google.com/spreadsheets/d/1AmLQD2KwSpz3B4eStLUPmUQJmOOjRLI3ZUZSD5xUTWM/pub?gid=0&single=true&output=csv')
    response.encoding = 'utf-8'
    reader = csv.DictReader(StringIO(response.text))
    licenses_with_templates = set(filter(None, (row['License URL'] for row in reader)))
    licenses_with_templates.update(more_licenses_with_templates)

    seen = set()
    division_ids = set()

    for slug, config in registry(base).items():
        directory = dirname(config['file'])

        if config.get('extra'):
            division_id = config['extra']['division_id']
        else:
            division_id = None

        # Validate LICENSE.txt.
        license_path = os.path.join(directory, 'LICENSE.txt')
        if os.path.exists(license_path):
            with open(license_path) as f:
                license_text = f.read().rstrip('\n')
            if 'licence_url' in config:
                licence_url = config['licence_url']
                if licence_url in licenses_with_templates:
                    if licence_url not in terms and not terms_re.get(licence_url):
                        warn('No LICENSE.txt template for License URL %s' % licence_url, slug)
                    elif licence_url in terms and license_text != (terms[licence_url] % licence_url) or terms_re.get(licence_url) and not terms_re[licence_url].search(license_text):
                        print('%-60s Expected LICENSE.txt to match license-specific template' % slug)
                elif licence_url in all_rights_reserved_licenses:
                    if not all_rights_reserved_terms_re.search(license_text):
                        print('%-60s Expected LICENSE.txt to match "all rights reserved" template' % slug)
                else:
                    print('%-60s Unrecognized License URL %s' % (slug, licence_url))
            elif not all_rights_reserved_terms_re.search(license_text):
                print('%-60s Expected LICENSE.txt to match "all rights reserved" template' % slug)

        # Check for invalid keys, non-unique or empty values.
        invalid_keys = set(config.keys()) - valid_keys
        if invalid_keys:
            print('%-60s Unrecognized key: %s' % (slug, ', '.join(invalid_keys)))
        values = [value for key, value in config.items() if key != 'extra']
        if len(values) > len(set(values)):
            print('%-60s Non-unique values' % slug)
        for key, value in config.items():
            if not value:
                print('%-60s Empty value for %s' % (slug, key))

        # Check for missing required keys.
        for key in ('domain', 'last_updated', 'name_func', 'authority', 'encoding'):
            if key not in config:
                print('%-60s Missing %s' % (slug, key))
        if 'source_url' not in config and 'data_url' in config:
            print('%-60s Missing source_url' % slug)
        if 'source_url' in config and 'licence_url' not in config and 'data_url' not in config:
            print('%-60s Missing licence_url or data_url' % slug)

        # Validate fields.
        if 'name' in config:
            print('%-60s Expected name to be missing' % slug)
        if 'singular' in config and not slug.endswith(')') and not has_multiple_sets(division_id):
            print('%-60s Expected singular to be missing' % slug)

        if slug not in ('Census divisions', 'Census subdivisions'):
            # Check for invalid keys or empty values.
            invalid_keys = set(config['extra'].keys()) - {'division_id'}
            if invalid_keys:
                print('%-60s Unrecognized key: %s' % (slug, ', '.join(invalid_keys)))
            for key, value in config['extra'].items():
                if not value:
                    print('%-60s Empty value for %s' % (slug, key))

            # Ensure division_id is unique.
            if division_id in division_ids and division_id not in divisions_with_boroughs() and not has_multiple_sets(division_id):
                print('%-60s Duplicate division_id %s' % (slug, division_id))
            else:
                division_ids.add(division_id)

            expected_slug, expected_config = get_definition(division_id, path=config['file'])

            # Check for unexpected values.
            if not slug.endswith(')'):
                assert_match(slug, 'slug', slug, expected_slug)
            for key, value in expected_config.items():
                assert_match(slug, key, config[key], value)

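
# For illustration only: the shape assumed for the module-level `terms` and
# `terms_re` lookups consulted by definitions() above. Keys are licence URLs;
# `terms` maps to a LICENSE.txt template with a %s placeholder for the URL
# (hence `terms[licence_url] % licence_url`), while `terms_re` holds compiled
# patterns for licences whose text varies. The real templates live elsewhere
# in the module; these names are suffixed to avoid suggesting otherwise.
terms_example = {
    'http://example.com/open-government-licence': (
        'Contains information licensed under an open government licence.\n'
        'See %s for details.'
    ),
}
terms_re_example = {
    'http://example.com/custom-licence': re.compile(r'\ACopyright \(c\) \d{4}'),
}
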
def spreadsheet(base='.', private_base='../represent-canada-private-data'):
    """
    Validate the spreadsheet for tracking progress on data collection.
    """
    expecteds = OrderedDict()

    # Append to `municipal_subdivisions` from `constants.py`.
    for division in Division.all('ca', from_csv=ocd_division_csv):
        if division.attrs['has_children']:
            municipal_subdivisions[type_id(division.id)] = division.attrs['has_children']

    expecteds['ocd-division/country:ca'] = default_expectation.copy()
    expecteds['ocd-division/country:ca'].update({
        'OCD': 'ocd-division/country:ca',
        'Geographic name': 'Canada',
    })

    # Create expectations for provinces and territories.
    for division in Division.all('ca', from_csv=ocd_division_csv):
        if division._type in ('province', 'territory'):
            expected = default_expectation.copy()
            expected.update({
                'OCD': division.id,
                'Geographic name': division.name,
                'Province or territory': type_id(division.id).upper(),
            })
            expecteds[division.id] = expected

    # Create expectations for census subdivisions.
    reader = csv_dict_reader('http://www12.statcan.gc.ca/census-recensement/2016/dp-pd/hlt-fst/pd-pl/Tables/CompFile.cfm?Lang=Eng&T=301&OFT=FULLCSV', 'ISO-8859-1')
    for row in reader:
        code = row['Geographic code']
        if code == 'Note:':
            break

        division = Division.get('ocd-division/country:ca/csd:%s' % row['Geographic code'], from_csv=ocd_division_csv)

        expected = default_expectation.copy()
        expected.update({
            'OCD': division.id,
            'Geographic name': division.name,
            'Province or territory': province_or_territory_abbreviation(division.id),
            'Geographic type': division.attrs['classification'],
            'Population': row['Population, 2016'],
        })

        if code in municipal_subdivisions:
            if municipal_subdivisions[code] == 'N':
                expected['Shapefile?'] = 'N/A'
                expected['Contact'] = 'N/A'
                expected['Received via'] = 'N/A'
                expected['Last boundary'] = 'N/A'
                expected['Permission to distribute'] = 'N/A'
            elif municipal_subdivisions[code] == 'Y':
                expected['Shapefile?'] = 'Request'

        expecteds[division.id] = expected

    # Merge information from received data.
    for directory, permission_to_distribute in [(base, 'Y'), (private_base, 'N')]:
        boundaries.registry = {}

        for slug, config in registry(directory).items():
            if 'extra' in config:
                division_id = config['extra']['division_id']
                if division_id in expecteds:
                    license_path = os.path.join(dirname(config['file']), 'LICENSE.txt')
                    license_text = ''
                    if os.path.exists(license_path):
                        with open(license_path) as f:
                            license_text = f.read().rstrip('\n')

                    expected = expecteds[division_id]
                    expected['Shapefile?'] = 'Y'
                    expected['Permission to distribute'] = permission_to_distribute

                    if 'data_url' in config:
                        expected['Last boundary'] = 'N/A'
                    elif 'last_updated' in config:
                        expected['Last boundary'] = config['last_updated'].strftime('%-m/%-d/%Y')

                    if 'source_url' in config:
                        expected['Contact'] = 'N/A'
                        expected['Received via'] = 'online'
                    elif expected['Province or territory'] == 'SK' and expected['Geographic type'] == 'RM':
                        expected['Contact'] = 'Mikael Nagel\nMapping Technician\nTel: 1-306-569-2988 x205\nToll free: 1-800-663-6864\[email protected]'
                        expected['Received via'] = 'purchase'
                    else:
                        match = all_rights_reserved_terms_re.search(license_text)
                        if match:
                            expected['Contact'] = match.group(1)
                            expected['Received via'] = 'email'

                # The spreadsheet doesn't track borough boundaries.
                elif '/borough:' not in division_id:
                    sys.stderr.write('%-25s no record\n' % division_id)
            else:
                sys.stderr.write('%-25s no extra\n' % slug)

    reader = csv_dict_reader('https://docs.google.com/spreadsheets/d/1ihCIDc9EtvxF7kzPg3Yk6e928DN7JzaycH92IBYr0QU/pub?gid=25&single=true&output=csv', 'utf-8')

    actuals = set()
    for actual in reader:
        expected = expecteds[actual['OCD']]
        actuals.add(actual['OCD'])

        for key in actual.keys() - ('Population', 'URL', 'Request notes', 'Next boundary', 'Response notes'):
            e = expected[key]
            a = actual[key]
            # Note: Some of the following conditions are repetitive for readability.
            if e != a:
                # Change expectations for in-progress requests.
                if expected['Shapefile?'] == 'Request' and actual['Shapefile?'] == 'Requested' and (
                        # Request sent.
                        (key == 'Shapefile?' and e == 'Request' and a == 'Requested') or
                        # Contact found.
                        (key == 'Contact' and not e and a)):
                    continue
                # Some provinces and territories have missing expectations, in which case we defer to the spreadsheet.
                elif actual['Province or territory'] in ('ON', 'MB') and not expected['Shapefile?'] and (
                        # We determined that we needed to request boundaries.
                        (key == 'Shapefile?' and not e and a in ('Request', 'Requested')) or
                        # We determined that we needed to request boundaries, and did so.
                        (actual['Shapefile?'] == 'Requested' and key == 'Contact' and not e and a)):
                    continue
                # Contacts for private data are only stored in the spreadsheet.
                elif ((expected['Permission to distribute'] == 'N' and key == 'Contact' and not e and a) or
                        # MFIPPA receptions are only stored in the spreadsheet.
                        (key == 'Received via' and e == 'email' and a == 'MFIPPA')):
                    continue
                sys.stderr.write('%-25s %s: expected "%s" got "%s"\n' % (key, actual['OCD'], e, a))

    for division_id in expecteds.keys() - actuals:
        record = expecteds[division_id]
        if record['Shapefile?'] != 'N/A':
            sys.stderr.write('%s\t%s\t%s\t%s\t%s\n' % (division_id, record['Geographic name'], record['Province or territory'], record['Geographic type'], record['Population']))

    for division_id in actuals - expecteds.keys():
        sys.stderr.write('Remove %s\n' % division_id)

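
# For illustration only: the shape assumed for the module-level
# `default_expectation` that spreadsheet() copies for every division. The
# column names below are the ones read and written in this file, with empty
# strings as defaults; the real constant may order or extend them differently.
default_expectation_example = {
    'OCD': '',
    'Geographic name': '',
    'Geographic type': '',
    'Province or territory': '',
    'Population': '',
    'URL': '',
    'Shapefile?': '',
    'Contact': '',
    'Request notes': '',
    'Received via': '',
    'Last boundary': '',
    'Next boundary': '',
    'Permission to distribute': '',
    'Highrise URL': '',
    'Response notes': '',
}
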
def spreadsheet(base='.', private_base='../represent-canada-private-data'):
    sgc_code_to_ocdid_map = sgc_code_to_ocdid()
    ocdid_to_sgc_code_map = {v: k for k, v in sgc_code_to_ocdid_map.items()}

    records = OrderedDict()

    reader = csv_reader('https://raw.githubusercontent.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_municipal_subdivisions-has_children.csv')
    next(reader)
    for row in reader:
        municipal_subdivisions[row[0].split(':')[-1]] = row[1]

    urls = {}
    reader = csv_reader('https://raw.githubusercontent.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_census_subdivisions-url.csv')
    next(reader)
    for row in reader:
        urls[row[0].split(':')[-1]] = row[1]

    abbreviations = {}
    reader = csv_reader('https://raw.githubusercontent.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_provinces_and_territories.csv')
    next(reader)
    for row in reader:
        abbreviations[row[2]] = row[0].split(':')[-1].upper()

    # Create records for provinces and territories.
    reader = csv_reader('https://raw.githubusercontent.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_provinces_and_territories.csv')
    next(reader)
    for row in reader:
        records[ocdid_to_sgc_code_map[row[0]]] = {
            'OCD': row[0],
            'Geographic code': ocdid_to_sgc_code_map[row[0]],
            'Geographic name': row[1],
            'Geographic type': '',
            'Province or territory': row[0].split(':')[-1].upper(),
            'Population': '',
            'URL': '',
            'Shapefile?': '',
            'Contact': '',
            'Request notes': '',
            'Received via': '',
            'Last boundary': '',
            'Next boundary': '',
            'Permission to distribute': '',
            'Highrise URL': '',
            'Type of license': '',
            'License URL': '',
            'Denial notes': '',
        }

    # Create records for census subdivisions.
    reader = csv_reader('https://www12.statcan.gc.ca/census-recensement/2011/dp-pd/hlt-fst/pd-pl/FullFile.cfm?T=301&LANG=Eng&OFT=CSV&OFN=98-310-XWE2011002-301.CSV', 'ISO-8859-1')
    next(reader)  # title
    next(reader)  # headers
    for row in reader:
        if row:
            result = re.search(r'\A(.+) \((.+)\)\Z', row[1])
            if result:
                name = result.group(1)
                province_or_territory = abbreviations[result.group(2)]
            elif row[1] == 'Canada':
                name = 'Canada'
                province_or_territory = ''
            else:
                raise Exception('Unrecognized name "%s"' % row[1])

            record = {
                'OCD': sgc_code_to_ocdid_map[row[0]],
                'Geographic code': row[0],
                'Geographic name': name,
                'Province or territory': province_or_territory,
                'Geographic type': row[2],
                'Population': row[4],
                'URL': urls.get(row[0], ''),
                'Contact': '',
                'Request notes': '',
                'Received via': '',
                'Last boundary': '',
                'Next boundary': '',
                'Permission to distribute': '',
                'Highrise URL': '',
                'Type of license': '',
                'License URL': '',
                'Denial notes': '',
            }

            if municipal_subdivisions.get(row[0]):
                if municipal_subdivisions[row[0]] == 'N':
                    for header in ['Shapefile?'] + request_headers + receipt_headers:
                        record[header] = 'N/A'
                elif municipal_subdivisions[row[0]] == 'Y':
                    record['Shapefile?'] = 'Request'
                elif municipal_subdivisions[row[0]] == '?':
                    record['Shapefile?'] = ''
            else:
                record['Shapefile?'] = ''

            records[row[0]] = record
        else:
            break

    # Merge information from received data.
    for directory, permission_to_distribute in [(base, 'Y'), (private_base, 'N')]:
        boundaries.registry = {}

        for slug, config in registry(directory).items():
            if config.get('metadata'):
                geographic_code = config['metadata'].get('geographic_code')
                if geographic_code:
                    license_path = os.path.join(dirname(config['file']), 'LICENSE.txt')
                    license_text = ''
                    if os.path.exists(license_path):
                        with open(license_path) as f:
                            license_text = f.read().rstrip('\n')

                    record = records[geographic_code]
                    record['Shapefile?'] = 'Y'
                    record['Permission to distribute'] = permission_to_distribute
                    record['License URL'] = config.get('licence_url', '')

                    if config.get('last_updated'):
                        record['Last boundary'] = config['last_updated'].strftime('%-m/%-d/%Y')

                    if config.get('source_url'):
                        record['Contact'] = 'N/A'
                        record['Received via'] = 'online'
                    else:
                        match = all_rights_reserved_terms_re.search(license_text)
                        if match:
                            record['Contact'] = match.group(1)
                            record['Received via'] = 'email'

                    if config.get('licence_url'):
                        licence_url = config['licence_url']
                        if licence_url in open_data_licenses:
                            record['Type of license'] = 'Open'
                        elif licence_url in some_rights_reserved_licenses:
                            record['Type of license'] = 'Most rights reserved'
                        elif licence_url in all_rights_reserved_licenses:
                            record['Type of license'] = 'All rights reserved'
                        else:
                            raise Exception(licence_url)
                    elif all_rights_reserved_terms_re.search(license_text):
                        record['Type of license'] = 'All rights reserved'
                    else:
                        record['Type of license'] = 'Custom'
                elif not config['metadata'].get('ocd_division'):
                    sys.stderr.write('%-60s No geographic_code or ocd_division\n' % slug)
            else:
                sys.stderr.write('%-60s No metadata\n' % slug)

    response = requests.get('https://docs.google.com/spreadsheet/pub?key=0AtzgYYy0ZABtdGpJdVBrbWtUaEV0THNUd2JIZ1JqM2c&single=true&gid=25&output=csv')
    response.encoding = 'utf-8'
    reader = csv.DictReader(StringIO(response.text))
    for row in reader:
        geographic_code = row['Geographic code']
        record = records[geographic_code]
        for key in row:
            a = record[key]
            b = row[key]
            if a != b:
                # Columns that are always tracked manually.
                # Scrapers for municipalities without wards are tracked manually.
                # In-progress requests are tracked manually.
                # Contacts for in-progress requests and private data are tracked manually.
                # We may have a contact to confirm the nonexistence of municipal subdivisions.
                # MFIPPA requests are tracked manually.
                # Additional details about license agreements and written consent are tracked manually.
                # We may have information for a bad shapefile from an in-progress request.
                if b and (
                    (key in ('Highrise URL', 'Request notes', 'Next boundary', 'Denial notes')) or
                    (key == 'Shapefile?' and not a and b in ('Request', 'Requested')) or
                    (key == 'Shapefile?' and a == 'Request' and b == 'Requested') or
                    (key == 'Contact' and not a and (row['Shapefile?'] == 'Requested' or record['Permission to distribute'] == 'N')) or
                    (key == 'Contact' and a == 'N/A' and record['Shapefile?'] == 'N/A') or
                    (key == 'Received via' and a == 'email' and b == 'MFIPPA') or
                    (key == 'Type of license' and '(' in b) or
                    (key in ('Received via', 'Type of license', 'Permission to distribute') and not a and row['Shapefile?'] == 'Requested')):
                    record[key] = b
                # The spreadsheet can add contacts and URLs.
                elif key != 'Population' and (key not in ('Contact', 'URL') or a):  # separators
                    sys.stderr.write('%-25s %s: expected "%s" got "%s"\n' % (key, geographic_code, a, b))

    writer = csv.writer(sys.stdout)
    writer.writerow(headers)
    for _, record in records.items():
        writer.writerow([record[header] for header in headers])

def definitions(base='.'):
    """
    Check that all definition.py files are valid.
    """
    def warn(message, slug):
        if message not in seen:
            print('%-50s %s' % (slug, message))
            seen.add(message)

    borough_division_ids = (
        'ocd-division/country:ca/csd:2458227',  # Longueuil
        'ocd-division/country:ca/csd:2466023',  # Montréal
        'ocd-division/country:ca/csd:2423027',  # Québec
        'ocd-division/country:ca/csd:2494068',  # Saguenay
        'ocd-division/country:ca/csd:2443027',  # Sherbrooke
    )

    seen = set()
    division_ids = set()

    for slug, config in registry(base).items():
        directory = dirname(config['file'])

        # Validate LICENSE.txt.
        license_path = os.path.join(directory, 'LICENSE.txt')
        if os.path.exists(license_path):
            with open(license_path) as f:
                license_text = f.read().rstrip('\n')
            if config.get('licence_url'):
                licence_url = config['licence_url']
                if licence_url in open_data_licenses or licence_url in some_rights_reserved_licenses:
                    if not terms.get(licence_url) and not terms_re.get(licence_url):
                        warn('No LICENSE.txt template for License URL %s' % licence_url, slug)
                    elif terms.get(licence_url) and license_text != terms[licence_url] or terms_re.get(licence_url) and not terms_re[licence_url].search(license_text):
                        print('%-50s Expected LICENSE.txt to match license-specific template' % slug)
                elif licence_url in all_rights_reserved_licenses:
                    if not all_rights_reserved_terms_re.search(license_text):
                        print('%-50s Expected LICENSE.txt to match "all rights reserved" template' % slug)
                else:
                    print('%-50s Unrecognized License URL %s' % (slug, licence_url))
            elif not all_rights_reserved_terms_re.search(license_text):
                print('%-50s Expected LICENSE.txt to match "all rights reserved" template' % slug)

        # Check for invalid keys, non-unique or empty values.
        invalid_keys = set(config.keys()) - valid_keys
        if invalid_keys:
            print('%-50s Unrecognized key: %s' % (slug, ', '.join(invalid_keys)))
        values = [value for key, value in config.items() if key != 'metadata']
        if len(values) > len(set(values)):
            print('%-50s Non-unique values' % slug)
        for key, value in config.items():
            if not value:
                print('%-50s Empty value for %s' % (slug, key))

        # Check for missing required keys.
        for key in ('domain', 'last_updated', 'name_func', 'authority', 'encoding'):
            if not config.get(key):
                print('%-50s Missing %s' % (slug, key))
        if not config.get('source_url') and config.get('data_url'):
            print('%-50s Missing source_url' % slug)
        if config.get('source_url') and not config.get('licence_url') and not config.get('data_url'):
            print('%-50s Missing licence_url or data_url' % slug)

        # Validate fields.
        for key in ('name', 'singular'):
            if config.get(key):
                print('%-50s Expected %s to be missing' % (slug, key))

        if slug not in ('Census divisions', 'Census subdivisions'):
            # Check for invalid keys or empty values.
            invalid_keys = set(config['metadata'].keys()) - valid_metadata_keys
            if invalid_keys:
                print('%-50s Unrecognized key: %s' % (slug, ', '.join(invalid_keys)))
            for key, value in config['metadata'].items():
                if not value:
                    print('%-50s Empty value for %s' % (slug, key))

            division_id = get_division_id(slug, config)

            # Ensure division_id is unique.
            if division_id in division_ids and division_id not in borough_division_ids:
                print('%-50s Duplicate division_id %s' % (slug, division_id))
            else:
                division_ids.add(division_id)

            expected_slug, expected_config = get_definition(division_id)

            # Check for unexpected values.
            assert_match(slug, 'slug', slug, expected_slug)
            for key, value in expected_config.items():
                assert_match(slug, key, config[key], value)

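
# A possible way to run these checks from the command line, assuming this file
# is importable as a module named tasks (the actual entry point, for example a
# task runner, may differ):
#
#     python -c "import tasks; tasks.definitions()"
#     python -c "import tasks; tasks.spreadsheet()" > spreadsheet.csv
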
def spreadsheet(base='.', private_base='../represent-canada-private-data'):
    sgc_code_to_ocdid_map = sgc_code_to_ocdid()

    records = OrderedDict()

    reader = csv_reader('https://raw.githubusercontent.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_municipal_subdivisions-has_children.csv')
    next(reader)
    for row in reader:
        municipal_subdivisions[row[0].split(':')[-1]] = row[1]

    urls = {}
    reader = csv_reader('https://raw.githubusercontent.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_census_subdivisions-url.csv')
    next(reader)
    for row in reader:
        urls[row[0].split(':')[-1]] = row[1]

    abbreviations = {}
    reader = csv_reader('https://raw.githubusercontent.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_provinces_and_territories.csv')
    next(reader)
    for row in reader:
        abbreviations[row[2]] = row[0].split(':')[-1].upper()

    # Create records for provinces and territories.
    reader = csv_reader('https://raw.githubusercontent.com/opencivicdata/ocd-division-ids/master/identifiers/country-ca/ca_provinces_and_territories.csv')
    next(reader)
    for row in reader:
        records[row[0]] = {
            'OCD': row[0],
            'Geographic name': row[1],
            'Geographic type': '',
            'Province or territory': row[0].split(':')[-1].upper(),
            'Population': '',
            'URL': '',
            'Shapefile?': '',
            'Contact': '',
            'Request notes': '',
            'Received via': '',
            'Last boundary': '',
            'Next boundary': '',
            'Permission to distribute': '',
            'Highrise URL': '',
            'Response notes': '',
        }

    # Create records for census subdivisions.
    reader = csv_reader('http://www12.statcan.gc.ca/census-recensement/2011/dp-pd/hlt-fst/pd-pl/FullFile.cfm?T=301&LANG=Eng&OFT=CSV&OFN=98-310-XWE2011002-301.CSV', 'ISO-8859-1')
    next(reader)  # title
    next(reader)  # headers
    for row in reader:
        if row:
            result = re.search(r'\A(.+) \((.+)\)\Z', row[1])
            if result:
                name = result.group(1)
                province_or_territory = abbreviations[result.group(2)]
            elif row[1] == 'Canada':
                name = 'Canada'
                province_or_territory = ''
            else:
                raise Exception('Unrecognized name "%s"' % row[1])

            division_id = sgc_code_to_ocdid_map[row[0]]

            record = {
                'OCD': division_id,
                'Geographic name': name,
                'Province or territory': province_or_territory,
                'Geographic type': row[2],
                'Population': row[4],
                'URL': urls.get(row[0], ''),
                'Contact': '',
                'Request notes': '',
                'Received via': '',
                'Last boundary': '',
                'Next boundary': '',
                'Permission to distribute': '',
                'Highrise URL': '',
                'Response notes': '',
            }

            if row[0] in municipal_subdivisions:
                if municipal_subdivisions[row[0]] == 'N':
                    for header in ['Shapefile?'] + request_headers + receipt_headers:
                        record[header] = 'N/A'
                elif municipal_subdivisions[row[0]] == 'Y':
                    record['Shapefile?'] = 'Request'
                elif municipal_subdivisions[row[0]] == '?':
                    record['Shapefile?'] = ''
            else:
                record['Shapefile?'] = ''

            records[division_id] = record
        else:
            break

    # Merge information from received data.
    for directory, permission_to_distribute in [(base, 'Y'), (private_base, 'N')]:
        boundaries.registry = {}

        for slug, config in registry(directory).items():
            if 'extra' in config:
                division_id = config['extra']['division_id']
                if division_id in records:
                    license_path = os.path.join(dirname(config['file']), 'LICENSE.txt')
                    license_text = ''
                    if os.path.exists(license_path):
                        with open(license_path) as f:
                            license_text = f.read().rstrip('\n')

                    record = records[division_id]
                    record['Shapefile?'] = 'Y'
                    record['Permission to distribute'] = permission_to_distribute

                    if 'last_updated' in config:
                        record['Last boundary'] = config['last_updated'].strftime('%-m/%-d/%Y')

                    if 'source_url' in config:
                        record['Contact'] = 'N/A'
                        record['Received via'] = 'online'
                    else:
                        match = all_rights_reserved_terms_re.search(license_text)
                        if match:
                            record['Contact'] = match.group(1)
                            record['Received via'] = 'email'

                # The spreadsheet doesn't track borough boundaries.
                elif '/borough:' not in division_id:
                    sys.stderr.write('%-25s no record\n' % division_id)
            else:
                sys.stderr.write('%-25s no extra\n' % slug)

    response = requests.get('https://docs.google.com/spreadsheets/d/1ihCIDc9EtvxF7kzPg3Yk6e928DN7JzaycH92IBYr0QU/pub?gid=25&single=true&output=csv')
    response.encoding = 'utf-8'
    reader = csv.DictReader(StringIO(response.text))
    for row in reader:
        record = records[row['OCD']]
        for key in row:
            a = record[key]
            b = row[key]
            if a != b:
                if b and (
                    # Columns that are always tracked manually.
                    (key in ('Highrise URL', 'Request notes', 'Next boundary', 'Response notes')) or
                    # Scrapers for municipalities without wards are tracked manually.
                    (key == 'Shapefile?' and not a and b in ('Request', 'Requested')) or
                    # In-progress requests are tracked manually.
                    (key == 'Shapefile?' and a == 'Request' and b == 'Requested') or
                    # Contacts for in-progress requests and private data are tracked manually.
                    (key == 'Contact' and not a and (row['Shapefile?'] == 'Requested' or record['Permission to distribute'] == 'N')) or
                    # We may have a contact to confirm the nonexistence of municipal subdivisions.
                    (key == 'Contact' and a == 'N/A' and record['Shapefile?'] == 'N/A') or
                    # MFIPPA requests are tracked manually.
                    (key == 'Received via' and a == 'email' and b == 'MFIPPA') or
                    # We may have information for a bad shapefile from an in-progress request.
                    (key in ('Received via', 'Permission to distribute') and not a and row['Shapefile?'] == 'Requested')):
                    record[key] = b
                # The spreadsheet can add contacts and URLs.
                elif key != 'Population' and (key not in ('Contact', 'URL') or a):  # separators
                    sys.stderr.write('%-25s %s: expected "%s" got "%s"\n' % (key, row['OCD'], a, b))

    writer = csv.writer(sys.stdout)
    writer.writerow(headers)
    for _, record in records.items():
        writer.writerow([record[header] for header in headers])

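
# For illustration only: the kind of module-level `headers` list the writer at
# the end of spreadsheet() expects, i.e. the spreadsheet's column order. The
# names below are the record keys used in this revision; the real constant may
# order or extend them differently.
headers_example = [
    'OCD', 'Geographic name', 'Geographic type', 'Province or territory',
    'Population', 'URL', 'Shapefile?', 'Contact', 'Request notes',
    'Received via', 'Last boundary', 'Next boundary',
    'Permission to distribute', 'Highrise URL', 'Response notes',
]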