def test_patched_release_schema_with_release_schema_patch_url():
    """A direct release-schema.json URL can be given as an extension."""
    patch_url = 'https://raw.githubusercontent.com/open-contracting-extensions/ocds_coveredBy_extension/master/release-schema.json'  # noqa: E501
    profile = ProfileBuilder('1__1__4', [patch_url])
    patched = profile.patched_release_schema()

    assert '$schema' in patched
    assert 'coveredBy' in patched['definitions']['Tender']['properties']
def test_extensions():
    """Extensions are yielded with their metadata resolved from the registry."""
    profile = ProfileBuilder('1__1__4', {'charges': 'master', 'location': 'v1.1.4'})
    extensions = list(profile.extensions())

    expected = [
        {
            'id': 'charges',
            'date': '',
            'version': 'master',
            'base_url': 'https://raw.githubusercontent.com/open-contracting-extensions/ocds_charges_extension/master/',
            'download_url': 'https://github.com/open-contracting-extensions/ocds_charges_extension/archive/master.zip',
        },
        {
            'id': 'location',
            'date': '2019-02-25',
            'version': 'v1.1.4',
            'base_url': 'https://raw.githubusercontent.com/open-contracting-extensions/ocds_location_extension/v1.1.4/',
            'download_url': 'https://api.github.com/repos/open-contracting-extensions/ocds_location_extension/zipball/v1.1.4',  # noqa: E501
        },
    ]

    assert len(extensions) == 2
    for extension, as_dict in zip(extensions, expected):
        assert extension.as_dict() == as_dict
def test_patched_release_schema_with_download_url():
    """An extension can be given as a ZIP archive download URL."""
    archive_url = 'https://github.com/open-contracting-extensions/ocds_coveredBy_extension/archive/master.zip'
    profile = ProfileBuilder('1__1__4', [archive_url])
    patched = profile.patched_release_schema()

    assert '$schema' in patched
    assert 'coveredBy' in patched['definitions']['Tender']['properties']
def test_patched_release_schema_with_absolute_path():
    """An extension can be given as a local file:// URI."""
    file_uri = Path(path('ocds_coveredBy_extension')).resolve().as_uri()
    profile = ProfileBuilder('1__1__4', [file_uri])
    patched = profile.patched_release_schema()

    assert '$schema' in patched
    assert 'coveredBy' in patched['definitions']['Tender']['properties']
def test_get_standard_file_contents():
    """get_standard_file_contents() returns parsable JSON, and repeat calls
    return the same content.
    """
    builder = ProfileBuilder('1__1__4', {})

    data = builder.get_standard_file_contents('release-schema.json')
    # Repeat requests should return the same result. Previously the first
    # result was discarded and this was never actually asserted.
    repeated = builder.get_standard_file_contents('release-schema.json')

    assert data == repeated
    assert json.loads(data)
def test_patched_release_schema_with_schema_base_url():
    """When schema_base_url is given, the schema's `id` is rewritten under it."""
    base = 'https://standard.open-contracting.org/profiles/ppp/schema/1__0__0__beta/'
    profile = ProfileBuilder('1__1__4', {}, schema_base_url=base)
    patched = profile.patched_release_schema()

    # Changes `id`.
    assert patched['id'] == base + 'release-schema.json'
def test_merge_with_schema():
    """Merging with a patched schema applies extension merge strategies."""
    profile = ProfileBuilder('1__1__4', {'additionalContactPoint': 'master'})
    patched = profile.patched_release_schema()

    releases = json.loads(read('release-package_additional-contact-points.json'))['releases']
    compiled_release, *_ = merge(releases, schema=patched)

    assert compiled_release == json.loads(read('compile_extensions.json'))
def test_patched_release_schema_with_extension_field():
    """extension_field annotates each patched schema node with the extension name."""
    profile = ProfileBuilder('1__1__4', {'location': 'v1.1.4'})
    patched = profile.patched_release_schema(extension_field='extension')

    location = patched['definitions']['Location']
    geometry = location['properties']['geometry']

    assert location['extension'] == 'Location'
    assert geometry['extension'] == 'Location'
    assert geometry['properties']['type']['extension'] == 'Location'
def test_release_package_schema_with_schema_base_url():
    """When schema_base_url is given, `id` and `$ref` are rewritten under it."""
    base = 'https://standard.open-contracting.org/profiles/ppp/schema/1__0__0__beta/'
    profile = ProfileBuilder('1__1__4', {}, schema_base_url=base)
    package_schema = profile.release_package_schema()

    # Changes `id` and `$ref`.
    assert package_schema['id'] == base + 'release-package-schema.json'
    assert package_schema['properties']['releases']['items']['$ref'] == base + 'release-schema.json'
def get_patched_schema(self):
    """Download the release schema, apply the configured extensions,
    dereference all ``$ref`` pointers, and write the result to
    ``release-schema.json`` in the working directory.

    :returns: the dereferenced, patched schema as a dict
    """
    schema_response = requests.get(self.schema_url)
    schema = schema_response.json()

    builder = ProfileBuilder(None, self.extensions_info.extension_urls)
    schema = builder.patched_release_schema(
        schema=schema, extension_field=self.extension_field)
    schema = jsonref.JsonRef.replace_refs(schema)

    # JSON must be UTF-8; the default encoding of open() is
    # platform-dependent, so specify it explicitly.
    with open('release-schema.json', 'w', encoding='utf-8') as f:
        jsonref.dump(schema, f)

    return schema
def test_patched_release_schema_with_extension_field_and_language():
    """extension_field annotations use the extension's name in the given language."""
    profile = ProfileBuilder('1__1__4', [
        'https://extensions.open-contracting.org/en/extensions/location/master/'
    ])
    patched = profile.patched_release_schema(extension_field='extension', language='es')

    location = patched['definitions']['Location']
    geometry = location['properties']['geometry']

    assert location['extension'] == 'Ubicación'
    assert geometry['extension'] == 'Ubicación'
    assert geometry['properties']['type']['extension'] == 'Ubicación'
def test_extension_codelists(caplog):
    """Extension codelists are collected, additions/removals combined, and
    redundant codelist patches ignored with a log message.
    """
    caplog.set_level(logging.INFO, logger='ocdsextensionregistry')

    # Note: We can't yet test, using real data, whether an error is raised if a codelist replacement either doesn't
    # contain added codes, or contains removed codes. If we were to use test data, we could create a test registry
    # and test extensions, or mock HTTP requests…. For now, additions were tested manually. We also can't yet test
    # whether an error is raised if two codelist replacements differ.

    # charges and tariffs both have chargePaidBy.csv, but the content is identical, so should not error. ppp has
    # documentType.csv and tariffs has +documentType.csv, but documentType.csv contains the codes added by
    # +documentType.csv, so should not error. ppp and enquiries both have +partyRole.csv.
    profile = ProfileBuilder('1__1__4', {
        'https://raw.githubusercontent.com/open-contracting-extensions/ocds_ppp_extension/70c5cb759d4739d1eca5db832e723afb69bbdae0/',  # noqa: E501
        'https://github.com/open-contracting-extensions/ocds_enquiry_extension/archive/v1.1.4.zip',
        'https://github.com/open-contracting-extensions/ocds_charges_extension/archive/master.zip',
        'https://github.com/open-contracting-extensions/ocds_tariffs_extension/archive/1.1.zip',
    })
    codelists = sorted(profile.extension_codelists())
    plus_party_role = next(codelist for codelist in codelists if codelist.name == '+partyRole.csv')

    # Collects codelists.
    expected_names = sorted([
        '+milestoneType.csv',
        '+partyRole.csv',
        '+releaseTag.csv',
        '-partyRole.csv',
        'documentType.csv',
        'initiationType.csv',
    ] + new_extension_codelists)
    assert len(codelists) == 9
    assert [codelist.name for codelist in codelists] == expected_names

    # Preserves content.
    milestone_type = codelists[0]
    assert milestone_type.name == '+milestoneType.csv'
    assert len(milestone_type) == 2
    first_row = milestone_type[0]
    assert len(first_row) == 4
    assert first_row['Code'] == 'procurement'
    assert first_row['Title'] == 'Procurement'
    assert first_row['Description'].startswith('Events taking place during the procurement which are not ')
    assert first_row['Source'] == ''

    # Combines codelist additions and removals.
    assert len(plus_party_role) == 13
    assert sorted(plus_party_role)[-1]['Code'] == 'socialWitness'

    # Logs ignored codelists.
    assert len(caplog.records) == 1
    assert caplog.records[-1].levelname == 'INFO'
    assert caplog.records[-1].message == \
        'documentType.csv has the codes added by +documentType.csv - ignoring +documentType.csv'
def test_patched_release_schema():
    # The ppp extension is used to test null values; location adds a definition.
    profile = ProfileBuilder('1__1__5', {
        'https://raw.githubusercontent.com/open-contracting-extensions/ocds_ppp_extension/70c5cb759d4739d1eca5db832e723afb69bbdae0/',  # noqa: E501
        'https://github.com/open-contracting-extensions/ocds_location_extension/archive/v1.1.5.zip',
    })
    patched = profile.patched_release_schema()

    # Patches core.
    assert '$schema' in patched
    assert 'Location' in patched['definitions']

    # Removes null'ed fields.
    assert 'buyer' not in patched['properties']
def test_release_schema_patch():
    # The ppp extension is used to test null values.
    profile = ProfileBuilder('1__1__4', {
        'https://raw.githubusercontent.com/open-contracting-extensions/ocds_ppp_extension/70c5cb759d4739d1eca5db832e723afb69bbdae0/',  # noqa: E501
        'https://github.com/open-contracting-extensions/ocds_location_extension/archive/v1.1.4.zip',
    })
    patch = profile.release_schema_patch()

    # Merges patches.
    assert 'Location' in patch['definitions']

    # Preserves null values.
    assert patch['properties']['buyer'] is None
    assert 'REPLACE_WITH_NULL' not in json.dumps(patch)
def parse_schema(self, input_format, schema=None):
    """Resolve (or download) the package schema for the input format and store
    the per-item (release/record) schema on self.
    """
    if schema:
        schema = resolve_file_uri(schema)

    release_based = "release" in input_format
    pkg_type = "releases" if release_based else "records"
    getter = attrgetter(
        "release_package_schema" if release_based else "record_package_schema")

    urls = DEFAULT_SCHEMA_URL[pkg_type]
    url = urls.get(self.language[:2], urls["en"])

    if not schema:
        LOGGER.info(
            _("No schema provided, using version {}").format(
                CURRENT_SCHEMA_TAG))
        profile = ProfileBuilder(CURRENT_SCHEMA_TAG, {}, schema_base_url=url)
        schema = getter(profile)()

    title = schema.get("title", "").lower()
    if not title:
        raise ValueError(
            _("Incomplete schema, please make sure your data is correct"))

    if "package" in title:
        # TODO: is this a good way to get the release/record schema?
        schema = jsonref.JsonRef.replace_refs(schema)
        schema = schema["properties"][pkg_type]["items"]

    self.schema = schema
    self.pkg_type = pkg_type
def test_record_package_schema_with_schema_base_url_and_embed():
    """With embed=True, `$ref`s to the release schema are replaced by copies."""
    base = 'https://standard.open-contracting.org/profiles/ppp/schema/1__0__0__beta/'
    profile = ProfileBuilder('1__1__4', {}, schema_base_url=base)
    package_schema = profile.record_package_schema(embed=True)

    record_properties = package_schema['definitions']['record']['properties']
    versioned_items = record_properties['releases']['oneOf'][1]['items']

    # Changes `id` and `$ref`.
    assert package_schema['id'] == base + 'record-package-schema.json'
    assert record_properties['compiledRelease']['id'] == base + 'release-schema.json'
    assert versioned_items['id'] == base + 'release-schema.json'
    assert '$ref' not in record_properties['compiledRelease']
    assert '$ref' not in versioned_items
def test_standard_codelists():
    """The standard's codelists are collected with their content intact."""
    codelists = ProfileBuilder('1__1__4', {}).standard_codelists()

    # Collects codelists.
    assert len(codelists) == 19
    assert [codelist.name for codelist in codelists] == standard_codelists

    # Preserves content.
    first = codelists[0]
    assert first.name == 'awardCriteria.csv'
    assert len(first) == 8
    row = first[0]
    assert len(row) == 4
    assert row['Code'] == 'priceOnly'
    assert row['Title'] == 'Price only'
    assert row['Description'].startswith('The award will be made to the qualified bid with the lowest ')
    assert row['Deprecated'] == ''
def test_patched_codelists(caplog):
    """Extension codelists are merged into the standard's, with redundant
    patches ignored and logged.
    """
    caplog.set_level(logging.INFO, logger='ocdsextensionregistry')

    profile = ProfileBuilder('1__1__4', [
        'https://raw.githubusercontent.com/open-contracting-extensions/ocds_ppp_extension/70c5cb759d4739d1eca5db832e723afb69bbdae0/',  # noqa: E501
        'https://github.com/open-contracting-extensions/ocds_charges_extension/archive/master.zip',
        'https://github.com/open-contracting-extensions/ocds_tariffs_extension/archive/1.1.zip',
    ])
    codelists = profile.patched_codelists()
    party_role = next(codelist for codelist in codelists if codelist.name == 'partyRole.csv')
    initiation_type = next(codelist for codelist in codelists if codelist.name == 'initiationType.csv')

    # Collects codelists.
    assert len(codelists) == 22
    assert [codelist.name for codelist in codelists] == standard_codelists + new_extension_codelists

    # Preserves content.
    first = codelists[0]
    assert first.name == 'awardCriteria.csv'
    assert len(first) == 8
    row = first[0]
    assert len(row) == 4
    assert row['Code'] == 'priceOnly'
    assert row['Title'] == 'Price only'
    assert row['Description'].startswith('The award will be made to the qualified bid with the lowest ')
    assert row['Deprecated'] == ''

    # Adds codes.
    assert any(row['Code'] == 'publicAuthority' for row in party_role)
    # Removes codes.
    assert not any(row['Code'] == 'buyer' for row in party_role)
    # Replaces list.
    assert all(row['Code'] == 'ppp' for row in initiation_type)

    # Logs ignored codelists.
    assert len(caplog.records) == 1
    assert caplog.records[-1].levelname == 'INFO'
    assert caplog.records[-1].message == \
        'documentType.csv has the codes added by +documentType.csv - ignoring +documentType.csv'
def get_schema(language, pkg_type):
    """Download the package schema for the language/package type and return
    the per-item (release/record) schema.
    """
    url = DEFAULT_SCHEMA_URL[pkg_type][language]
    attr = ("release_package_schema" if "releases" in pkg_type
            else "record_package_schema")

    profile = ProfileBuilder(CURRENT_SCHEMA_TAG, {}, schema_base_url=url)
    schema = attrgetter(attr)(profile)()

    title = schema.get("title", "").lower()
    if "package" in title:
        schema = jsonref.JsonRef.replace_refs(schema)
        schema = schema["properties"][pkg_type]["items"]
    return schema
def handle(self):
    """Load the schema file, optionally patch it with extensions and replace
    $refs, then print a mapping sheet to stdout.
    """
    with open(self.args.file) as f:
        schema = json.load(f)

    if self.args.extension:
        profile = ProfileBuilder(None, self.args.extension)
        schema = profile.patched_release_schema(
            schema=schema, extension_field=self.args.extension_field)

    if not self.args.no_replace_refs:
        base_uri = pathlib.Path(os.path.realpath(self.args.file)).as_uri()
        schema = jsonref.JsonRef.replace_refs(schema, base_uri=base_uri)

    try:
        mapping_sheet(
            schema,
            sys.stdout,
            order_by=self.args.order_by,
            infer_required=self.args.infer_required,
            extension_field=self.args.extension_field,
            include_deprecated=not self.args.no_deprecated,
            include_definitions=self.args.no_replace_refs,
        )
    except MissingColumnError as e:
        raise CommandError(str(e)) from e
def merge(data, uri='', publisher=None, published_date='', version=DEFAULT_VERSION, schema=None,
          return_versioned_release=False, return_package=False, use_linked_releases=False, streaming=False):
    """
    Merges release packages and individual releases.

    By default, yields compiled releases. If ``return_versioned_release`` is ``True``, yields versioned releases. If
    ``return_package`` is ``True``, wraps the compiled releases (and versioned releases if ``return_versioned_release``
    is ``True``) in a record package.

    If ``return_package`` is set and ``publisher`` isn't set, the output record package will have the same publisher as
    the last input release package.

    :param data: an iterable of release packages and individual releases
    :param str uri: if ``return_package`` is ``True``, the record package's ``uri``
    :param dict publisher: if ``return_package`` is ``True``, the record package's ``publisher``
    :param str published_date: if ``return_package`` is ``True``, the record package's ``publishedDate``
    :param str version: if ``return_package`` is ``True``, the record package's ``version``
    :param dict schema: the URL, path or dict of the patched release schema to use
    :param bool return_package: wrap the compiled releases in a record package
    :param bool use_linked_releases: if ``return_package`` is ``True``, use linked releases instead of full releases,
        if the input is a release package
    :param bool return_versioned_release: if ``return_package`` is ``True``, include versioned releases in the record
        package; otherwise, yield versioned releases instead of compiled releases
    :param bool streaming: if ``return_package`` is ``True``, set the package's records to a generator (this only works
        if the calling code exhausts the generator before ``merge`` returns)
    :raises InconsistentVersionError: if the versions are inconsistent across packages to merge
    :raises MissingOcidKeyError: if the release is missing an ``ocid`` field
    """
    with Packager() as packager:
        packager.add(data)

        # If no schema was given, derive one from the input packages' version.
        if not schema and packager.version:
            # e.g. version '1.1' -> tag prefix '1__1__'.
            prefix = packager.version.replace('.', '__') + '__'
            # Pick the last matching tag (assumes get_tags() is ordered
            # oldest-first, so this is the newest patch version — confirm).
            tag = next(tag for tag in reversed(get_tags()) if tag.startswith(prefix))
            schema = get_release_schema_url(tag)
            if packager.package['extensions']:
                # Patch the release schema with the packages' extensions, so
                # that extension-specific merge strategies are respected.
                builder = ProfileBuilder(tag, list(packager.package['extensions']))
                schema = builder.patched_release_schema()

        merger = Merger(schema)

        if return_package:
            # Overwrite the output record package's metadata; the publisher
            # is only replaced when explicitly provided.
            packager.package['uri'] = uri
            packager.package['publishedDate'] = published_date
            packager.package['version'] = version
            if publisher:
                packager.package['publisher'] = publisher

            yield from packager.output_package(merger, return_versioned_release=return_versioned_release,
                                               use_linked_releases=use_linked_releases, streaming=streaming)
        else:
            yield from packager.output_releases(merger, return_versioned_release=return_versioned_release)
def get_extended_mapping_sheet(extensions, version):
    """Build a mapping sheet for the release schema patched with the extensions."""
    patched = ProfileBuilder(version, extensions).patched_release_schema()
    dereferenced = jsonref.JsonRef.replace_refs(patched)
    return _get_mapping_sheet(dereferenced)
"""Dump the European Union profile's patched release schema, annotated with
extension names, to stdout as JSON."""
import json
import sys

import requests

from ocdsextensionregistry import ProfileBuilder

url = 'https://raw.githubusercontent.com/open-contracting-extensions/european-union/latest/docs/extension_versions.json'  # noqa: E501
extension_versions = requests.get(url).json()
profile = ProfileBuilder('1__1__5', extension_versions)
schema = profile.patched_release_schema(extension_field='extension')
json.dump(schema, sys.stdout, ensure_ascii=False, indent=2)
def update(ppp_base_url):
    """
    Aligns OC4IDS with OCDS.

    It uses OCDS for PPPs as a basis, as it includes most definitions and codelists needed in OC4IDS. It copies
    definitions and codelists across, making modifications as required.

    Run this command for every release of OCDS for PPPs, review any changes to schemas or codelists, and update the
    command as needed.

    Some OC4IDS-specific definitions have fields with the same names as in OCDS-specific definitions, notably:

    - procurementMethod
    - procurementMethodDetails
    - tenderers

    The descriptions of most other such fields have diverged. As such, the command makes no effort to copy the
    descriptions of such fields, and instead leaves this up to the editor.
    """
    # Copy one definition from the PPP schema into the OC4IDS schema, optionally
    # applying `replacements`: a dict mapping a key path (tuple) to a callable
    # that transforms the existing value at that path.
    def copy_def(definition, replacements=None):
        value = deepcopy(ppp_schema['definitions'][definition])
        schema['definitions'][definition] = value
        if replacements:
            for keys, replacement in replacements.items():
                leaf = keys[-1]
                for key in keys[:-1]:
                    value = value[key]
                value[leaf] = replacement(value[leaf])

    ocds_base_url = 'https://standard.open-contracting.org/1.1/en/'

    # Fetch the OCDS for PPPs release schema and patch it with the budget
    # extension, preserving key order.
    builder = ProfileBuilder('1__1__5', {'budget': 'master'})
    ppp_schema = get(f'{ppp_base_url}release-schema.json').json(object_pairs_hook=OrderedDict)
    ppp_schema = builder.patched_release_schema(schema=ppp_schema)

    schema_dir = basedir / 'schema' / 'project-level'
    codelists_dir = schema_dir / 'codelists'

    with (schema_dir / 'project-schema.json').open() as f:
        schema = json.load(f, object_pairs_hook=OrderedDict)

    # Codelists maintained in OC4IDS itself (not copied from OCDS).
    infra_codelists = {
        'contractingProcessStatus.csv',
        'contractNature.csv',
        'metricID.csv',
        'modificationType.csv',
        'projectSector.csv',
        'projectStatus.csv',
        'projectType.csv',
        'relatedProjectScheme.csv',
        'relatedProject.csv',
    }
    # Codelists copied (with edits) from OCDS / OCDS for PPPs.
    ocds_codelists = {
        'currency.csv',
        'documentType.csv',
        'geometryType.csv',
        'locationGazetteers.csv',
        'method.csv',
        'partyRole.csv',
        'releaseTag.csv',
        'unitClassificationScheme.csv',
    }
    compare([path.name for path in codelists_dir.iterdir()], infra_codelists, ocds_codelists,
            'schema/project-level/codelists', 'codelists')

    infra_definitions = {
        'ContractingProcess',
        'ContractingProcessSummary',  # Similar to individual release in OCDS
        'LinkedRelease',  # Similar to linked release in OCDS
        'Modification',
        'RelatedProject',  # Similar to relatedProcess in OCDS
        'Person',
    }
    ocds_definitions = {
        'Period',
        'Classification',
        'Location',
        'Value',
        'Organization',
        'OrganizationReference',
        'Address',
        'ContactPoint',
        'BudgetBreakdown',
        'Document',
        'Identifier',
        'Metric',
        'Observation',
        'Transaction',
    }
    compare(schema['definitions'], infra_definitions, ocds_definitions,
            'schema/project-level/project-schema.json#/definitions', 'definitions')

    # Codes to skip when copying codelists.
    # Originally from https://docs.google.com/spreadsheets/d/1ttXgMmmLvqBlPRi_4jAJhIobjnCiwMv13YwGfFOnoJk/edit#gid=0
    ignore = {
        # https://github.com/open-contracting/infrastructure/issues/269
        'finalAudit',
        # https://github.com/open-contracting/standard/issues/870
        'contractSchedule',
        # PPP-specific code or description
        'needsAssessment',
        'projectAdditionality',
        'financeAdditionality',
        'pppModeRationale',
        'riskComparison',
        'discountRate',
        'equityTransferCaps',
        'financeArrangements',
        'guaranteeReports',
        'grants',
        'servicePayments',
        'landTransfer',
        'assetTransfer',
        'revenueShare',
        'otherGovernmentSupport',
        'tariffMethod',
        'tariffReview',
        'tariffs',
        'tariffIllustration',
        'handover',
        'financialStatement',
    }

    # Copy the OCDS codelists.
    for basename in ocds_codelists:
        path = schema_dir / 'codelists' / basename
        if basename in ('documentType.csv', 'partyRole.csv'):
            # Read the current OC4IDS file first, to preserve OC4IDS-sourced rows.
            with open(path) as f:
                reader = csv.DictReader(f)
                fieldnames = reader.fieldnames
                oc4ids_rows = []
                oc4ids_codes = []
                for row in reader:
                    if row['Source'] == 'OC4IDS':
                        oc4ids_rows.append(row)
                        oc4ids_codes.append(row['Code'])
        with open(path, 'w') as f:
            if basename == 'documentType.csv':
                io = StringIO()
                writer = csv.DictWriter(io, fieldnames, lineterminator='\n', extrasaction='ignore')
                writer.writeheader()
                seen = []

                # Add codes from OCDS for PPPs.
                reader = csv_reader(f'{ppp_base_url}codelists/{basename}')
                for row in reader:
                    if row['Code'] not in ignore:
                        seen.append(row['Code'])
                        # These codes' descriptions are entirely new.
                        if row['Code'] in ('environmentalImpact', ):
                            row = next(oc4ids_row for oc4ids_row in oc4ids_rows if oc4ids_row['Code'] == row['Code'])
                        else:
                            edit_code(row, oc4ids_codes, 'OCDS for PPPs')
                        writer.writerow(row)

                # Add codes from OCDS.
                reader = csv_reader(f'{ocds_base_url}codelists/documentType.csv')
                for row in reader:
                    if row['Code'] not in seen and row['Code'] not in ignore:
                        seen.append(row['Code'])
                        edit_code(row, oc4ids_codes, 'OCDS')
                        writer.writerow(row)

                # Add pre-existing codes from OC4IDS.
                writer.writerows(row for row in oc4ids_rows if row['Code'] not in seen)
                text = io.getvalue()
            elif basename == 'partyRole.csv':
                io = StringIO()
                writer = csv.DictWriter(io, fieldnames, lineterminator='\n', extrasaction='ignore')
                writer.writeheader()
                seen = []

                # Add codes from OCDS.
                reader = csv_reader(f'{ocds_base_url}codelists/partyRole.csv')
                for row in reader:
                    if row['Code'] not in seen:
                        seen.append(row['Code'])
                        edit_code(row, oc4ids_codes, 'OCDS')
                        writer.writerow(row)

                # Add pre-existing codes from OC4IDS.
                writer.writerows(row for row in oc4ids_rows if row['Code'] not in seen)
                text = io.getvalue()
            else:
                # Other codelists are copied verbatim from OCDS for PPPs.
                text = get(f'{ppp_base_url}codelists/{basename}').text
            f.write(text)

    # The following definitions follow the same order as in project-schema.json.

    copy_def('Period', {
        # Refer to project.
        ('description',):
            lambda s: s.replace('contracting process', 'project or contracting process'),
    })

    copy_def('Classification', {
        # Remove line item classifications from the definition.
        ('properties', 'scheme', 'description'):
            lambda s: s[:s.index(' For line item classifications,')],
    })
    # Remove the `itemClassificationScheme.csv` codelist.
    del (schema['definitions']['Classification']['properties']['scheme']['codelist'])
    del (schema['definitions']['Classification']['properties']['scheme']['openCodelist'])

    copy_def('Location')  # noqa: Original from ocds_location_extension: "The location where activity related to this tender, contract or license will be delivered, or will take place. A location can be described by either a geometry (point location, line or polygon), or a gazetteer entry, or both."
    schema['definitions']['Location']['description'] = "The location where activity related to this project will be delivered, or will take place. A location may be described using a geometry (point location, line or polygon), a gazetteer entry, an address, or a combination of these."  # noqa: E501
    # Add id to Location.
    schema['definitions']['Location']['properties']['id'] = {
        'title': 'Identifier',
        'description': 'A local identifier for this location, unique within the array this location appears in.',
        'type': 'string',
        'minLength': 1,
    }
    # Add address to Location.
    schema['definitions']['Location']['properties']['address'] = {
        'title': 'Address',
        'description': 'A physical address where works will take place.',
        '$ref': '#/definitions/Address',
    }
    schema['definitions']['Location']['properties'].move_to_end('id', last=False)
    schema['definitions']['Location']['required'] = ['id']
    # Set stricter validation on gazetteer identifiers
    schema['definitions']['Location']['properties']['gazetteer']['properties']['identifiers']['uniqueItems'] = True

    copy_def('Value')

    copy_def('Organization', {
        # Refer to project instead of contracting process, link to infrastructure codelist instead of PPP codelist.
        ('properties', 'roles', 'description'):
            lambda s: s.replace('contracting process', 'project').replace('profiles/ppp/latest/en/', 'infrastructure/{{version}}/{{lang}}/')  # noqa: E501
    })
    # Remove unneeded extensions and details from Organization.
    del (schema['definitions']['Organization']['properties']['shareholders'])
    del (schema['definitions']['Organization']['properties']['beneficialOwnership'])
    del (schema['definitions']['Organization']['properties']['details'])
    # Set stricter validation on party roles
    schema['definitions']['Organization']['properties']['roles']['uniqueItems'] = True
    # Add `people` property to OrganizationReference
    schema['definitions']['Organization']['properties']['people'] = {
        "title": "People",
        "description": "People associated with, representing, or working on behalf of this organization in respect of this project.",  # noqa: E501
        "type": "array",
        "items": {
            "$ref": "#/definitions/Person"
        },
        "uniqueItems": True
    }

    copy_def('OrganizationReference')
    copy_def('Address')

    copy_def('ContactPoint', {
        # Refer to project instead of contracting process.
        ('properties', 'name', 'description'):
            lambda s: s.replace('contracting process', 'project'),
    })

    copy_def('BudgetBreakdown')

    copy_def('Document', {
        # Link to infrastructure codelist instead of PPP codelist
        ('properties', 'documentType', 'description'):
            lambda s: s.replace('profiles/ppp/latest/en/', 'infrastructure/{{version}}/{{lang}}/'),  # noqa: E501
    })
    # noqa: Original from standard: "A short description of the document. We recommend descriptions do not exceed 250 words. In the event the document is not accessible online, the description field can be used to describe arrangements for obtaining a copy of the document.",
    schema['definitions']['Document']['properties']['description']['description'] = "Where a link to a full document is provided, the description should provide a 1 - 3 paragraph summary of the information the document contains, and the `pageStart` field should be used to make sure readers can find the correct section of the document containing more information. Where there is no linked document available, the description field may contain all the information required by the current `documentType`. \n\nLine breaks in text (represented in JSON using `\\n\\n`) must be respected by systems displaying this information, and systems may also support basic HTML tags (H1-H6, B, I, U, strong, A and optionally IMG) or [markdown syntax](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet) for formatting. "  # noqa: E501
    # noqa: Original from standard: " direct link to the document or attachment. The server providing access to this document should be configured to correctly report the document mime type."
    schema['definitions']['Document']['properties']['url']['description'] = "This should be a direct link to the document or web page where the information described by the current documentType exists."  # noqa: E501

    copy_def('Identifier')

    # NOTE(review): the trailing comma after this call makes the statement a
    # 1-tuple expression; harmless, but likely unintended — confirm.
    copy_def('Metric', {
        ('properties', 'id', 'description'):
            lambda s: s.replace('contracting process', 'contracting process or project')
    }),  # noqa: E501
    schema['definitions']['Metric']['description'] = "Metrics are used to set out forecast and actual metrics targets for a project: for example, planned and actual physical and financial progress over time."  # noqa: E501
    # noqa: Original from standard: "Metrics are used to set out targets and results from a contracting process. During the planning and tender sections, a metric indicates the anticipated results. In award and contract sections it indicates the awarded/contracted results. In the implementation section it is used to provide updates on actually delivered results, also known as outputs."

    copy_def('Observation')
    # Remove the `relatedImplementationMilestone` property
    del (schema['definitions']['Observation']['properties']['relatedImplementationMilestone'])

    copy_def('Transaction')

    remove_null_and_pattern_properties(schema)
    remove_integer_identifier_types(schema)
    remove_deprecated_properties(schema)
    add_validation_properties(schema)

    with (schema_dir / 'project-schema.json').open('w') as f:
        json.dump(schema, f, ensure_ascii=False, indent=2)
        f.write('\n')
def cli(
    filename,
    schema,
    selection,
    split,
    threshold,
    state_file,
    xlsx,
    csv,
    combine,
    unnest,
    unnest_file,
    only,
    only_file,
    repeat,
    repeat_file,
    count,
    human,
    language,
):
    """Spoonbill cli entry point: analyze an OCDS package file and flatten it
    to CSV and/or XLSX tables."""
    click.echo(_("Detecting input file format"))
    # TODO: handle line separated json
    # TODO: handle single release/record
    (
        input_format,
        _is_concatenated,
        _is_array,
    ) = detect_format(filename)
    if csv:
        csv = pathlib.Path(csv).resolve()
        if not csv.exists():
            raise click.BadParameter(
                _("Desired location {} does not exists").format(csv))
    if xlsx:
        xlsx = pathlib.Path(xlsx).resolve()
        if not xlsx.parent.exists():
            raise click.BadParameter(
                _("Desired location {} does not exists").format(xlsx.parent))
    click.echo(
        _("Input file is {}").format(click.style(input_format, fg="green")))
    is_package = "package" in input_format
    combine_choice = combine if combine else ""
    if not is_package:
        # TODO: fix this
        click.echo("Single releases are not supported by now")
        return
    if schema:
        schema = resolve_file_uri(schema)
    if "release" in input_format:
        root_key = "releases"
        if not schema:
            click.echo(
                _("No schema provided, using version {}").format(
                    click.style(CURRENT_SCHEMA_TAG, fg="cyan")))
            profile = ProfileBuilder(CURRENT_SCHEMA_TAG, {})
            schema = profile.release_package_schema()
    else:
        root_key = "records"
        if not schema:
            click.echo(
                _("No schema provided, using version {}").format(
                    click.style(CURRENT_SCHEMA_TAG, fg="cyan")))
            profile = ProfileBuilder(CURRENT_SCHEMA_TAG, {})
            schema = profile.record_package_schema()
    title = schema.get("title", "").lower()
    if not title:
        raise ValueError(
            _("Incomplete schema, please make sure your data is correct"))
    if "package" in title:
        # TODO: is this a good way to get the release/record schema?
        schema = schema["properties"][root_key]["items"]

    path = pathlib.Path(filename)
    workdir = path.parent
    filename = path.name
    selection = selection or ROOT_TABLES.keys()
    combine = combine or COMBINED_TABLES.keys()
    root_tables = get_selected_tables(ROOT_TABLES, selection)
    combined_tables = get_selected_tables(COMBINED_TABLES, combine)

    if state_file:
        click.secho(_("Restoring from provided state file"), bold=True)
        analyzer = FileAnalyzer(workdir, state_file=state_file)
    else:
        click.secho(
            _("State file not supplied, going to analyze input file first"),
            bold=True)
        analyzer = FileAnalyzer(
            workdir,
            schema=schema,
            root_key=root_key,
            root_tables=root_tables,
            combined_tables=combined_tables,
            language=language,
            table_threshold=threshold,
        )
        click.echo(_("Analyze options:"))
        click.echo(
            _(" - table threshold           => {}").format(
                click.style(str(threshold), fg="cyan")))
        click.echo(
            _(" - language                  => {}").format(
                click.style(language, fg="cyan")))
        click.echo(
            _("Processing file: {}").format(click.style(str(path), fg="cyan")))
        total = path.stat().st_size
        progress = 0
        # Progress bar not showing with small files
        # https://github.com/pallets/click/pull/1296/files
        with click.progressbar(width=0,
                               show_percent=True,
                               show_pos=True,
                               length=total) as bar:
            for read, number in analyzer.analyze_file(filename,
                                                      with_preview=True):
                bar.label = ANALYZED_LABEL.format(
                    click.style(str(number), fg="cyan"))
                bar.update(read - progress)
                progress = read
        click.secho(_("Done processing. Analyzed objects: {}").format(
            click.style(str(number + 1), fg="red")),
                    fg="green")
        # NOTE(review): restored from a mangled literal; the state file is
        # named after the input file — confirm against upstream.
        state_file = pathlib.Path(f"{filename}.state")
        state_file_path = workdir / state_file
        click.echo(
            _("Dumping analyzed data to '{}'").format(
                click.style(str(state_file_path.absolute()), fg="cyan")))
        analyzer.dump_to_file(state_file)

    click.echo(
        _("Flattening file: {}").format(click.style(str(path), fg="cyan")))
    if unnest and unnest_file:
        raise click.UsageError(
            _("Conflicting options: unnest and unnest-file"))
    if repeat and repeat_file:
        raise click.UsageError(
            _("Conflicting options: repeat and repeat-file"))
    if only and only_file:
        raise click.UsageError(_("Conflicting options: only and only-file"))

    options = {"selection": {}, "count": count}
    unnest = read_option_file(unnest, unnest_file)
    repeat = read_option_file(repeat, repeat_file)
    only = read_option_file(only, only_file)

    for name in selection:
        table = analyzer.spec[name]
        if table.total_rows == 0:
            click.echo(
                _("Ignoring empty table {}").format(click.style(name, fg="red")))
            continue

        # BUG FIX: filter into per-table locals. Previously the shared
        # `unnest`/`only`/`repeat` lists were re-filtered destructively on
        # each iteration, so columns present only in later tables were
        # silently dropped after the first table was processed.
        table_unnest = [col for col in unnest if col in table.combined_columns]
        if table_unnest:
            click.echo(
                _("Unnesting columns {} for table {}").format(
                    click.style(",".join(table_unnest), fg="cyan"),
                    click.style(name, fg="cyan")))

        table_only = [col for col in only if col in table]
        if table_only:
            click.echo(
                _("Using only columns {} for table {}").format(
                    click.style(",".join(table_only), fg="cyan"),
                    click.style(name, fg="cyan")))

        table_repeat = [col for col in repeat if col in table]
        if table_repeat:
            click.echo(
                _("Repeating columns {} in all child table of {}").format(
                    click.style(",".join(table_repeat), fg="cyan"),
                    click.style(name, fg="cyan")))

        options["selection"][name] = {
            "split": split or analyzer.spec[name].should_split,
            "pretty_headers": human,
            "unnest": table_unnest,
            "only": table_only,
            "repeat": table_repeat,
        }
    options = FlattenOptions(**options)
    flattener = FileFlattener(
        workdir,
        options,
        analyzer.spec.tables,
        root_key=root_key,
        csv=csv,
        xlsx=xlsx,
        language=language,
    )

    all_tables = chain([table for table in flattener.flattener.tables.keys()],
                       combine_choice)
    click.echo(
        _("Going to export tables: {}").format(
            click.style(",".join(all_tables), fg="magenta")))

    click.echo(_("Processed tables:"))
    for table in flattener.flattener.tables.keys():
        message = _("{}: {} rows").format(
            table, flattener.flattener.tables[table].total_rows)
        if not flattener.flattener.tables[table].is_root:
            message = "└-----" + message
            click.echo(message)
        else:
            click.echo(message)
    click.echo(_("Flattening input file"))
    with click.progressbar(
            flattener.flatten_file(filename),
            length=analyzer.spec.total_items + 1,
            width=0,
            show_percent=True,
            show_pos=True,
    ) as bar:
        for count in bar:
            bar.label = FLATTENED_LABEL.format(
                click.style(str(count + 1), fg="cyan"))

    click.secho(_("Done flattening. Flattened objects: {}").format(
        click.style(str(count + 1), fg="red")),
                fg="green")