def get_merge_rules(schema=None):
    """
    Returns merge rules as key-value pairs, in which the key is a JSON path as a tuple,
    and the value is a list of merge properties whose values are `true`.
    """
    # Default to the release schema of the most recent tag.
    schema = schema or get_release_schema_url(get_tags()[-1])
    if isinstance(schema, dict):
        dereferenced_schema = jsonref.JsonRef.replace_refs(schema)
        return _get_merge_rules_from_dereferenced_schema(dereferenced_schema)
    # Otherwise, the schema is a URL or a file path.
    return _get_merge_rules_from_url_or_path(schema)
def process_item(self, item, spider):
    """
    Converts the item's CSV or XLSX data to JSON with flattentool, and renames
    ``file_name`` to use a ``.json`` extension.

    :raises NotImplementedError: if the file is neither CSV nor XLSX, or if no
        schema tag matches the spider's OCDS version
    """
    if not spider.unflatten or not isinstance(item, (File, FileItem)):
        return item

    input_name = item["file_name"]
    if input_name.endswith(".csv"):
        input_format = "csv"
        item["file_name"] = item["file_name"][:-4] + ".json"
    elif input_name.endswith(".xlsx"):
        input_format = "xlsx"
        item["file_name"] = item["file_name"][:-5] + ".json"
    else:
        raise NotImplementedError(
            f"the file '{input_name}' has no extension or is not CSV or XLSX, "
            f"obtained from: {item['url']}"
        )

    # Find the most recent tag for the spider's OCDS version.
    prefix = spider.ocds_version.replace(".", "__")
    matching_tag = next((tag for tag in reversed(get_tags()) if tag.startswith(prefix)), None)
    if matching_tag is None:
        raise NotImplementedError(f"no schema found for '{prefix}'")
    schema = get_release_schema_url(matching_tag)

    with tempfile.TemporaryDirectory() as directory:
        input_path = os.path.join(directory, input_name)
        output_name = os.path.join(directory, item["file_name"])
        # flattentool reads a directory of CSV files, but a single XLSX file.
        source = directory if input_format == "csv" else input_path

        with open(input_path, "wb") as f:
            f.write(item["data"])

        with warnings.catch_warnings():
            # flattentool uses UserWarning, so we can't set a specific category
            warnings.filterwarnings("ignore")
            unflatten(
                source,
                root_list_path="releases",
                root_id="ocid",
                schema=schema,
                input_format=input_format,
                output_name=output_name,
                **spider.unflatten_args,
            )

        with open(output_name) as f:
            item["data"] = f.read()

    return item
def test_get_tags():
    # The earliest tags are frozen history, so the head of the list is stable.
    expected = [
        '0__3__2',
        '0__3__3',
        '1__0__0',
        '1__0__1',
        '1__0__2',
        '1__0__3',
        '1__1__0',
        '1__1__1',
        '1__1__2',
        '1__1__3',
        '1__1__4',
    ]
    assert get_tags()[:len(expected)] == expected
def process_item(self, item, spider):
    """
    Converts the item's CSV or XLSX data to JSON with flattentool, and renames
    ``file_name`` to use a ``.json`` extension.

    :raises NotImplementedError: if the file is neither CSV nor XLSX, or if no
        schema tag matches the spider's OCDS version
    """
    # Local import keeps the fix self-contained; move to the module's import
    # block if preferred.
    import warnings

    if not spider.unflatten or not isinstance(item, (File, FileItem)):
        return item

    input_name = item['file_name']
    if input_name.endswith('.csv'):
        item['file_name'] = item['file_name'][:-4] + '.json'
        input_format = 'csv'
    elif input_name.endswith('.xlsx'):
        item['file_name'] = item['file_name'][:-5] + '.json'
        input_format = 'xlsx'
    else:
        raise NotImplementedError(f"the file '{input_name}' has no extension or is not CSV or XLSX, "
                                  f"obtained from: {item['url']}")

    # Find the most recent tag for the spider's OCDS version.
    spider_ocds_version = spider.ocds_version.replace('.', '__')
    for tag in reversed(get_tags()):
        if tag.startswith(spider_ocds_version):
            schema = get_release_schema_url(tag)
            break
    else:
        raise NotImplementedError(f"no schema found for '{spider_ocds_version}'")

    with tempfile.TemporaryDirectory() as directory:
        input_path = os.path.join(directory, input_name)
        output_name = os.path.join(directory, item['file_name'])
        # flattentool reads a directory of CSV files, but a single XLSX file.
        if input_format == 'csv':
            input_name = directory
        elif input_format == 'xlsx':
            input_name = input_path

        with open(input_path, 'wb') as f:
            f.write(item['data'])

        # Suppress flattentool's warnings so they don't pollute the crawl log.
        # flattentool uses UserWarning, so we can't set a specific category.
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore')
            unflatten(
                input_name,
                root_list_path='releases',
                root_id='ocid',
                schema=schema,
                input_format=input_format,
                output_name=output_name,
                **spider.unflatten_args
            )

        with open(output_name, 'r') as f:
            item['data'] = f.read()

    return item
def test_output_package_no_streaming():
    filenames = ('realdata/release-package-1.json', 'realdata/release-package-2.json')
    packages = [json.loads(read(filename)) for filename in filenames]

    with Packager() as packager:
        packager.package['version'] = '1.1'
        packager.add(packages)

        # Find the most recent tag for the packages' version.
        prefix = packager.version.replace('.', '__') + '__'
        tag = next(t for t in reversed(get_tags()) if t.startswith(prefix))
        merger = Merger(get_release_schema_url(tag))

        actual = next(packager.output_package(merger))

        assert actual == json.loads(read('realdata/record-package_package.json'))
def merge(data, uri='', publisher=None, published_date='', version=DEFAULT_VERSION, schema=None,
          return_versioned_release=False, return_package=False, use_linked_releases=False, streaming=False):
    """
    Merges release packages and individual releases.

    By default, yields compiled releases. If ``return_versioned_release`` is ``True``, yields
    versioned releases. If ``return_package`` is ``True``, wraps the compiled releases (and
    versioned releases if ``return_versioned_release`` is ``True``) in a record package; in that
    case, if ``publisher`` isn't set, the output record package has the same publisher as the last
    input release package.

    :param data: an iterable of release packages and individual releases
    :param str uri: if ``return_package`` is ``True``, the record package's ``uri``
    :param dict publisher: if ``return_package`` is ``True``, the record package's ``publisher``
    :param str published_date: if ``return_package`` is ``True``, the record package's ``publishedDate``
    :param str version: if ``return_package`` is ``True``, the record package's ``version``
    :param dict schema: the URL, path or dict of the patched release schema to use
    :param bool return_package: wrap the compiled releases in a record package
    :param bool use_linked_releases: if ``return_package`` is ``True``, use linked releases instead
        of full releases, if the input is a release package
    :param bool return_versioned_release: if ``return_package`` is ``True``, include versioned
        releases in the record package; otherwise, yield versioned releases instead of compiled
        releases
    :param bool streaming: if ``return_package`` is ``True``, set the package's records to a
        generator (this only works if the calling code exhausts the generator before ``merge``
        returns)
    :raises InconsistentVersionError: if the versions are inconsistent across packages to merge
    :raises MissingOcidKeyError: if the release is missing an ``ocid`` field
    """
    with Packager() as packager:
        packager.add(data)

        if not schema and packager.version:
            # Derive the schema from the packages' version, patching in any extensions.
            prefix = packager.version.replace('.', '__') + '__'
            tag = next(t for t in reversed(get_tags()) if t.startswith(prefix))
            schema = get_release_schema_url(tag)
            if packager.package['extensions']:
                builder = ProfileBuilder(tag, list(packager.package['extensions']))
                schema = builder.patched_release_schema()

        merger = Merger(schema)

        if not return_package:
            yield from packager.output_releases(merger, return_versioned_release=return_versioned_release)
            return

        packager.package['uri'] = uri
        packager.package['publishedDate'] = published_date
        packager.package['version'] = version
        if publisher:
            packager.package['publisher'] = publisher

        yield from packager.output_package(merger, return_versioned_release=return_versioned_release,
                                           use_linked_releases=use_linked_releases, streaming=streaming)
def _get_tags():
    """
    Returns the git tags, most recent first, caching the list for an hour.
    """
    # Pass a callable so the expensive get_tags() call runs only on a cache
    # miss. A plain value would be evaluated on every call, even on a hit,
    # defeating the purpose of the cache.
    return cache.get_or_set('git_tags', lambda: sorted(get_tags(), reverse=True), 3600)