def package_from_resources(resource_path, output_path, clean=True):
    """ Collects resource descriptors and merges them in a datapackage.json

    Parameters
    ----------
    resource_path: string
        Path to directory with resources (in .json format)
    output_path: string
        Root path of datapackage where the newly created datapackage.json is
        stored
    clean: boolean
        If True, the resource descriptor files and their directory are deleted
        after they have been merged into the package
    """
    p = Package()
    p.descriptor["profile"] = "tabular-data-package"
    p.commit()

    for f in os.listdir(resource_path):
        path = os.path.join(resource_path, f)
        r = Resource(path)
        p.add_resource(r.descriptor)
        p.commit()
        if clean:
            os.remove(path)

    if clean:
        os.rmdir(resource_path)

    p.save(os.path.join(output_path, "datapackage.json"))
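# Usage sketch for the function above (illustrative, not from the original
# source): assuming the per-resource *.json descriptors were previously written
# to a "resources" directory, this call merges them into ./datapackage.json and
# removes the intermediate files.
package_from_resources(resource_path="resources", output_path=".", clean=True)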
def save_datasets_as_data_packages(self, folder_path):
    """ save each dataset from a data.json source as _datapackage_ """
    for dataset in self.datasets:
        package = Package()

        # TODO check this, I'm learning datapackages
        resource = Resource({'data': dataset})
        resource.infer()  # adds "name": "inline"

        # FIXME the identifier may contain characters that are not valid in
        #   paths (e.g. "/"), and different resources could end up with
        #   duplicate paths. Use base64 or hashes instead.
        idf = slugify(dataset['identifier'])

        resource_path = os.path.join(folder_path, f'resource_data_json_{idf}.json')
        if not resource.valid:
            raise Exception('Invalid resource')
        resource.save(resource_path)

        package.add_resource(descriptor=resource.descriptor)
        package_path = os.path.join(folder_path, f'pkg_data_json_{idf}.zip')
        package.save(target=package_path)
def _process(self):
    for resource, descriptor in zip(self.resources, self.datapackage['resources']):
        if descriptor['name'] == 'messages':
            messages_datasets, output_resources = self.process_messages(resource)
        else:
            raise Exception('unexpected resource: {}'.format(descriptor['name']))

    dp = Package({'name': '_', 'resources': output_resources})
    dp.add_resource({
        'name': 'messages-datasets',
        'path': 'messages-datasets.csv',
        'dpp:streaming': True,
        'schema': {
            'fields': [
                {'name': 'message_id', 'type': 'string'},
                {'name': 'dataset_name', 'type': 'string'},
            ]
        }
    })
    return DataStream(dp, ((row for row in r) for r in [messages_datasets]), {})
def save_datasets_as_data_packages(self, folder_path):
    """ save each dataset from a CKAN API source as _datapackage_ """
    for dataset in self.package_list:
        package = Package()

        # TODO check this, I'm learning datapackages
        resource = Resource({'data': dataset})
        resource.infer()  # adds "name": "inline"

        # encode the identifier so it can be used in file names
        identifier = dataset['id']
        bytes_identifier = identifier.encode('utf-8')
        encoded = base64.b64encode(bytes_identifier)
        encoded_identifier = str(encoded, "utf-8")

        resource_path = os.path.join(
            folder_path, f'resource_ckan_api_{encoded_identifier}.json')
        if not resource.valid:
            raise Exception('Invalid resource')
        resource.save(resource_path)

        package.add_resource(descriptor=resource.descriptor)
        package_path = os.path.join(
            folder_path, f'pkg_ckan_api_{encoded_identifier}.zip')
        package.save(target=package_path)
def save_as_data_packages(row):
    """ save dataset from data.json as a data package
    We will use these files as a queue to process later """
    # TODO check if ckanext-datapackager is useful for import
    # or export resources:
    # https://github.com/frictionlessdata/ckanext-datapackager

    package = Package()

    # TODO check this, I'm learning datapackages.
    resource = Resource({'data': row})
    resource.infer()  # adds "name": "inline"
    if not resource.valid:
        raise Exception('Invalid resource')

    encoded_identifier = encode_identifier(identifier=row['identifier'])

    # resource_path = os.path.join(path, f'{prefix}_{encoded_identifier}.json')
    # resource.save(resource_path)

    package.add_resource(descriptor=resource.descriptor)
    folder = config.get_data_packages_folder_path()
    filename = f'data-json-{encoded_identifier}.json'
    package_path = os.path.join(folder, filename)

    # do not rewrite if it already exists
    if not os.path.isfile(package_path):
        package.save(target=package_path)
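# Usage sketch (illustrative, not part of the original module): `row` is a single
# dataset dict taken from a data.json catalog. The identifier below is made up;
# `encode_identifier` and `config.get_data_packages_folder_path` are helpers
# assumed to exist alongside the function above.
sample_row = {
    'identifier': 'https://example.gov/dataset/sample',  # hypothetical identifier
    'title': 'Sample dataset',
}
save_as_data_packages(sample_row)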
def _make_package(source, publisher, config):
    os.chdir(source)
    files = [f for f in os.listdir('data') if f.endswith('.csv')]

    package = Package({'publisher': publisher})
    for f in files:
        path = f"data/{f}"
        name = f.replace('.csv', '')
        schema = f"https://raw.githubusercontent.com/digital-land/alpha-data/master/schema/{name}-schema.json"
        resource = Resource({'path': path, 'schema': schema})
        package.add_resource(resource.descriptor)

    package.commit()
    package.infer()

    # validate each resource against its schema before packaging
    errors = False
    for r in package.resources:
        try:
            r.read(keyed=True)
            r.check_relations()
        except (CastError, RelationError) as e:
            print('Error in', os.path.join(source, r.descriptor['path']))
            print(e, e.errors)
            errors = True

    if not errors:
        package.save('datapackage.zip')
        print('saved datapackage.zip to', source)

    s3 = boto3.client(
        's3',
        aws_access_key_id=config['AWS_ACCESS_KEY_ID'],
        aws_secret_access_key=config['AWS_SECRET_ACCESS_KEY'])
    bucket = 'developer-contributions-datapackages'
    key = f'{publisher}/{uuid.uuid4()}/datapackage.zip'
    s3.upload_file(f'{source}/datapackage.zip', bucket, key,
                   ExtraArgs={'ACL': 'public-read'})

    # generate an unsigned, non-expiring URL for the uploaded datapackage
    config = s3._client_config
    config.signature_version = botocore.UNSIGNED
    datapackage_url = boto3.resource(
        's3', config=config).meta.client.generate_presigned_url(
            'get_object',
            ExpiresIn=0,
            Params={'Bucket': bucket, 'Key': key})
    return datapackage_url
def convert_hdx_dataset(self, dataset_id, path):
    dataset = Dataset.read_from_hdx(dataset_id)
    package = Package({
        'id': dataset['id'],
        'name': dataset['name'],
        'title': dataset['title'],
        'description': dataset['notes'],
    })
    for hdx_resource in dataset.get_resources():
        name = hdx_resource['name'].lower().replace(' ', '_')
        package.add_resource({
            'name': name,
            'path': hdx_resource['url'],
            'format': hdx_resource['format'].lower(),
            'title': hdx_resource['description'],
        })
    try:
        package.infer()
    except tabulator.exceptions.FormatError:
        pass
    for frictionless_resource in package.descriptor['resources']:
        self.convert_hxl_url(frictionless_resource)
    package.commit()
    package.save(path)
def create_datapackage(submission_title, submission_files, metadata):
    minio_url = app.config.get('MINIO_ENDPOINT')
    package = Package({'name': submission_title})
    for submission in submission_files:
        package.add_resource({
            'name': clean_resource_name(submission['file_name']),
            'path': f'http://{minio_url}/minio/{BUCKET}/{submission["object_name"]}',
        })
    # TODO decide if we validate here?
    '''
    try:
        validate(package.descriptor)
    except Exception as e:
        raise e
    '''
    return package.descriptor
def update_package_descriptor():
    """Merge the resource descriptors from the "resources" directory into
    datapackage.json and remove them afterwards."""
    p = Package("datapackage.json")

    for f in os.listdir("resources"):
        path = os.path.join("resources", f)
        r = Resource(path)
        p.add_resource(r.descriptor)
        p.commit()
        os.remove(path)

    os.rmdir("resources")
    p.save("datapackage.json")
def save_datasets_as_data_packages(self, folder_path, identifier_field):
    """ save each dataset from a data.json source as _datapackage_ """
    for dataset in self.datasets:
        package = Package()

        # TODO check this, I'm learning datapackages
        resource = Resource({'data': dataset})
        resource.infer()  # adds "name": "inline"

        idf = slugify(dataset[identifier_field])

        resource_path = os.path.join(folder_path, f'resource_data_json_{idf}.json')
        if not resource.valid:
            raise Exception('Invalid resource')
        resource.save(resource_path)

        package.add_resource(descriptor=resource.descriptor)
        package_path = os.path.join(folder_path, f'pkg_data_json_{idf}.zip')
        package.save(target=package_path)
def convert_resources(self, resources):
    """Called when advertising resources through this connection *in the FORWARD direction*.

    Takes the initial list of resources advertised by the source item and returns a new list,
    which is the one finally advertised.

    At the moment it only packs CSVs into a datapackage (and again, it's only used in the
    FORWARD direction).

    Args:
        resources (list of ProjectItemResource): Resources to convert

    Returns:
        list of ProjectItemResource
    """
    if not self.use_datapackage:
        return resources
    # Split CSVs from the rest of resources
    final_resources = []
    csv_filepaths = []
    for r in resources:
        if r.hasfilepath and os.path.splitext(r.path)[1].lower() == ".csv":
            csv_filepaths.append(r.path)
            continue
        final_resources.append(r)
    if not csv_filepaths:
        return final_resources
    # Build Package from CSVs and add it to the resources
    base_path = os.path.dirname(os.path.commonpath(csv_filepaths))
    package = Package(base_path=base_path)
    for path in csv_filepaths:
        package.add_resource({"path": os.path.relpath(path, base_path)})
    package_path = os.path.join(base_path, "datapackage.json")
    package.save(package_path)
    package_resource = file_resource(self.source, package_path, label=f"datapackage@{self.source}")
    final_resources.append(package_resource)
    return final_resources
def process_datapackage(self, dp: Package):
    if isinstance(self.load_source, tuple):
        datapackage_descriptor, _ = self.load_source
        dp.descriptor.setdefault('resources', [])
        self.resource_matcher = ResourceMatcher(self.resources, datapackage_descriptor)
        for resource_descriptor in datapackage_descriptor['resources']:
            if self.resource_matcher.match(resource_descriptor['name']):
                dp.add_resource(resource_descriptor)
    else:  # load_source is string:
        if self.load_source.startswith('env://'):
            env_var = self.load_source[6:]
            self.load_source = os.environ.get(env_var)
            if self.load_source is None:
                raise ValueError(
                    f"Couldn't find value for env var '{env_var}'")
        if os.path.basename(self.load_source) == 'datapackage.json':
            self.load_dp = Package(self.load_source)
            self.resource_matcher = ResourceMatcher(self.resources, self.load_dp)
            dp.descriptor.setdefault('resources', [])
            for resource in self.load_dp.resources:
                if self.resource_matcher.match(resource.name):
                    dp.add_resource(resource.descriptor)
        else:
            if os.path.exists(self.load_source):
                base_path = os.path.dirname(self.load_source) or '.'
                self.load_source = os.path.basename(self.load_source)
            else:
                base_path = None
            descriptor = dict(path=self.load_source,
                              profile='tabular-data-resource')
            descriptor['format'] = self.options.get('format')
            if 'encoding' in self.options:
                descriptor['encoding'] = self.options['encoding']
            if descriptor['format'] == 'xml' or self.load_source.endswith('.xml'):
                self.options.setdefault('custom_parsers', {})['xml'] = XMLParser
            self.options.setdefault('ignore_blank_headers', True)
            self.options.setdefault('headers', 1)
            self.res = Resource(descriptor, base_path=base_path, **self.options)
            self.res.infer(confidence=1, limit=1000)
            if self.name is not None:
                self.res.descriptor['name'] = self.name
            if self.force_strings:
                for f in self.res.descriptor['schema']['fields']:
                    f['type'] = 'string'
            self.res.commit()
            self.res.descriptor['path'] = '{name}.{format}'.format(**self.res.descriptor)
            dp.add_resource(self.res.descriptor)
    return dp
def infer_metadata(
    package_name="default-name",
    keep_resources=False,
    foreign_keys={
        "bus": [
            "volatile",
            "dispatchable",
            "storage",
            "load",
            "reservoir",
            "shortage",
            "excess",
        ],
        "profile": ["load", "volatile", "ror"],
        "from_to_bus": ["connection", "line", "conversion"],
        "chp": ["backpressure", "extraction", "chp"],
    },
    path=None,
):
    """ Add basic meta data for a datapackage

    Parameters
    ----------
    package_name: string
        Name of the data package
    keep_resources: boolean
        Flag indicating whether the resource meta data json-files should be
        kept after the main datapackage.json is created. The resource meta
        data will be stored in the `resources` directory.
    foreign_keys: dict
        Dictionary with foreign key specification. Keys for the dictionary
        are: 'bus', 'profile', 'from_to_bus', 'chp'. Values are lists of
        strings with the names of the resources.
    path: string
        Absolute path to the root folder of the datapackage
    """
    current_path = os.getcwd()
    if path:
        print("Setting current work directory to {}".format(path))
        os.chdir(path)

    p = Package()
    p.descriptor["name"] = package_name
    p.descriptor["profile"] = "tabular-data-package"
    p.commit()

    if not os.path.exists("resources"):
        os.makedirs("resources")

    # create meta data resources for elements
    if not os.path.exists("data/elements"):
        print("No data path found in directory {}. Skipping...".format(
            os.getcwd()))
    else:
        for f in os.listdir("data/elements"):
            r = Resource({"path": os.path.join("data/elements", f)})
            r.infer()
            r.descriptor["schema"]["primaryKey"] = "name"

            if r.name in foreign_keys.get("bus", []):
                r.descriptor["schema"]["foreignKeys"] = [{
                    "fields": "bus",
                    "reference": {"resource": "bus", "fields": "name"},
                }]

                if r.name in foreign_keys.get("profile", []):
                    r.descriptor["schema"]["foreignKeys"].append({
                        "fields": "profile",
                        "reference": {"resource": r.name + "_profile"},
                    })

            elif r.name in foreign_keys.get("from_to_bus", []):
                r.descriptor["schema"]["foreignKeys"] = [
                    {
                        "fields": "from_bus",
                        "reference": {"resource": "bus", "fields": "name"},
                    },
                    {
                        "fields": "to_bus",
                        "reference": {"resource": "bus", "fields": "name"},
                    },
                ]

            elif r.name in foreign_keys.get("chp", []):
                r.descriptor["schema"]["foreignKeys"] = [
                    {
                        "fields": "fuel_bus",
                        "reference": {"resource": "bus", "fields": "name"},
                    },
                    {
                        "fields": "electricity_bus",
                        "reference": {"resource": "bus", "fields": "name"},
                    },
                    {
                        "fields": "heat_bus",
                        "reference": {"resource": "bus", "fields": "name"},
                    },
                ]

            r.commit()
            r.save(os.path.join("resources", f.replace(".csv", ".json")))
            p.add_resource(r.descriptor)

    # create meta data resources for sequences
    if not os.path.exists("data/sequences"):
        print("No data path found in directory {}. Skipping...".format(
            os.getcwd()))
    else:
        for f in os.listdir("data/sequences"):
            r = Resource({"path": os.path.join("data/sequences", f)})
            r.infer()
            r.commit()
            r.save(os.path.join("resources", f.replace(".csv", ".json")))
            p.add_resource(r.descriptor)

    p.commit()
    p.save("datapackage.json")

    if not keep_resources:
        shutil.rmtree("resources")

    os.chdir(current_path)
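# Usage sketch (illustrative): for a datapackage rooted at a hypothetical
# "/tmp/my-datapackage" containing data/elements and data/sequences CSV folders,
# the call below infers the resource metadata and writes datapackage.json into
# that root, using the default foreign key mapping.
infer_metadata(
    package_name="my-energy-system",
    keep_resources=False,
    path="/tmp/my-datapackage",
)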
# -*- coding: utf-8 -*-
"""
"""
import os

from datapackage import Package, Resource

p = Package('datapackage.json')
p.descriptor['profile'] = 'tabular-data-package'

for f in os.listdir('resources'):
    path = os.path.join('resources', f)
    r = Resource(path)
    p.add_resource(r.descriptor)
    p.commit()
    os.remove(path)

os.rmdir('resources')
p.save('datapackage.json')
def build(config: Dict) -> Package:
    """Builds a datapackage.Package object from a config dictionary.

    The configuration dictionary should contain the following keys:
    "metadata", "files".

    Information about the corresponding study can be placed in metadata.

    Example:

        {
            'metadata': {
                'name': 'ddionrails-study',
                'id': 'doi'
            }
        }

    The desired files to be included in the Tabular Data Package can be placed in 'files':

    Example:

        {
            'files': [
                'concepts.csv'
            ]
        }

    See: examples/example-config.yml

    The resulting Tabular Data Package is written to disk as 'datapackage.json'
    in the directory the command line tool is run.

    Args:
        config: The configuration of the Datapackage to be created.
    """
    if "metadata" not in config or "files" not in config:
        raise ValueError("Config must contain 'metadata' and 'files'")

    # Read the descriptor base dictionary from disk
    # and update it with values from the config file
    descriptor = read_yaml(DATAPACKAGE_BASE_FILE)
    descriptor["name"] = config["metadata"].get("name")
    descriptor["id"] = config["metadata"].get("id")
    descriptor["title"] = config["metadata"].get("title")

    # Remove empty keys from the dictionary
    descriptor = {key: value for key, value in descriptor.items() if value}

    # Create a Datapackage object from the descriptor dictionary
    package = Package(descriptor=descriptor)
    wanted_files = [file.split(".")[0] for file in config["files"]]
    for file in wanted_files:
        # If a filename ends with "_strict"
        # create the basic Tabular Data Resource first
        # then add the "stricter" rules from the "_strict" file
        if "_strict" in file:
            basic_file = file.replace("_strict", "")
            resource = read_tabular_data_resource(basic_file)
            strict_resource = read_tabular_data_resource(file)
            merge(resource, strict_resource)
        else:
            resource = read_tabular_data_resource(file)
        package.add_resource(resource)
    package.commit()
    if not package.valid:
        for error in package.errors:
            LOGGER.error(error)
    return package
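# Usage sketch (illustrative): a minimal config that follows the docstring above.
# The metadata values and the file name "concepts.csv" are placeholders; the file
# is expected to have a matching Tabular Data Resource readable by
# read_tabular_data_resource.
example_config = {
    "metadata": {"name": "ddionrails-study", "id": "doi"},
    "files": ["concepts.csv"],
}
package = build(example_config)
package.save("datapackage.json")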
def test_package_add_resource():
    package = Package({})
    resource = package.add_resource({'name': 'name', 'data': []})
    assert len(package.resources) == 1
    assert package.resources[0].name == 'name'
    assert resource.name == 'name'
import sys

from datapackage import Package

if len(sys.argv) != 2:
    raise Exception(
        'This script expects only one parameter: the path to a datapackage.json file'
    )

dpkg = sys.argv[1]
print('Reading ' + dpkg)

## Read the created pipeline package
package = Package(dpkg)

for i in range(len(package.resource_names)):
    resource = package.get_resource(package.resource_names[i])
    if resource.remote is False:
        ## Infer the schema fields
        print('Inferring schema for ' + package.resource_names[i])
        inferred_schema_pkg = Package()
        inferred_schema_pkg.infer(resource.source)
        inferred_resource = inferred_schema_pkg.get_resource(
            resource.descriptor['name'])
        resource.descriptor['schema'] = inferred_resource.descriptor['schema']
        package.remove_resource(resource.descriptor['name'])
        package.add_resource(resource.descriptor)

package.commit()

if not package.valid:
    raise Exception('Package is invalid')

package.save(dpkg)
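# Usage sketch (illustrative; the script file name is hypothetical):
#
#     python infer_schemas.py path/to/datapackage.json
#
# The script rewrites the given datapackage.json in place with inferred schemas
# for every non-remote resource.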