Example #1
def package_from_resources(resource_path, output_path, clean=True):
    """ Collects resource descriptors and merges them in a datapackage.json

    Parameters
    ----------
    resource_path: string
        Path to directory with resources (in .json format)
    output_path: string
        Root path of datapackage where the newly created datapckage.json is
        stored
    clean: boolean
        If true, resources will be deleted
    """
    p = Package()

    p.descriptor["profile"] = "tabular-data-package"
    p.commit()

    for f in os.listdir(resource_path):
        path = os.path.join(resource_path, f)

        r = Resource(path)

        p.add_resource(r.descriptor)

        p.commit()

        if clean:
            os.remove(path)

    if clean:
        os.rmdir(resource_path)

    p.save(os.path.join(output_path, "datapackage.json"))
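A minimal usage sketch for the function above (assuming "import os" and "from datapackage import Package, Resource" are in scope, and that a resources/ directory holds one descriptor JSON per resource):

# Merge all descriptors in ./resources into ./datapackage.json,
# removing the intermediate files afterwards.
package_from_resources("resources", ".", clean=True)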
Example #2
    def save_datasets_as_data_packages(self, folder_path):
        """ save each dataset from a data.json source as _datapackage_ """
        for dataset in self.datasets:
            package = Package()

            # TODO check this, I'm learning datapackages
            resource = Resource({'data': dataset})
            resource.infer()  # adds "name": "inline"

            # FIXME: identifiers may contain characters that are invalid in
            # paths (e.g. /), and slugified paths from different resources
            # could collide; consider base64 or hashes instead.
            idf = slugify(dataset['identifier'])

            resource_path = os.path.join(folder_path,
                                         f'resource_data_json_{idf}.json')
            if not resource.valid:
                raise Exception('Invalid resource')

            resource.save(resource_path)

            package.add_resource(descriptor=resource.descriptor)
            package_path = os.path.join(folder_path,
                                        f'pkg_data_json_{idf}.zip')
            package.save(target=package_path)
Example #3
    def _process(self):
        for resource, descriptor in zip(self.resources,
                                        self.datapackage['resources']):
            if descriptor['name'] == 'messages':
                messages_datasets, output_resources = self.process_messages(
                    resource)
            else:
                raise Exception('unexpected resource: {}'.format(
                    descriptor['name']))
        dp = Package({'name': '_', 'resources': output_resources})
        dp.add_resource({
            'name': 'messages-datasets',
            'path': 'messages-datasets.csv',
            'dpp:streaming': True,
            'schema': {
                'fields': [{
                    'name': 'message_id',
                    'type': 'string'
                }, {
                    'name': 'dataset_name',
                    'type': 'string'
                }]
            }
        })
        return DataStream(dp, ((row for row in r)
                               for r in [messages_datasets]), {})
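The nested generator in the return statement yields one row iterator per output resource; a minimal stand-alone illustration of the same pattern, with made-up rows:

resources_rows = [[{'message_id': '1', 'dataset_name': 'a'}]]  # one resource, one row
streams = ((row for row in r) for r in resources_rows)
for resource_rows in streams:
    for row in resource_rows:
        print(row)  # {'message_id': '1', 'dataset_name': 'a'}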
Example #4
    def save_datasets_as_data_packages(self, folder_path):
        """ save each dataset from a data.json source as _datapackage_ """
        for dataset in self.package_list:
            package = Package()

            # TODO check this, I'm learning datapackages
            resource = Resource({'data': dataset})
            resource.infer()  # adds "name": "inline"

            identifier = dataset['id']
            bytes_identifier = identifier.encode('utf-8')
            encoded = base64.b64encode(bytes_identifier)
            encoded_identifier = str(encoded, "utf-8")

            resource_path = os.path.join(
                folder_path, f'resource_ckan_api_{encoded_identifier}.json')
            if not resource.valid:
                raise Exception('Invalid resource')

            resource.save(resource_path)

            package.add_resource(descriptor=resource.descriptor)
            package_path = os.path.join(
                folder_path, f'pkg_ckan_api_{encoded_identifier}.zip')
            package.save(target=package_path)
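The base64 step above is reversible; a sketch of the corresponding decode helper (hypothetical, not part of the original source):

import base64

def decode_identifier(encoded_identifier):
    # Reverse of the encode step above: base64 -> bytes -> utf-8 string.
    return base64.b64decode(encoded_identifier.encode('utf-8')).decode('utf-8')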
Example #5
def save_as_data_packages(row):
    """ save dataset from data.json as data package
        We will use this files as a queue to process later """
    # TODO check if ckanext-datapackager is useful for import
    # or export resources:
    # https://github.com/frictionlessdata/ckanext-datapackager

    package = Package()

    # TODO check this, I'm learning datapackages.
    resource = Resource({'data': row})
    resource.infer()  # adds "name": "inline"
    if not resource.valid:
        raise Exception('Invalid resource')

    encoded_identifier = encode_identifier(identifier=row['identifier'])

    # resource_path = os.path.join(path, f'{prefix}_{encoded_identifier}.json')
    # resource.save(resource_path)

    package.add_resource(descriptor=resource.descriptor)
    folder = config.get_data_packages_folder_path()
    filename = f'data-json-{encoded_identifier}.json'
    package_path = os.path.join(folder, filename)

    # do not rewrite if it already exists
    if not os.path.isfile(package_path):
        package.save(target=package_path)
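A hedged usage sketch (assumes the surrounding project's config and encode_identifier helpers; the field name follows data.json, and the row content is made up):

row = {'identifier': 'https://example.gov/dataset/123', 'title': 'Example dataset'}
save_as_data_packages(row)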
Example #6
def _make_package(source, publisher, config):

    os.chdir(source)
    files = [f for f in os.listdir('data') if f.endswith('.csv')]
    package = Package({'publisher': publisher})

    for f in files:
        path = f"data/{f}"
        name = f.replace('.csv', '')
        schema = f"https://raw.githubusercontent.com/digital-land/alpha-data/master/schema/{name}-schema.json"
        resource = Resource({'path': path, 'schema': schema})
        package.add_resource(resource.descriptor)

    package.commit()
    package.infer()

    errors = False
    for r in package.resources:
        try:
            r.read(keyed=True)
            r.check_relations()
        except (CastError, RelationError) as e:
            print('Error in', os.path.join(source, r.descriptor['path']))
            print(e, e.errors)
            errors = True
    if not errors:
        package.save('datapackage.zip')
        print('saved datapackage.zip to', source)

        s3 = boto3.client(
            's3',
            aws_access_key_id=config['AWS_ACCESS_KEY_ID'],
            aws_secret_access_key=config['AWS_SECRET_ACCESS_KEY'])

        bucket = 'developer-contributions-datapackages'
        key = f'{publisher}/{uuid.uuid4()}/datapackage.zip'
        s3.upload_file(f'{source}/datapackage.zip',
                       bucket,
                       key,
                       ExtraArgs={'ACL': 'public-read'})

        config = s3._client_config
        config.signature_version = botocore.UNSIGNED

        datapackage_url = boto3.resource(
            's3',
            config=config).meta.client.generate_presigned_url('get_object',
                                                              ExpiresIn=0,
                                                              Params={
                                                                  'Bucket':
                                                                  bucket,
                                                                  'Key': key
                                                              })

        return datapackage_url
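Since the object is uploaded with a public-read ACL, the unsigned presign with ExpiresIn=0 amounts to a plain public URL; an equivalent shortcut (assuming the default S3 endpoint) would be:

datapackage_url = f'https://{bucket}.s3.amazonaws.com/{key}'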
Example #7
    def convert_hdx_dataset(self, dataset_id, path):
        dataset = Dataset.read_from_hdx(dataset_id)
        package = Package({'id': dataset['id'], 'name': dataset['name'],
                           'title': dataset['title'],
                           'description': dataset['notes']})
        for hdx_resource in dataset.get_resources():
            name = hdx_resource['name'].lower().replace(' ', '_')
            package.add_resource({'name': name, 'path': hdx_resource['url'],
                                  'format': hdx_resource['format'].lower(),
                                  'title': hdx_resource['description']})
        try:
            package.infer()
        except tabulator.exceptions.FormatError:
            pass
        for frictionless_resource in package.descriptor['resources']:
            self.convert_hxl_url(frictionless_resource)
        package.commit()
        package.save(path)
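The resource name normalization above is a plain lower-case/underscore transform, e.g. (made-up resource name):

name = 'Admin Boundaries'.lower().replace(' ', '_')  # 'admin_boundaries'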
Example #8
def create_datapackage(submission_title, submission_files, metadata):
    minio_url = app.config.get('MINIO_ENDPOINT')
    package = Package({'name': submission_title})
    for submission in submission_files:
        package.add_resource({
            'name': clean_resource_name(submission['file_name']),
            'path': f'http://{minio_url}/minio/{BUCKET}/{submission["object_name"]}',
        })
    # TODO decide if we validate here?
    '''
    try:
        validate(package.descriptor)
    except Exception as e:
        raise e
    '''

    return package.descriptor
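One way to resolve the TODO, sketched with datapackage-py's top-level validate helper (assuming it is available in the installed version):

from datapackage import validate, exceptions

try:
    validate(package.descriptor)
except exceptions.ValidationError as exception:
    for error in exception.errors:
        print(error)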
Example #9
def update_package_descriptor():
    """
    """
    p = Package("datapackage.json")

    for f in os.listdir("resources"):
        path = os.path.join("resources", f)

        r = Resource(path)

        p.add_resource(r.descriptor)

        p.commit()

        os.remove(path)

    os.rmdir("resources")

    p.save("datapackage.json")
Example #10
    def save_datasets_as_data_packages(self, folder_path, identifier_field):
        """ save each dataset from a data.json source as _datapackage_ """
        for dataset in self.datasets:
            package = Package()

            # TODO check this, I'm learning datapackages
            resource = Resource({'data': dataset})
            resource.infer()  # adds "name": "inline"

            idf = slugify(dataset[identifier_field])

            resource_path = os.path.join(folder_path, f'resource_data_json_{idf}.json')
            if not resource.valid:
                raise Exception('Invalid resource')

            resource.save(resource_path)

            package.add_resource(descriptor=resource.descriptor)
            package_path = os.path.join(folder_path, f'pkg_data_json_{idf}.zip')
            package.save(target=package_path)
Example #11
    def convert_resources(self, resources):
        """Called when advertising resources through this connection *in the FORWARD direction*.
        Takes the initial list of resources advertised by the source item and returns a new list,
        which is the one finally advertised.

        At the moment it only packs CSVs into a datapackage (and again, it's only used in the FORWARD direction).

        Args:
            resources (list of ProjectItemResource): Resources to convert

        Returns:
            list of ProjectItemResource
        """
        if not self.use_datapackage:
            return resources
        # Split CSVs from the rest of resources
        final_resources = []
        csv_filepaths = []
        for r in resources:
            if r.hasfilepath and os.path.splitext(r.path)[1].lower() == ".csv":
                csv_filepaths.append(r.path)
                continue
            final_resources.append(r)
        if not csv_filepaths:
            return final_resources
        # Build Package from CSVs and add it to the resources
        base_path = os.path.dirname(os.path.commonpath(csv_filepaths))
        package = Package(base_path=base_path)
        for path in csv_filepaths:
            package.add_resource({"path": os.path.relpath(path, base_path)})
        package_path = os.path.join(base_path, "datapackage.json")
        package.save(package_path)
        package_resource = file_resource(self.source,
                                         package_path,
                                         label=f"datapackage@{self.source}")
        final_resources.append(package_resource)
        return final_resources
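For reference, the base path computed above is the parent of the deepest directory common to all CSV paths; a stand-alone illustration with made-up paths (assuming "import os"):

csv_filepaths = ['/data/a/x.csv', '/data/a/y.csv']
base_path = os.path.dirname(os.path.commonpath(csv_filepaths))  # '/data'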
Example #12
    def process_datapackage(self, dp: Package):
        if isinstance(self.load_source, tuple):
            datapackage_descriptor, _ = self.load_source
            dp.descriptor.setdefault('resources', [])
            self.resource_matcher = ResourceMatcher(self.resources,
                                                    datapackage_descriptor)
            for resource_descriptor in datapackage_descriptor['resources']:
                if self.resource_matcher.match(resource_descriptor['name']):
                    dp.add_resource(resource_descriptor)
        else:  # load_source is a string
            if self.load_source.startswith('env://'):
                env_var = self.load_source[6:]
                self.load_source = os.environ.get(env_var)
                if self.load_source is None:
                    raise ValueError(
                        f"Couldn't find value for env var '{env_var}'")
            if os.path.basename(self.load_source) == 'datapackage.json':
                self.load_dp = Package(self.load_source)
                self.resource_matcher = ResourceMatcher(
                    self.resources, self.load_dp)
                dp.descriptor.setdefault('resources', [])
                for resource in self.load_dp.resources:
                    if self.resource_matcher.match(resource.name):
                        dp.add_resource(resource.descriptor)
            else:
                if os.path.exists(self.load_source):
                    base_path = os.path.dirname(self.load_source) or '.'
                    self.load_source = os.path.basename(self.load_source)
                else:
                    base_path = None
                descriptor = dict(path=self.load_source,
                                  profile='tabular-data-resource')
                descriptor['format'] = self.options.get('format')
                if 'encoding' in self.options:
                    descriptor['encoding'] = self.options['encoding']
                if descriptor['format'] == 'xml' or self.load_source.endswith(
                        '.xml'):
                    self.options.setdefault('custom_parsers',
                                            {})['xml'] = XMLParser
                self.options.setdefault('ignore_blank_headers', True)
                self.options.setdefault('headers', 1)
                self.res = Resource(descriptor,
                                    base_path=base_path,
                                    **self.options)
                self.res.infer(confidence=1, limit=1000)
                if self.name is not None:
                    self.res.descriptor['name'] = self.name
                if self.force_strings:
                    for f in self.res.descriptor['schema']['fields']:
                        f['type'] = 'string'
                self.res.commit()
                self.res.descriptor['path'] = '{name}.{format}'.format(
                    **self.res.descriptor)
                dp.add_resource(self.res.descriptor)
        return dp
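A stand-alone illustration of the env:// convention handled above (hypothetical variable name):

import os

os.environ['MY_DATA'] = 'https://example.com/data.csv'
load_source = 'env://MY_DATA'
resolved = os.environ.get(load_source[6:])  # strip 'env://' -> 'https://example.com/data.csv'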
Example #13
def infer_metadata(
    package_name="default-name",
    keep_resources=False,
    foreign_keys={
        "bus": [
            "volatile",
            "dispatchable",
            "storage",
            "load",
            "reservoir",
            "shortage",
            "excess",
        ],
        "profile": ["load", "volatile", "ror"],
        "from_to_bus": ["connection", "line", "conversion"],
        "chp": ["backpressure", "extraction", "chp"],
    },
    path=None,
):
    """ Add basic meta data for a datapackage

    Parameters
    ----------
    package_name: string
        Name of the data package
    keep_resource: boolean
        Flag indicating of the resources meta data json-files should be kept
        after main datapackage.json is created. The reource meta data will
        be stored in the `resources` directory.
    foreign_keys: dict
        Dictionary with foreign key specification. Keys for dictionary are:
        'bus', 'profile', 'from_to_bus'. Values are list with
        strings with the name of the resources
    path: string
        Absoltue path to root-folder of the datapackage
    """
    current_path = os.getcwd()
    if path:
        print("Setting current work directory to {}".format(path))
        os.chdir(path)

    p = Package()
    p.descriptor["name"] = package_name
    p.descriptor["profile"] = "tabular-data-package"
    p.commit()
    if not os.path.exists("resources"):
        os.makedirs("resources")

    # create metadata for element resources
    if not os.path.exists("data/elements"):
        print("No data path found in directory {}. Skipping...".format(
            os.getcwd()))
    else:
        for f in os.listdir("data/elements"):
            r = Resource({"path": os.path.join("data/elements", f)})
            r.infer()
            r.descriptor["schema"]["primaryKey"] = "name"

            if r.name in foreign_keys.get("bus", []):
                r.descriptor["schema"]["foreignKeys"] = [{
                    "fields": "bus",
                    "reference": {
                        "resource": "bus",
                        "fields": "name"
                    },
                }]

                if r.name in foreign_keys.get("profile", []):
                    r.descriptor["schema"]["foreignKeys"].append({
                        "fields": "profile",
                        "reference": {
                            "resource": r.name + "_profile"
                        },
                    })

            elif r.name in foreign_keys.get("from_to_bus", []):
                r.descriptor["schema"]["foreignKeys"] = [
                    {
                        "fields": "from_bus",
                        "reference": {
                            "resource": "bus",
                            "fields": "name"
                        },
                    },
                    {
                        "fields": "to_bus",
                        "reference": {
                            "resource": "bus",
                            "fields": "name"
                        },
                    },
                ]

            elif r.name in foreign_keys.get("chp", []):
                r.descriptor["schema"]["foreignKeys"] = [
                    {
                        "fields": "fuel_bus",
                        "reference": {
                            "resource": "bus",
                            "fields": "name"
                        },
                    },
                    {
                        "fields": "electricity_bus",
                        "reference": {
                            "resource": "bus",
                            "fields": "name"
                        },
                    },
                    {
                        "fields": "heat_bus",
                        "reference": {
                            "resource": "bus",
                            "fields": "name"
                        },
                    },
                ]

            r.commit()
            r.save(os.path.join("resources", f.replace(".csv", ".json")))
            p.add_resource(r.descriptor)

    # create metadata for sequence resources
    if not os.path.exists("data/sequences"):
        print("No data path found in directory {}. Skipping...".format(
            os.getcwd()))
    else:
        for f in os.listdir("data/sequences"):
            r = Resource({"path": os.path.join("data/sequences", f)})
            r.infer()
            r.commit()
            r.save(os.path.join("resources", f.replace(".csv", ".json")))
            p.add_resource(r.descriptor)

    p.commit()
    p.save("datapackage.json")

    if not keep_resources:
        shutil.rmtree("resources")

    os.chdir(current_path)
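A minimal usage sketch (assumed layout: <path>/data/elements/*.csv and <path>/data/sequences/*.csv):

infer_metadata(package_name='my-datapackage', path='/path/to/datapackage')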
Example #14
# -*- coding: utf-8 -*-
"""
"""
import os
from datapackage import Package, Resource

p = Package('datapackage.json')

p.descriptor['profile'] = 'tabular-data-package'

for f in os.listdir('resources'):
    path = os.path.join('resources', f)

    r = Resource(path)

    p.add_resource(r.descriptor)

    p.commit()

    os.remove(path)

os.rmdir('resources')

p.save('datapackage.json')
Example #15
def build(config: Dict) -> Package:
    """Builds a datapackage.Datapackage object from a config dictionary.

    The configuration dictionary should contain the following keys:
    "metadata", "files".

    Information about the corresponding study can be placed in metadata.
    Example:
        {
            'metadata': {
                'name': 'ddionrails-study',
                'id': 'doi'
            }
        }
    The desired files to be included in the Tabular Data Package can be placed in 'files':
    Example:
        {
            'files': [
                'concepts.csv'
            ]
        }

    See: examples/example-config.yml

    The resulting Tabular Data Package is written to disk as 'datapackage.json' in
    the directory the command line tool is run.

    Args:
        config: The configuration of the datapackage to be created.

    Returns:
        The datapackage.Package, with errors logged if it is invalid.
    """

    if "metadata" not in config or "files" not in config:
        raise ValueError("Config must contain 'metadata' and 'files'")

    # Read the descriptor base dictionary from disk
    # and update it with values from the config file
    descriptor = read_yaml(DATAPACKAGE_BASE_FILE)
    descriptor["name"] = config["metadata"].get("name")
    descriptor["id"] = config["metadata"].get("id")
    descriptor["title"] = config["metadata"].get("title")
    # Remove empty keys from the dictionary
    descriptor = {key: value for key, value in descriptor.items() if value}

    # Create a Datapackage object from the descriptor dictionary
    package = Package(descriptor=descriptor)
    wanted_files = [file.split(".")[0] for file in config["files"]]
    for file in wanted_files:
        # If a filename ends with "_strict"
        # create the basic Tabular Data Resource first
        # then add the "stricter" rules from the "_strict" file
        if "_strict" in file:
            basic_file = file.replace("_strict", "")
            resource = read_tabular_data_resource(basic_file)
            strict_resource = read_tabular_data_resource(file)
            merge(resource, strict_resource)
        else:
            resource = read_tabular_data_resource(file)
        package.add_resource(resource)
    package.commit()
    if not package.valid:
        for error in package.errors:
            LOGGER.error(error)
    return package
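A usage sketch assembled from the docstring's own examples:

config = {
    'metadata': {'name': 'ddionrails-study', 'id': 'doi'},
    'files': ['concepts.csv'],
}
package = build(config)
print(package.valid)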
Example #16
def test_package_add_resource():
    package = Package({})
    resource = package.add_resource({'name': 'name', 'data': []})
    assert len(package.resources) == 1
    assert package.resources[0].name == 'name'
    assert resource.name == 'name'
Example #17
import sys

from datapackage import Package

if len(sys.argv) != 2:
    raise Exception(
        'This script expects only one parameter: the path to a datapackage.json file'
    )

dpkg = sys.argv[1]

print('Reading ' + dpkg)
## Read the created pipeline package
package = Package(dpkg)

# Iterate over a snapshot of the names, since resources are removed and
# re-added (and therefore reordered) inside the loop.
for name in list(package.resource_names):
    resource = package.get_resource(name)
    if not resource.remote:
        ## Infer the schema fields
        print('Inferring schema for ' + name)
        inferred_schema_pkg = Package()
        inferred_schema_pkg.infer(resource.source)
        inferred_resource = inferred_schema_pkg.get_resource(
            resource.descriptor['name'])
        resource.descriptor['schema'] = inferred_resource.descriptor['schema']
        package.remove_resource(resource.descriptor['name'])
        package.add_resource(resource.descriptor)
        package.commit()
        if not package.valid:
            raise Exception('Package is invalid')
        package.save(dpkg)