Example #1
def test_pull_datapackage(storage, descriptor):

    # Prepare and call
    storage.buckets = ['data___data']
    storage.describe.return_value = (
        {'fields': [
            {'name': 'id', 'type': 'integer'},
            {'name': 'city', 'type': 'string'}]})
    storage.read.return_value = [
        (1, 'London'),
        (2, 'Paris'),
    ]
    module.pull_datapackage(descriptor=descriptor, name='name', backend='backend')

    # Assert pulled datapackage
    dp = DataPackage(descriptor)
    assert dp.descriptor == helpers.expand_data_package_descriptor(
        {'name': 'name',
        'resources': [
            {'path': ['data.csv'],
             'name': 'data',
             'schema':
                {'fields': [
                    {'name': 'id', 'type': 'integer'},
                    {'name': 'city', 'type': 'string'}]}}]})
Example #2
def get_fiscal_datapackage(skip_validation=False, source=None):
    """Create the master fiscal datapackage from parts."""

    with open(FISCAL_METADATA_FILE) as stream:
        fiscal_datapackage = yaml.load(stream.read())

    if source:
        datapackage = source
        datapackage['name'] = slugify(os.getcwd().lstrip(DATA_DIR)).lower()
    else:
        datapackage = fiscal_datapackage

    with open(FISCAL_SCHEMA_FILE) as stream:
        schema = yaml.load(stream.read())
        datapackage['resources'][0]['schema'] = schema
        datapackage['resources'][0].update(mediatype='text/csv')
        datapackage['resources'] = [datapackage['resources'][0]]

        # TODO: Update the resource properties in the fiscal data-package

    with open(FISCAL_MODEL_FILE) as stream:
        datapackage['model'] = yaml.load(stream.read())

    if not skip_validation:
        DataPackage(datapackage, schema='fiscal').validate()

    return datapackage
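
A minimal usage sketch, assuming the module-level constants (FISCAL_METADATA_FILE, FISCAL_SCHEMA_FILE, FISCAL_MODEL_FILE, DATA_DIR) point at existing files; the source descriptor passed in the second call is hypothetical:

# Build and validate the master descriptor from the YAML parts.
fiscal_dp = get_fiscal_datapackage()

# Reuse an existing descriptor as the base and skip the fiscal validation step.
fiscal_dp = get_fiscal_datapackage(
    skip_validation=True,
    source={'resources': [{'path': 'data.csv'}]})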
Example #3
def get(location):
    """
    Helper function to retreive data from a data package located at the
    provided location.
    """
    datapkg = DataPackage(location)
    return datapkg.data
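
A usage sketch with a hypothetical descriptor path; the helper simply returns whatever the package exposes as data:

# 'path/to/datapackage.json' is hypothetical; a URL would work the same way.
data = get('path/to/datapackage.json')
print(data)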
Example #4
def datapackage(source, **options):
    errors = []
    tables = []

    # Prepare datapackage
    datapackage = DataPackage(source, **options)
    for exception in datapackage.iter_errors():
        # Error message should contain the datapackage source (often it's a path)
        message = spec['errors']['datapackage-error']['message']
        message = message.format(
            error_message='{problem} [{source}]'.format(
                problem=str(exception).splitlines()[0],
                source=str(source)))
        errors.append({
            'code': 'datapackage-error',
            'message': message,
            'row-number': None,
            'column-number': None,
        })

    # Add tables
    if not errors:
        for resource in datapackage.resources:
            path = resource.remote_data_path or resource.local_data_path
            tables.append({
                'source': path,
                'stream': Stream(path, headers=1),
                'schema': Schema(resource.descriptor['schema']),
                'extra': {
                    'datapackage': str(source),
                },
            })

    return errors, tables
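
A usage sketch with a hypothetical descriptor path; the function returns descriptor-level errors plus one table entry per resource when the descriptor is valid:

errors, tables = datapackage('path/to/datapackage.json')  # hypothetical path
for error in errors:
    print(error['code'], error['message'])
for table in tables:
    print(table['source'], table['extra']['datapackage'])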
Example #5
def assemble(metadata_file):
    """Assemble a data-package from its descriptor parts."""

    def read(file):
        with open(file) as yaml:
            return load(yaml.read())

    def add_name(info):
        info['name'] = slugify(info['title'], separator='_')
        return info

    def get_files(filetype):
        filename = metadata_file.replace('metadata', filetype)
        folder = dirname(metadata_file)
        schema_files_pattern = join(folder, filename)
        return glob(schema_files_pattern)

    descriptor = add_name(read(metadata_file))
    resources = [add_name(read(file)) for file in get_files('resource')]
    model = get_files('model')

    descriptor['resources'] = resources
    if model and len(model) == 1:
        descriptor['model'] = model.pop()

    return DataPackage(descriptor)
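
A usage sketch, assuming the descriptor parts sit next to a hypothetical metadata.yaml and follow the metadata/resource/model naming convention that get_files relies on:

# 'metadata.yaml' is hypothetical; resource and model parts are found by filename.
package = assemble('metadata.yaml')
print(package.descriptor['name'], len(package.descriptor['resources']))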
Example #6
def load_data(pkgdir, engine):
    dpo = DataPackage(pkgdir)
    schema = dpo.resources[0].schema
    csvpath = pkgdir + dpo.resources[0].path
    data = [row for row in csv.DictReader(open(csvpath))]
    table = SchemaTable(engine, 'table', schema)
    table.create()
    table.load_iter(data)
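
A usage sketch, assuming a SQLAlchemy engine and a hypothetical package directory whose first resource path resolves to a local CSV file:

from sqlalchemy import create_engine

engine = create_engine('sqlite://')  # in-memory database for the sketch
# pkgdir is concatenated with the resource path above, so keep the trailing slash.
load_data('path/to/package/', engine)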
Example #7
def __init__(self, datapackage_dir, with_dependencies):
    self.datapackage_dir = datapackage_dir
    self.with_dependencies = with_dependencies
    datapackage_descriptor_file = os.path.join(datapackage_dir,
                                               "datapackage.json")
    with open(datapackage_descriptor_file) as f:
        descriptor = json.load(f)
    self.fix_descriptor(descriptor)
    self.datapackage = DataPackage(descriptor,
                                   default_base_path=self.datapackage_dir)
Example #8
def _get_load_resources(self):
    if not hasattr(self, "_load_resources"):
        self._load_resources = []
        for load_resource in self._parameters["load-resources"]:
            if os.path.exists(load_resource["url"]):
                datapackage = DataPackage(load_resource["url"])
                for resource in datapackage.resources:
                    if resource.descriptor["name"] == load_resource["resource"]:
                        self._load_resources.append(resource)
    return self._load_resources
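
A sketch of the "load-resources" parameter shape this step expects; the URL and resource name are hypothetical:

parameters = {
    "load-resources": [
        # Each entry points at a descriptor and names the resource to load from it.
        {"url": "path/to/datapackage.json", "resource": "my-resource"},
    ],
}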
Example #9
def __new__(mcls, name, bases, attrs):
    cls = super(BaseMeta, mcls).__new__(mcls, name, bases, attrs)
    datapackage = attrs.get('__datapackage__')
    if datapackage:
        if isinstance(datapackage, basestring):
            datapackage = DataPackage(unicode(datapackage))
        resource_name = unicode(attrs.get('__resource__'))
        metadata = attrs.get('__metadata__', metadata_)
        mapper(cls, datapackage, resource_name, metadata)
        cls.__queryset__ = SQLAlchemyQuerySet
    return cls
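
A minimal sketch of a class that opts in to this metaclass (Python 2, given the use of basestring/unicode); the descriptor path and resource name are hypothetical:

class City(object):
    __metaclass__ = BaseMeta  # triggers the resource mapping in __new__
    __datapackage__ = 'path/to/datapackage.json'
    __resource__ = 'cities'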
Example #10
    def _load_dp(self, path):
        dppath = join(path, 'datapackage.json')

        # do we need to do this or is it done in datapackage library?
        if not exists(dppath):
            raise DpmException(
                'No Data Package found at %s. Did not find datapackage.json at %s'
                % (path, dppath))

        dp = DataPackage(dppath)
        return dp
Example #11
def import_package(storage, descriptor):
    """Import Data Package to storage.

    Parameters
    ----------
    storage: object
        Storage object.
    descriptor: str
        Path to descriptor.

    """

    # Init maps
    tables = []
    schemas = []
    datamap = {}
    mapping = {}

    # Init model
    model = DataPackage(descriptor)

    # Collect tables/schemas/data
    for resource in model.resources:
        name = resource.metadata.get('name', None)
        table = _convert_path(resource.metadata['path'], name)
        schema = resource.metadata['schema']
        data = resource.iter()
        tables.append(table)
        schemas.append(schema)
        datamap[table] = data
        if name is not None:
            mapping[name] = table
    schemas = _convert_schemas(mapping, schemas)

    # Create tables
    for table in tables:
        if storage.check(table):
            storage.delete(table)
    storage.create(tables, schemas)

    # Write data to tables
    for table in storage.tables:
        storage.write(table, datamap[table])
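
A usage sketch in the mock-based style of Example #1; the storage stand-in only needs the check/delete/create/write calls and the tables attribute used above, and the descriptor path is hypothetical:

from unittest import mock

storage = mock.Mock()
storage.check.return_value = False  # no existing tables to delete
storage.tables = []                 # nothing is written in this sketch
import_package(storage, descriptor='path/to/datapackage.json')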
Example #12
def create_offline_client(paths, cachedir='.cached'):
    '''Establish an offline client for more up-to-date assessments than those
    published.'''
    import pandas as pd
    from datapackage import DataPackage
    all_pkgs = {}
    for path in paths:
        pkg = DataPackage(path)
        for resource in map(format_patch, pkg.resources):
            if resource.name not in all_pkgs:
                all_pkgs[resource.name] = {
                    'schema': resource.descriptor['schema'],
                    'data': []
                }
            #
            try:
                all_pkgs[resource.name]['data'] += resource.read(keyed=True)
            except Exception as e:
                print(
                    f"datapackage exception while reading from table: '{resource.name}'"
                )
                print(e.errors)
                raise e
    #
    joined_pkgs = {}
    for resource_name, resource in all_pkgs.items():
        if resource['data']:
            data = pd.DataFrame(resource['data'])
        else:
            data = pd.DataFrame([],
                                columns=[
                                    field['name']
                                    for field in resource['schema']['fields']
                                ])
        #
        for field in resource['schema']['fields']:
            if field['type'] == 'datetime':
                data[field['name']] = pd.to_datetime(data[field['name']],
                                                     utc=True)
            elif field['type'] in {'array', 'object'}:
                data[field['name']] = data[field['name']].apply(json.dumps)
        joined_pkgs[resource_name] = dict(resource, data=data)
    return DerivaCompatPkg(joined_pkgs, cachedir=cachedir)
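
A usage sketch with hypothetical descriptor paths; resources with the same name are concatenated across packages and returned wrapped in the project-specific DerivaCompatPkg:

client = create_offline_client(
    ['path/to/datapackage.json', 'path/to/other/datapackage.json'],
    cachedir='.cached')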
Example #13
def datavalidate(filepath, print_json):
    """
    Validate csv file data, given its path. Print validation report. If the file is
    a resource of the datapackage in current dir, will use datapackage.json schema for
    validation; otherwise infer the schema automatically.
    If no file path is given, validate all resources data in datapackage.json.
    """
    inspector = goodtables.Inspector(infer_schema=True)

    if exists('datapackage.json'):
        dp = DataPackage('datapackage.json')
    else:
        dp = None

    if not filepath and not dp:
        echo(
            '[ERROR] please provide csv file path or run command inside a datapackage dir.'
        )
        sys.exit(1)

    if filepath:
        schema = None
        if dp:
            # Try to find schema in the datapackage.json
            for resource in dp.resources:
                if resource.local_data_path == abspath(filepath):
                    schema = resource.descriptor.get('schema')
                    break

        report = inspector.inspect(filepath, schema=schema)
    else:
        # Validate whole datapackage
        dprclient.validate_metadata(dp)
        report = dprclient.validate_data(dp)

    dprclient.print_inspection_report(report, print_json)
    if not report['valid']:
        sys.exit(1)
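
A usage sketch, assuming the function can be called directly (any CLI wiring is not shown) from inside a data package directory; the CSV path is hypothetical and the process exits non-zero when validation fails:

# Validate one file, reusing its schema from datapackage.json when it is a resource.
datavalidate('data/my-resource.csv', print_json=False)

# Validate every resource declared in datapackage.json.
datavalidate(None, print_json=True)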
Example #14
    def __init__(self, dp_url):
        self.datapackage = DataPackage(dp_url)
        self.resource = self.datapackage.resources[0]
        descriptor = self.resource.descriptor
        self.type_name = descriptor['name']

        self._schema = descriptor['schema']
        fields = self._schema['fields']

        try:
            self.keys = self._schema['primaryKey']
        except KeyError:
            logger.exception('Failed to load %s', dp_url)
            raise
        if isinstance(self.keys, str):
            self.keys = [self.keys]

        self.date_fields = {}
        self.range_structure = {}

        for field in fields:
            if field.get("es:time-range"):
                self.date_fields[field["es:time-range"]] = field["name"]

        try:
            self.scoring_column = next(iter(
                filter(lambda f: 'es:score-column' in f, fields),
            ))['name']
        except StopIteration:
            self.scoring_column = '<none>'
        self._mapping_generator = MappingGenerator()
        try:
            self.mapping, self.search_fields = self.build_mapping(self._schema)
        except: #noqa
            logger.exception('Failed to load %s', dp_url)
            raise
Example #15
def test_support_criteria_parser():
    dp = DataPackage({
        "name": "support-criteria",
        "resources": [{
            "name": "criteria",
            "dialect": {
                "delimiter": ",",
                "doubleQuote": True,
                "lineTerminator": "\r\n",
                "quoteChar": '"',
                "skipInitialSpace": False
            },
            "encoding": "utf-8",
            "format": "csv",
            "path": "tests/support/criteria.csv",
            "schema": {
                "fields": [
                    # the original support-criteria fields
                    {
                        "format": "%Y-%m-%d",
                        "name": "date",
                        "type": "date"
                    },
                    {
                        "name": "title",
                        "type": "string"
                    },
                    {
                        "name": "paper_type",
                        "type": "string"
                    },
                    {
                        "name": "office",
                        "type": "string"
                    },
                    {
                        "format": "uri",
                        "name": "pdf_url",
                        "type": "string"
                    },
                    # the expected data from the parser
                    {
                        "name": "expected_purpose",
                        "type": "string"
                    }
                ]
            }
        }]
    })
    i = 0
    num_parsed = 0
    num_expected_purposes = 0
    for i, row in enumerate(dp.resources[0].iter(keyed=True)):
        parsed_row = parse_row(row)
        if len(parsed_row["purpose"]
               ) > 0 and parsed_row["purpose"] != row["title"]:
            num_parsed += 1
        row_expected_purpose = (row["expected_purpose"]
                                if row["expected_purpose"] else "")
        if len(row_expected_purpose) > 0:
            num_expected_purposes += 1
            assert parsed_row["purpose"] == row_expected_purpose, "{}".format({
                "i":
                i,
                "row":
                row,
                "parsed_row":
                parsed_row,
                "expected_purpose":
                row_expected_purpose
            })
    assert i == 358
    assert num_expected_purposes > 20, "not enough purposes were checked, might be something wrong with the criteria.csv file"
    assert num_parsed > 190, "not enough parsed rows"
Example #16
def simpsons_datapackage(self, simpsons_descriptor_path):
    datapackage = DataPackage(descriptor=simpsons_descriptor_path)
    for r in datapackage.resources:
        sanitize_resource_schema(r)
    return datapackage
Example #17
def __init__(self, datapackage, databackend):
    if isinstance(datapackage, basestring):
        datapackage = DataPackage(unicode(datapackage))
    self.datapackage = datapackage
    self.models_maker = ModelsMaker(datapackage, backend=databackend)
    self._resources = {}
Example #18
def simpsons_broken_datapackage(self, simpsons_broken_descriptor_path):
    return DataPackage(descriptor=simpsons_broken_descriptor_path)
Example #19
    pipeline.register_processor('schema', options={'schema': schema})
    valid, report = pipeline.run()
    return valid, report


def validate_schema(package):
    try:
        package.validate()
        return []
    except (ValidationError, InvalidSchemaError, SchemaValidationError):
        for error in package.iter_errors():
            yield error.message


if __name__ == '__main__':
    package_ = DataPackage(SOURCE_SCHEMA)
    encoding_ = detect_encoding(SOURCE_DATA)
    errors_ = list(validate_schema(package_))

    source_df = read_excel(SOURCE_DATA, header=4, skiprows=range(0, 3))
    comment_lines = source_df.index[COMMENT_LINES]
    for i in COMMENT_LINES:
        print(list(source_df.iloc[i]))
    source_df = source_df.drop(comment_lines)
    source_csv = SOURCE_DATA.replace('xls', 'csv')
    source_df.to_csv(source_csv)

    odo(source_df, source_csv)

    if errors_:
        for message in errors_:
            print(message)
Example #20
def test_assemble_fiscal_datapackage_returns_a_valid_fiscal_descriptor():
    datapackage = assemble_fiscal_datapackage()
    # validate() raises an exception if validation fails
    assert DataPackage(datapackage, schema='fiscal').validate() is None
Example #21
    def load_fdp_to_db(self, package, callback=noop):
        """
        Load an FDP into the database, create a Babbage model, and save it as well.
        :param package: URL for the datapackage.json
        :param callback: callback to use to send progress updates
        """

        self.callback = callback
        self.package = package

        # Load and validate the datapackage
        self.status_update(status=STATUS_LOADING_DATAPACKAGE)
        self.dpo = DataPackage(package)
        self.status_update(status=STATUS_VALIDATING_DATAPACKAGE)
        self.dpo.validate()
        self.status_update(status=STATUS_LOADING_RESOURCE)
        resource = self.dpo.resources[0]
        schema = resource.descriptor['schema']

        # Use the cube manager to get the table name
        self.datapackage_name = self.dpo.descriptor['name']
        datapackage_owner = self.dpo.descriptor['owner']
        datapackage_author = self.dpo.descriptor['author']

        # Get the full name from the author field, and rewrite it without the email
        self.fullname, email_addr = email.utils.parseaddr(datapackage_author)
        email_addr = email_addr.split('@')[0] + '@not.shown'
        self.dpo.descriptor['author'] = '{0} <{1}>'.format(
            self.fullname, email_addr)
        self.dpo.descriptor.setdefault('private', True)

        self.model_name = "{0}:{1}".format(datapackage_owner,
                                           self.datapackage_name)
        table_name = table_name_for_package(datapackage_owner,
                                            self.datapackage_name)

        try:
            all_fields = set()
            field_translation = {}
            # Process schema - slugify field names
            for field in schema['fields']:
                name = database_name(field['name'], all_fields)
                all_fields.add(name)
                translated_field = {'name': name, 'type': field['type']}
                field_translation[field['name']] = translated_field

            storage_schema = {
                'fields': [{
                    'type': f['type'],
                    'name': field_translation[f['name']]['name'],
                    'format': f.get('format', 'default')
                } for f in schema['fields']],
                # Babbage likes just one primary key
                'primaryKey': '_id'
            }

            # Add Primary key to schema
            storage_schema['fields'].insert(0, {
                'name': '_id',
                'type': 'integer'
            })

            # Create Babbage Model
            self.status_update(status=STATUS_CREATING_BABBAGE_MODEL)
            self.model = fdp_to_model(self.dpo, table_name, resource,
                                      field_translation)

            if self.check_hashes(resource):
                # Create indexes
                indexes = set()
                primary_keys = schema.get('primaryKey', [])
                for dim in self.dpo.descriptor.get('model',
                                                   {}).get('dimensions',
                                                           {}).values():
                    attributes = dim.get('attributes', {})
                    for attribute in attributes.values():
                        source = attribute.get('source')
                        if source in primary_keys:
                            indexes.add((field_translation[source]['name'], ))
                        labelfor = attribute.get('labelfor')
                        if labelfor is not None:
                            labelfor = attributes.get(labelfor, {})
                            labelfor_source = labelfor.get('source')
                            if labelfor_source in primary_keys:
                                indexes.add((
                                    field_translation[labelfor_source]['name'],
                                    field_translation[source]['name'],
                                ))
                indexes = list(indexes)
                logging.error('INDEXES: %r', indexes)
                #
                # if dim['label'] in primary_keys:
                #     key_field = dim['attributes'][dim['key_attribute']]['label']
                #     key_field = field_translation[key_field]['name']
                #     indexes.append((key_field,))
                #
                #     label_field = dim['attributes'].get(dim.get('label_attribute'), {}).get('label')
                #     if label_field is not None:
                #         label_field = field_translation[label_field]['name']
                #         if label_field != key_field:
                #             indexes.append((key_field, label_field))

                # Load 1st resource data into DB
                # We use the prefix name so that JTS-SQL doesn't load all table data into memory
                storage = Storage(self.engine, prefix=table_name)
                faux_table_name = ''
                if faux_table_name in storage.buckets:
                    self.status_update(status=STATUS_DELETING_TABLE)
                    storage.delete(faux_table_name)
                self.status_update(status=STATUS_CREATING_TABLE)
                indexes_fields = None
                if indexes:
                    indexes_fields = [indexes]
                storage.create(faux_table_name,
                               storage_schema,
                               indexes_fields=indexes_fields)

                self.status_update(status=STATUS_LOADING_DATA_READY)
                row_processor = RowProcessor(resource.iter(keyed=True),
                                             self.status_update, schema,
                                             self.dpo.descriptor)
                storage.write(faux_table_name, row_processor.iter())

                cache = get_os_cache()
                if cache is not None:
                    logging.info('Clearing cache for context=%s',
                                 self.model_name)
                    cache.clear(self.model_name)

            response = {
                'model_name': self.model_name,
                'babbage_model': self.model,
                'package': self.dpo.descriptor
            }
            self.status_update(status=STATUS_DONE, data=response)

        except Exception as e:
            logging.exception('LOADING FAILED')
            self.status_update(status=STATUS_FAIL,
                               error=traceback.format_exc())
            return False

        return True
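
A sketch of a progress callback, assuming the loader forwards status updates as keyword arguments; the loader instance itself (with its engine and status_update plumbing) is not part of this example:

def print_progress(**kwargs):
    # Receives whatever the loader forwards, e.g. status, data or error.
    print('progress:', kwargs.get('status'), kwargs.get('error'))

# loader.load_fdp_to_db('https://example.com/datapackage.json', callback=print_progress)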