def test_pull_datapackage(storage, descriptor):
    # Prepare and call
    storage.buckets = ['data___data']
    storage.describe.return_value = (
        {'fields': [
            {'name': 'id', 'type': 'integer'},
            {'name': 'city', 'type': 'string'}]})
    storage.read.return_value = [
        (1, 'London'),
        (2, 'Paris'),
    ]
    module.pull_datapackage(descriptor=descriptor, name='name', backend='backend')
    # Assert pulled datapackage
    dp = DataPackage(descriptor)
    assert dp.descriptor == helpers.expand_data_package_descriptor(
        {'name': 'name',
         'resources': [
             {'path': ['data.csv'],
              'name': 'data',
              'schema':
                  {'fields': [
                      {'name': 'id', 'type': 'integer'},
                      {'name': 'city', 'type': 'string'}]}}]})
def get_fiscal_datapackage(skip_validation=False, source=None):
    """Create the master fiscal datapackage from parts."""

    with open(FISCAL_METADATA_FILE) as stream:
        fiscal_datapackage = yaml.load(stream.read())

    if source:
        datapackage = source
        datapackage['name'] = slugify(os.getcwd().lstrip(DATA_DIR)).lower()
    else:
        datapackage = fiscal_datapackage

    with open(FISCAL_SCHEMA_FILE) as stream:
        schema = yaml.load(stream.read())

    datapackage['resources'][0]['schema'] = schema
    datapackage['resources'][0].update(mediatype='text/csv')
    datapackage['resources'] = [datapackage['resources'][0]]
    # TODO: Update the resource properties in the fiscal data-package

    with open(FISCAL_MODEL_FILE) as stream:
        datapackage['model'] = yaml.load(stream.read())

    if not skip_validation:
        DataPackage(datapackage, schema='fiscal').validate()

    return datapackage
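# Usage sketch (illustrative, not from the original source): the function
# returns the assembled descriptor dict, already validated against the
# 'fiscal' schema unless skip_validation=True. Assumes the FISCAL_* constants
# point at existing YAML files.
fiscal_descriptor = get_fiscal_datapackage()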
def get(location):
    """
    Helper function to retrieve data from a data package located at the
    provided location.
    """
    datapkg = DataPackage(location)
    return datapkg.data
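# Hypothetical usage: the descriptor path below is illustrative only.
rows = get('path/to/datapackage.json')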
def datapackage(source, **options):
    errors = []
    tables = []

    # Prepare datapackage
    datapackage = DataPackage(source, **options)
    for exception in datapackage.iter_errors():
        # The error message should contain the datapackage source (often a path)
        message = spec['errors']['datapackage-error']['message']
        message = message.format(
            error_message='{problem} [{source}]'.format(
                problem=str(exception).splitlines()[0],
                source=str(source)))
        errors.append({
            'code': 'datapackage-error',
            'message': message,
            'row-number': None,
            'column-number': None,
        })

    # Add tables
    if not errors:
        for resource in datapackage.resources:
            path = resource.remote_data_path or resource.local_data_path
            tables.append({
                'source': path,
                'stream': Stream(path, headers=1),
                'schema': Schema(resource.descriptor['schema']),
                'extra': {
                    'datapackage': str(source),
                },
            })

    return errors, tables
def assemble(metadata_file):
    """Assemble a data-package from its descriptor parts."""

    def read(file):
        with open(file) as yaml:
            return load(yaml.read())

    def add_name(info):
        info['name'] = slugify(info['title'], separator='_')
        return info

    def get_files(filetype):
        filename = metadata_file.replace('metadata', filetype)
        folder = dirname(metadata_file)
        schema_files_pattern = join(folder, filename)
        return glob(schema_files_pattern)

    descriptor = add_name(read(metadata_file))
    resources = [add_name(read(file)) for file in get_files('resource')]
    model = get_files('model')

    descriptor['resources'] = resources
    if model and len(model) == 1:
        descriptor['model'] = model.pop()

    return DataPackage(descriptor)
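# Hypothetical usage: 'descriptors/metadata.yaml' is an illustrative path.
# The helper looks for sibling 'resource*' and 'model*' files next to the
# metadata file and merges them into one DataPackage.
package = assemble('descriptors/metadata.yaml')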
def load_data(pkgdir, engine):
    dpo = DataPackage(pkgdir)
    schema = dpo.resources[0].schema
    csvpath = pkgdir + dpo.resources[0].path
    data = [row for row in csv.DictReader(open(csvpath))]
    table = SchemaTable(engine, 'table', schema)
    table.create()
    table.load_iter(data)
def __init__(self, datapackage_dir, with_dependencies):
    self.datapackage_dir = datapackage_dir
    self.with_dependencies = with_dependencies
    datapackage_descriptor_file = os.path.join(datapackage_dir, "datapackage.json")
    with open(datapackage_descriptor_file) as f:
        descriptor = json.load(f)
    self.fix_descriptor(descriptor)
    self.datapackage = DataPackage(descriptor, default_base_path=self.datapackage_dir)
def _get_load_resources(self):
    # Resolve the requested resources once and cache them on the instance
    if not hasattr(self, "_load_resources"):
        self._load_resources = []
        for load_resource in self._parameters["load-resources"]:
            if os.path.exists(load_resource["url"]):
                datapackage = DataPackage(load_resource["url"])
                for resource in datapackage.resources:
                    if resource.descriptor["name"] == load_resource["resource"]:
                        self._load_resources.append(resource)
    return self._load_resources
def __new__(mcls, name, bases, attrs):
    cls = super(BaseMeta, mcls).__new__(mcls, name, bases, attrs)
    datapackage = attrs.get('__datapackage__')
    if datapackage:
        if isinstance(datapackage, basestring):
            datapackage = DataPackage(unicode(datapackage))
        resource_name = unicode(attrs.get('__resource__'))
        metadata = attrs.get('__metadata__', metadata_)
        mapper(cls, datapackage, resource_name, metadata)
        cls.__queryset__ = SQLAlchemyQuerySet
    return cls
def _load_dp(self, path):
    dppath = join(path, 'datapackage.json')
    # do we need to do this or is it done in the datapackage library?
    if not exists(dppath):
        raise DpmException(
            'No Data Package found at %s. Did not find datapackage.json at %s'
            % (path, dppath))
    dp = DataPackage(dppath)
    return dp
def import_package(storage, descriptor):
    """Import a Data Package into storage.

    Parameters
    ----------
    storage: object
        Storage object.
    descriptor: str
        Path to the descriptor.

    """
    # Init maps
    tables = []
    schemas = []
    datamap = {}
    mapping = {}

    # Init model
    model = DataPackage(descriptor)

    # Collect tables/schemas/data
    for resource in model.resources:
        name = resource.metadata.get('name', None)
        table = _convert_path(resource.metadata['path'], name)
        schema = resource.metadata['schema']
        data = resource.iter()
        tables.append(table)
        schemas.append(schema)
        datamap[table] = data
        if name is not None:
            mapping[name] = table
    schemas = _convert_schemas(mapping, schemas)

    # Create tables
    for table in tables:
        if storage.check(table):
            storage.delete(table)
    storage.create(tables, schemas)

    # Write data to tables
    for table in storage.tables:
        storage.write(table, datamap[table])
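# Hypothetical usage sketch: `storage` stands for an already-constructed
# storage object exposing the check/delete/create/write/tables interface used
# above; the descriptor path is illustrative.
import_package(storage, 'datapackage.json')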
def create_offline_client(paths, cachedir='.cached'):
    '''
    Establish an offline client for more up-to-date assessments than those
    published
    '''
    import pandas as pd
    from datapackage import DataPackage

    all_pkgs = {}
    for path in paths:
        pkg = DataPackage(path)
        for resource in map(format_patch, pkg.resources):
            if resource.name not in all_pkgs:
                all_pkgs[resource.name] = {
                    'schema': resource.descriptor['schema'],
                    'data': []
                }
            try:
                all_pkgs[resource.name]['data'] += resource.read(keyed=True)
            except Exception as e:
                print(
                    f"datapackage exception while reading from table: '{resource.name}'"
                )
                print(e.errors)
                raise e

    joined_pkgs = {}
    for resource_name, resource in all_pkgs.items():
        if resource['data']:
            data = pd.DataFrame(resource['data'])
        else:
            data = pd.DataFrame([], columns=[
                field['name'] for field in resource['schema']['fields']
            ])

        for field in resource['schema']['fields']:
            if field['type'] == 'datetime':
                data[field['name']] = pd.to_datetime(data[field['name']], utc=True)
            elif field['type'] in {'array', 'object'}:
                data[field['name']] = data[field['name']].apply(json.dumps)
        joined_pkgs[resource_name] = dict(resource, data=data)
    return DerivaCompatPkg(joined_pkgs, cachedir=cachedir)
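# Hypothetical usage: the descriptor paths are illustrative; each must point
# at a local datapackage.json whose resources can be read offline.
client = create_offline_client(['exports/pkg-a/datapackage.json',
                                'exports/pkg-b/datapackage.json'])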
def datavalidate(filepath, print_json):
    """
    Validate CSV file data, given its path, and print a validation report.

    If the file is a resource of the datapackage in the current dir, the
    datapackage.json schema is used for validation; otherwise the schema is
    inferred automatically.

    If no file path is given, validate all resource data in datapackage.json.
    """
    inspector = goodtables.Inspector(infer_schema=True)

    if exists('datapackage.json'):
        dp = DataPackage('datapackage.json')
    else:
        dp = None

    if not filepath and not dp:
        echo(
            '[ERROR] please provide a csv file path or run the command inside '
            'a datapackage dir.'
        )
        sys.exit(1)

    if filepath:
        schema = None
        if dp:
            # Try to find the schema in datapackage.json
            for resource in dp.resources:
                if resource.local_data_path == abspath(filepath):
                    schema = resource.descriptor.get('schema')
                    break
        report = inspector.inspect(filepath, schema=schema)
    else:
        # Validate the whole datapackage
        dprclient.validate_metadata(dp)
        report = dprclient.validate_data(dp)

    dprclient.print_inspection_report(report, print_json)

    if not report['valid']:
        sys.exit(1)
def __init__(self, dp_url):
    self.datapackage = DataPackage(dp_url)
    self.resource = self.datapackage.resources[0]
    descriptor = self.resource.descriptor
    self.type_name = descriptor['name']
    self._schema = descriptor['schema']
    fields = self._schema['fields']
    try:
        self.keys = self._schema['primaryKey']
    except KeyError:
        logger.exception('Failed to load %s', dp_url)
        raise
    if isinstance(self.keys, str):
        self.keys = [self.keys]
    self.date_fields = {}
    self.range_structure = {}
    for field in fields:
        if field.get("es:time-range"):
            self.date_fields[field["es:time-range"]] = field["name"]
    try:
        self.scoring_column = next(iter(
            filter(lambda f: 'es:score-column' in f, fields),
        ))['name']
    except StopIteration:
        self.scoring_column = '<none>'
    self._mapping_generator = MappingGenerator()
    try:
        self.mapping, self.search_fields = self.build_mapping(self._schema)
    except:  # noqa
        logger.exception('Failed to load %s', dp_url)
        raise
def test_support_criteria_parser():
    dp = DataPackage({
        "name": "support-criteria",
        "resources": [{
            "name": "criteria",
            "dialect": {
                "delimiter": ",",
                "doubleQuote": True,
                "lineTerminator": "\r\n",
                "quoteChar": '"',
                "skipInitialSpace": False
            },
            "encoding": "utf-8",
            "format": "csv",
            "path": "tests/support/criteria.csv",
            "schema": {
                "fields": [
                    # the original support-criteria fields
                    {"format": "%Y-%m-%d", "name": "date", "type": "date"},
                    {"name": "title", "type": "string"},
                    {"name": "paper_type", "type": "string"},
                    {"name": "office", "type": "string"},
                    {"format": "uri", "name": "pdf_url", "type": "string"},
                    # the expected data from the parser
                    {"name": "expected_purpose", "type": "string"}
                ]
            }
        }]
    })
    i = 0
    num_parsed = 0
    num_expected_purposes = 0
    for i, row in enumerate(dp.resources[0].iter(keyed=True)):
        parsed_row = parse_row(row)
        if len(parsed_row["purpose"]) > 0 and parsed_row["purpose"] != row["title"]:
            num_parsed += 1
        row_expected_purpose = row["expected_purpose"] if row["expected_purpose"] else ""
        if len(row_expected_purpose) > 0:
            num_expected_purposes += 1
            assert parsed_row["purpose"] == row_expected_purpose, "{}".format({
                "i": i,
                "row": row,
                "parsed_row": parsed_row,
                "expected_purpose": row_expected_purpose
            })
    assert i == 358
    assert num_expected_purposes > 20, \
        "not enough purposes were checked, might be something wrong with the criteria.csv file"
    assert num_parsed > 190, "not enough parsed rows"
def simpsons_datapackage(self, simpsons_descriptor_path):
    datapackage = DataPackage(descriptor=simpsons_descriptor_path)
    for r in datapackage.resources:
        sanitize_resource_schema(r)
    return datapackage
def __init__(self, datapackage, databackend):
    # Accept either a path/URL or an existing DataPackage instance
    if isinstance(datapackage, basestring):
        datapackage = DataPackage(unicode(datapackage))
    self.datapackage = datapackage
    self.models_maker = ModelsMaker(datapackage, backend=databackend)
    self._resources = {}
def simpsons_broken_datapackage(self, simpsons_broken_descriptor_path):
    return DataPackage(descriptor=simpsons_broken_descriptor_path)
    pipeline.register_processor('schema', options={'schema': schema})
    valid, report = pipeline.run()
    return valid, report


def validate_schema(package):
    try:
        package.validate()
        return []
    except (ValidationError, InvalidSchemaError, SchemaValidationError):
        for error in package.iter_errors():
            yield error.message


if __name__ == '__main__':
    package_ = DataPackage(SOURCE_SCHEMA)
    encoding_ = detect_encoding(SOURCE_DATA)
    errors_ = list(validate_schema(package_))

    source_df = read_excel(SOURCE_DATA, header=4, skiprows=range(0, 3))
    comment_lines = source_df.index[COMMENT_LINES]
    for i in COMMENT_LINES:
        print(list(source_df.iloc[i]))
    source_df = source_df.drop(comment_lines)

    source_csv = SOURCE_DATA.replace('xls', 'csv')
    source_df.to_csv(source_csv)
    odo(source_df, source_csv)

    if errors_:
        for message in errors_:
def test_assemble_fiscal_datapackage_returns_a_valid_fiscal_descriptor():
    datapackage = assemble_fiscal_datapackage()
    # The validation raises an exception if the validation fails
    assert DataPackage(datapackage, schema='fiscal').validate() is None
def load_fdp_to_db(self, package, callback=noop):
    """
    Load an FDP to the database, create a babbage model and save it as well

    :param package: URL for the datapackage.json
    :param callback: callback to use to send progress updates
    """
    self.callback = callback
    self.package = package

    # Load and validate the datapackage
    self.status_update(status=STATUS_LOADING_DATAPACKAGE)
    self.dpo = DataPackage(package)
    self.status_update(status=STATUS_VALIDATING_DATAPACKAGE)
    self.dpo.validate()
    self.status_update(status=STATUS_LOADING_RESOURCE)
    resource = self.dpo.resources[0]
    schema = resource.descriptor['schema']

    # Use the cube manager to get the table name
    self.datapackage_name = self.dpo.descriptor['name']
    datapackage_owner = self.dpo.descriptor['owner']
    datapackage_author = self.dpo.descriptor['author']

    # Get the full name from the author field, and rewrite it without the email
    self.fullname, email_addr = email.utils.parseaddr(datapackage_author)
    email_addr = email_addr.split('@')[0] + '@not.shown'
    self.dpo.descriptor['author'] = '{0} <{1}>'.format(self.fullname, email_addr)
    self.dpo.descriptor.setdefault('private', True)

    self.model_name = "{0}:{1}".format(datapackage_owner, self.datapackage_name)
    table_name = table_name_for_package(datapackage_owner, self.datapackage_name)

    try:
        all_fields = set()
        field_translation = {}
        # Process schema - slugify field names
        for field in schema['fields']:
            name = database_name(field['name'], all_fields)
            all_fields.add(name)
            translated_field = {'name': name, 'type': field['type']}
            field_translation[field['name']] = translated_field

        storage_schema = {
            'fields': [{
                'type': f['type'],
                'name': field_translation[f['name']]['name'],
                'format': f.get('format', 'default')
            } for f in schema['fields']],
            # Babbage likes just one primary key
            'primaryKey': '_id'
        }
        # Add primary key to schema
        storage_schema['fields'].insert(0, {
            'name': '_id',
            'type': 'integer'
        })

        # Create Babbage Model
        self.status_update(status=STATUS_CREATING_BABBAGE_MODEL)
        self.model = fdp_to_model(self.dpo, table_name, resource, field_translation)

        if self.check_hashes(resource):
            # Create indexes
            indexes = set()
            primary_keys = schema.get('primaryKey', [])
            for dim in self.dpo.descriptor.get('model', {}).get('dimensions', {}).values():
                attributes = dim.get('attributes', {})
                for attribute in attributes.values():
                    source = attribute.get('source')
                    if source in primary_keys:
                        indexes.add((field_translation[source]['name'],))
                    labelfor = attribute.get('labelfor')
                    if labelfor is not None:
                        labelfor = attributes.get(labelfor, {})
                        labelfor_source = labelfor.get('source')
                        if labelfor_source in primary_keys:
                            indexes.add((
                                field_translation[labelfor_source]['name'],
                                field_translation[source]['name'],
                            ))
            indexes = list(indexes)
            logging.error('INDEXES: %r', indexes)
            #
            # if dim['label'] in primary_keys:
            #     key_field = dim['attributes'][dim['key_attribute']]['label']
            #     key_field = field_translation[key_field]['name']
            #     indexes.append((key_field,))
            #
            #     label_field = dim['attributes'].get(dim.get('label_attribute'), {}).get('label')
            #     if label_field is not None:
            #         label_field = field_translation[label_field]['name']
            #         if label_field != key_field:
            #             indexes.append((key_field, label_field))

            # Load 1st resource data into the DB
            # We use the prefix name so that JTS-SQL doesn't load all table data into memory
            storage = Storage(self.engine, prefix=table_name)
            faux_table_name = ''
            if faux_table_name in storage.buckets:
                self.status_update(status=STATUS_DELETING_TABLE)
                storage.delete(faux_table_name)

            self.status_update(status=STATUS_CREATING_TABLE)
            indexes_fields = None
            if indexes:
                indexes_fields = [indexes]
            storage.create(faux_table_name, storage_schema,
                           indexes_fields=indexes_fields)

            self.status_update(status=STATUS_LOADING_DATA_READY)
            row_processor = RowProcessor(resource.iter(keyed=True),
                                         self.status_update,
                                         schema,
                                         self.dpo.descriptor)
            storage.write(faux_table_name, row_processor.iter())

        cache = get_os_cache()
        if cache is not None:
            logging.info('Clearing cache for context=%s', self.model_name)
            cache.clear(self.model_name)

        response = {
            'model_name': self.model_name,
            'babbage_model': self.model,
            'package': self.dpo.descriptor
        }
        self.status_update(status=STATUS_DONE, data=response)
    except Exception as e:
        logging.exception('LOADING FAILED')
        self.status_update(status=STATUS_FAIL, error=traceback.format_exc())
        return False

    return True
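# Hypothetical usage sketch: `loader` stands for whatever class owns this
# method (it must provide .engine, .status_update, .check_hashes and the other
# attributes used above); the descriptor URL is illustrative.
loader.load_fdp_to_db('https://example.org/my-budget/datapackage.json',
                      callback=print)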