def check_resource_schema(self, default_resource, resource):
    """Check that user resource schema contains all the mandatory fields"""

    def get_uncustomizable_fields(schema):
        uncustomizable = ['constraints', 'format', 'name', 'type']
        field_filter = lambda field: {key: val for key, val in field.items()
                                      if key in uncustomizable}
        fields = [field_filter(field) for field in schema.fields]
        fields = sorted(fields, key=lambda k: k['name'])
        # Without this return the comparison below would always compare None
        return fields

    resource_schema = SchemaModel(resource.descriptor['schema'])
    default_schema_dict = default_resource.descriptor['schema']
    if default_resource.descriptor['name'] == 'source_file':
        for field in default_schema_dict['fields']:
            if field['name'] == 'data':
                field['name'] = self.data_key
    default_schema = SchemaModel(default_schema_dict)

    if default_resource.descriptor['name'] in self.inflexible_resources:
        if get_uncustomizable_fields(default_schema) != \
                get_uncustomizable_fields(resource_schema):
            msg = ('The fields for "{0}" are not subject to '
                   'change').format(resource.local_data_path)
            raise ValueError(msg, resource.local_data_path)
    else:
        required_headers = set(default_schema.required_headers)
        resource_headers = set(resource_schema.headers)
        if not required_headers.issubset(resource_headers):
            missing_headers = required_headers.difference(resource_headers)
            msg = ('Fields [{0}] are needed for internal processing '
                   'but are missing from {1}.'
                   ).format(','.join(missing_headers), resource.local_data_path)
            raise ValueError(msg, resource.local_data_path)
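# A minimal, self-contained sketch of the header check above, outside the class
# context. The schema dicts are invented, and it assumes required_headers
# reflects constraints.required, as the method above implies.
from jsontableschema.model import SchemaModel

default_schema = SchemaModel({'fields': [
    {'name': 'id', 'type': 'string', 'constraints': {'required': True}},
    {'name': 'note', 'type': 'string'},
]})
user_schema = SchemaModel({'fields': [{'name': 'note', 'type': 'string'}]})

required = set(default_schema.required_headers)   # {'id'}
present = set(user_schema.headers)                # {'note'}
assert required.difference(present) == {'id'}     # 'id' would be reported missing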
def __init__(self, infile, spec, orig_spec, validate=False, debug=False):
    self.spec = spec
    self.table_schema = SchemaModel(orig_spec['schema'])
    self.validate = validate
    self.infile = infile
    self.debug = debug
    self.stopped = False
def export_package(storage, descriptor, datapackage_name):
    """Export Data Package from storage.

    Parameters
    ----------
    storage: object
        Storage object.
    descriptor: str
        Path where to store descriptor.
    datapackage_name: str
        Name of the exported datapackage.

    """
    # Iterate over tables
    resources = []
    mapping = {}
    for table in storage.tables:

        # Prepare
        schema = storage.describe(table)
        base = os.path.dirname(descriptor)
        path, name = _restore_path(table)
        fullpath = os.path.join(base, path)
        if name is not None:
            mapping[table] = name

        # Write data
        _ensure_dir(fullpath)
        with io.open(fullpath,
                     mode=_write_mode,
                     newline=_write_newline,
                     encoding=_write_encoding) as file:
            model = SchemaModel(deepcopy(schema))
            data = storage.read(table)
            writer = csv.writer(file)
            writer.writerow(model.headers)
            for row in data:
                writer.writerow(row)

        # Add resource
        resource = {'schema': schema, 'path': path}
        if name is not None:
            resource['name'] = name
        resources.append(resource)

    # Write descriptor
    resources = _restore_resources(mapping, resources)
    _ensure_dir(descriptor)
    with io.open(descriptor,
                 mode=_write_mode,
                 encoding=_write_encoding) as file:
        descriptor = {
            'name': datapackage_name,
            'resources': resources,
        }
        json.dump(descriptor, file, indent=4)
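# Hypothetical call site for export_package; the storage object and paths are
# placeholders, assuming a Storage implementation like the ones used in the
# other snippets here:
# storage = plugin.Storage(**backend_options)
# export_package(storage, 'build/datapackage.json', 'my-datapackage')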
def __init__(self, schema):
    self.data = schema
    self.schema_model = SchemaModel(schema)
    self.fields = [SchemaField(f) for f in self.schema_model.fields]
    self.species_fields = self.find_species_fields(self)
def schema_validator(resource):
    schema = SchemaModel(resource.spec['schema'])
    for row in resource:
        for k, v in row.items():
            try:
                schema.cast(k, v)
            except InvalidCastError:
                logging.error('Bad value %r for field %s', v, k)
                raise
        yield row
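# A small, self-contained illustration of the cast step the validator relies on
# (field names and values invented; import paths follow the jsontableschema
# package, which provides the InvalidCastError caught above):
from jsontableschema.exceptions import InvalidCastError
from jsontableschema.model import SchemaModel

schema = SchemaModel({'fields': [{'name': 'count', 'type': 'integer'}]})
assert schema.cast('count', '42') == 42
try:
    schema.cast('count', 'not-a-number')
except InvalidCastError:
    pass  # bad values raise, which schema_validator turns into a logged error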
def update_sources_period(self, new_sources):
    """Overwrite source_file with the identified period_id"""
    source_resource = utilities.get_datapackage_resource(self.source_file,
                                                         self.datapackage)
    source_idx = self.datapackage.resources.index(source_resource)
    source_schema_dict = self.datapackage.resources[source_idx].descriptor['schema']
    updates = {'fields': [{'name': 'period_id',
                           'type': 'string',
                           'title': 'The period source data is relevant for.'}]}
    utilities.deep_update_dict(source_schema_dict, updates)
    source_schema = SchemaModel(source_schema_dict)
    with compat.UnicodeWriter(self.source_file) as source_file:
        source_file.writerow(source_schema.headers)
        for row in utilities.dicts_to_schema_rows(new_sources, source_schema):
            source_file.writerow(row)
def assert_conforms_to_schema(schema, doc):
    assert isinstance(doc, dict), "invalid doc: {}".format(doc)
    row = [doc[field["name"]] for field in schema["fields"]]
    try:
        Schema(schema).cast_row(row)
    except Exception as e:
        logging.exception(e)
        raise Exception(
            "row does not conform to schema\nrow='{}'\nschema='{}'".format(
                json.dumps(row), json.dumps(schema)))
    schema_model = SchemaModel(schema)
    res = {}
    for k, v in doc.items():
        try:
            res[k] = schema_model.cast(k, v)
        except Exception as e:
            logging.exception(e)
            raise Exception("doc attribute '{}' with value '{}' "
                            "does not conform to schema '{}'".format(
                                *map(json.dumps, [k, v, schema])))
    return res
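# Example invocation with a toy schema/doc pair (both invented; assumes the
# Schema/SchemaModel/json/logging imports the function above already relies on
# are in scope):
doc_schema = {'fields': [{'name': 'id', 'type': 'integer'},
                         {'name': 'name', 'type': 'string'}]}
cast_doc = assert_conforms_to_schema(doc_schema, {'id': '7', 'name': 'ada'})
assert cast_doc == {'id': 7, 'name': 'ada'}  # values come back cast to schema types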
def verify_csvimport(args):
    if not os.path.exists(args.csvfile):
        LOG.error('input CSV file %s does not exist' % args.csvfile)
        exit(-1)
    if os.path.exists(args.schema):
        schemafile = args.schema + '.json'
    else:
        schemafile = os.path.join(schemas_dir, args.schema + '.json')
    if not os.path.exists(schemafile):
        LOG.error("This schema file %s doesn't exist in current directory "
                  "or csv_schemas directory" % (args.schema + '.json'))
        exit(-1)
    try:
        schema = SchemaModel(schemafile, case_insensitive_headers=True)
        if 'account' not in schema.headers and args.accountname is None:
            LOG.error('schema headers: %s' % schema.headers)
            LOG.error('This schema does not have an account column '
                      'and no account name was provided')
            exit(-1)
        with open(schemafile, 'r') as sf:
            schemacontent = json.load(sf)
        try:
            setattr(schema, 'nheaders', schemacontent['nheaders'])
        except KeyError:
            setattr(schema, 'nheaders', 1)
        return schema
    except InvalidSchemaError as e:
        LOG.error('Invalid CSV schema %s' % e)
        exit(-1)
def do_csvimport(args, client=None):
    if client is None:
        client = clientfromargs(args)
    logger = get_logger(args)

    logger.debug('selected schema %s' % (args.schema,))
    if os.path.exists(args.schema):
        schemafile = args.schema
    else:
        schemafile = os.path.join(schemas_dir, args.schema + '.json')
    if not os.path.exists(schemafile):
        logger.error("This schema doesn't exist in csv_schemas")
        exit(-1)
    try:
        schema = SchemaModel(schemafile, case_insensitive_headers=True)
        with open(schemafile, 'r') as sf:
            schemacontent = json.load(sf)
        try:
            nheaders = schemacontent['nheaders']
        except KeyError:
            nheaders = 1
    except InvalidSchemaError:
        logger.error('Invalid CSV schema')
        raise
    logger.debug('schema headers %s' % schema.headers)

    if 'account' not in schema.headers and args.accountname is None:
        logger.error('This schema does not have an account column '
                     'and no account name was provided')
        exit(-1)

    accounts = {x.account_name: x for x in client.budget.be_accounts}
    payees = {p.name: p for p in client.budget.be_payees}
    mastercategories_perid = {m.id: m for m in client.budget.be_master_categories}
    subcategories = {}
    for s in client.budget.be_subcategories:
        m = mastercategories_perid[s.entities_master_category_id]
        subcategories[m.name + ':' + s.name] = s

    def getaccount(accountname):
        try:
            logger.debug('searching for account %s' % accountname)
            return accounts[accountname]
        except KeyError:
            logger.error("Couldn't find this account: %s" % accountname)
            exit(-1)

    def getpayee(payeename):
        try:
            logger.debug('searching for payee %s' % payeename)
            return payees[payeename]
        except KeyError:
            logger.debug("Couldn't find this payee: %s" % payeename)
            payee = Payee(name=payeename)
            client.budget.be_payees.append(payee)
            return payee

    def getsubcategory(categoryname):
        try:
            logger.debug('searching for subcategory %s' % categoryname)
            return subcategories[categoryname]
        except KeyError:
            logger.debug("Couldn't find this category: %s" % categoryname)
            exit(-1)

    entities_account_id = None
    if 'account' not in schema.headers:
        entities_account_id = getaccount(args.accountname).id

    amount = None
    if 'inflow' in schema.headers and 'outflow' in schema.headers:
        pass
    elif 'amount' in schema.headers:
        pass
    else:
        logger.error("This schema doesn't provide an amount column "
                     "or (inflow,outflow) columns")
        exit(-1)

    csvrow = namedtuple('CSVrow', field_names=schema.headers)
    transactions = []

    imported_date = datetime.now().date()

    logger.debug('OK starting the import from %s ' % os.path.abspath(args.csvfile))
    with open(args.csvfile, 'r') as inputfile:
        header = []
        for i in range(0, nheaders):
            header.append(inputfile.readline())
        for row in csv.reader(inputfile):
            if sys.version[0] == '2':
                row = [cell.decode('utf-8') for cell in row]
            if all(map(lambda x: x.strip() == '', row)):
                continue
            logger.debug('read line %s' % row)
            result = csvrow(*list(schema.convert_row(*row, fail_fast=True)))
            if 'account' in schema.headers:
                entities_account_id = getaccount(result.account).id
            if entities_account_id is None:
                logger.error('No account id, the account %s in the account '
                             'column was not recognized' % result.account)
                exit(-1)
            if 'inflow' in schema.headers and 'outflow' in schema.headers:
                amount = result.inflow - result.outflow
            elif 'amount' in schema.headers:
                amount = result.amount
            if 'category' in schema.headers and result.category:
                entities_subcategory_id = getsubcategory(result.category).id
            else:
                entities_subcategory_id = None
            if 'payee' in schema.headers:
                imported_payee = result.payee
            else:
                imported_payee = ''
            entities_payee_id = getpayee(imported_payee).id
            if 'memo' in schema.headers:
                memo = result.memo
            else:
                memo = ''
            transaction = Transaction(
                entities_account_id=entities_account_id,
                amount=amount,
                date=result.date,
                entities_payee_id=entities_payee_id,
                entities_subcategory_id=entities_subcategory_id,
                imported_date=imported_date,
                imported_payee=imported_payee,
                memo=memo,
                source="Imported"
            )
            if args.import_duplicates or (transaction not in client.budget.be_transactions):
                logger.debug('Appending transaction %s ' % transaction.get_dict())
                transactions.append(transaction)
            else:
                logger.debug('Duplicate transaction found %s ' % transaction.get_dict())

    client.add_transactions(transactions)
def pull_datapackage(descriptor, name, backend, **backend_options):
    """Pull Data Package from storage.

    All parameters should be used as keyword arguments.

    Args:
        descriptor (str): path where to store descriptor
        name (str): name of the pulled datapackage
        backend (str): backend name like `sql` or `bigquery`
        backend_options (dict): backend options mentioned in backend docs

    """
    # Save datapackage name
    datapackage_name = name

    # Get storage
    plugin = import_module('jsontableschema.plugins.%s' % backend)
    storage = plugin.Storage(**backend_options)

    # Iterate over tables
    resources = []
    for table in storage.tables:

        # Prepare
        schema = storage.describe(table)
        base = os.path.dirname(descriptor)
        path, name = mappers.restore_path(table)
        fullpath = os.path.join(base, path)

        # Write data
        helpers.ensure_dir(fullpath)
        with io.open(fullpath, 'wb') as file:
            model = SchemaModel(deepcopy(schema))
            data = storage.read(table)
            writer = csv.writer(file, encoding='utf-8')
            writer.writerow(model.headers)
            for row in data:
                writer.writerow(row)

        # Add resource
        resource = {'schema': schema, 'path': path}
        if name is not None:
            resource['name'] = name
        resources.append(resource)

    # Write descriptor
    mode = 'w'
    encoding = 'utf-8'
    if six.PY2:
        mode = 'wb'
        encoding = None
    resources = mappers.restore_resources(resources)
    helpers.ensure_dir(descriptor)
    with io.open(descriptor, mode=mode, encoding=encoding) as file:
        descriptor = {
            'name': datapackage_name,
            'resources': resources,
        }
        json.dump(descriptor, file, indent=4)

    return storage
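# Hypothetical usage, mirroring the docstring above (backend options depend on
# the chosen plugin and are illustrative only, not confirmed by the source):
# storage = pull_datapackage(descriptor='datapackage.json',
#                            name='mypkg',
#                            backend='sql',
#                            engine=engine)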
def __init__(self, schema):
    self.data = schema
    self.schema_model = SchemaModel(schema)
    self.fields = [SchemaField(f) for f in self.schema_model.fields]
    self.species_fields = self.find_species_fields(self)
def __init__(self, schema):
    self.schema_model = SchemaModel(schema)
    self.fields = [SchemaField(f) for f in self.schema_model.fields]
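# Self-contained sketch of what these wrapper constructors consume: SchemaModel
# exposes the parsed field descriptors via .fields, assumed here to be plain
# dicts, as the SchemaField(f) calls above suggest (schema content invented):
from jsontableschema.model import SchemaModel

model = SchemaModel({'fields': [{'name': 'species', 'type': 'string'},
                                {'name': 'count', 'type': 'integer'}]})
print([field['name'] for field in model.fields])  # ['species', 'count']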