Example #1
    def write(self, table, data):
        """Write data to table.

        Parameters
        ----------
        table: str
            Table name.
        data: list
            List of data tuples.

        """

        # Process data
        schema = self.describe(table)
        model = SchemaModel(schema)
        cdata = []
        for row in data:
            rdata = {}
            for index, field in enumerate(model.fields):
                value = row[index]
                try:
                    value = model.cast(field['name'], value)
                except InvalidObjectType as exception:
                    value = json.loads(value)
                rdata[field['name']] = value
            cdata.append(rdata)

        # Insert data
        dbtable = self.__get_dbtable(table)
        dbtable.insert().execute(cdata)
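
A minimal sketch of the casting step above, assuming the old jsontableschema-py API used throughout these examples (SchemaModel, model.fields, model.cast(field_name, value)); the schema and row here are made up for illustration:

    from jsontableschema.model import SchemaModel  # assumed import path

    # Hypothetical schema of the kind self.describe(table) might return
    schema = {'fields': [{'name': 'id', 'type': 'integer'},
                         {'name': 'name', 'type': 'string'}]}
    model = SchemaModel(schema)

    # Each tuple is cast field-by-field, mirroring the loop in write()
    row = ('1', 'english')
    rdata = {field['name']: model.cast(field['name'], value)
             for field, value in zip(model.fields, row)}
    print(rdata)  # e.g. {'id': 1, 'name': 'english'}, depending on library version
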
Example #2
 def __init__(self, infile, spec, orig_spec, validate=False, debug=False):
     self.spec = spec
     self.table_schema = SchemaModel(orig_spec['schema'])
     self.validate = validate
     self.infile = infile
     self.debug = debug
     self.stopped = False
Example #3
    def check_resource_schema(self, default_resource, resource):
        """Check that user resource schema contains all the mandatory fields"""

        def get_uncustomizable_fields(schema):
            uncustomizable = ['constraints', 'format', 'name', 'type']
            field_filter = lambda field: {key: val for key, val in field.items()
                                          if key in uncustomizable}
            fields = [field_filter(field) for field in schema.fields]
            fields = sorted(fields, key=lambda k: k['name'])
            return fields

        resource_schema = SchemaModel(resource.descriptor['schema'])
        default_schema_dict = default_resource.descriptor['schema']
        if default_resource.descriptor['name'] == 'source_file':
            for field in default_schema_dict['fields']:
                if field['name'] == 'data':
                    field['name'] = self.data_key
        default_schema = SchemaModel(default_schema_dict)

        if default_resource.descriptor['name'] in self.inflexible_resources:
            if get_uncustomizable_fields(default_schema) != \
               get_uncustomizable_fields(resource_schema):
                msg = ('The fields for "{0}" are not subject to '
                       'change').format(resource.local_data_path)
                raise ValueError(msg, resource.local_data_path)
        else:
            required_headers = set(default_schema.required_headers)
            resource_headers = set(resource_schema.headers)
            if not required_headers.issubset(resource_headers):
                missing_headers = required_headers.difference(resource_headers)
                msg = ('Fields [{0}] are needed for internal processing '
                       'but are missing from {1}.'
                       ).format(','.join(missing_headers), resource.local_data_path)
                raise ValueError(msg, resource.local_data_path)
Example #4
 def schema_validator(resource):
     schema = SchemaModel(resource.spec['schema'])
     for row in resource:
         for k, v in row.items():
             try:
                 schema.cast(k, v)
             except InvalidCastError:
                 logging.error('Bad value %r for field %s', v, k)
                 raise
         yield row
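
A hedged usage sketch for schema_validator; FakeResource is a made-up stand-in for the resource object (anything with a spec attribute that iterates over row dicts):

    # Hypothetical driver for the generator above
    class FakeResource:
        spec = {'schema': {'fields': [{'name': 'id', 'type': 'integer'}]}}
        def __iter__(self):
            return iter([{'id': '1'}, {'id': '2'}])

    for row in schema_validator(FakeResource()):
        print(row)  # rows pass through unchanged once every value casts cleanly
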
Example #5
class Schema(API):
    """Processor to add types to row.

    Parameters
    ----------
    schema: str/dict
        Schema as in https://github.com/okfn/jsontableschema-py#model.
        If schema is None, the processor will cast values using type detection.

    """

    # Public

    def __init__(self, schema=None):
        self.__schema = None
        if schema is not None:
            self.__schema = SchemaModel(schema)

    def process(self, iterator):
        if self.__schema is None:
            values = []
            for value in iterator.values:
                value = helpers.parse_value(value)
                values.append(value)
            iterator.values = tuple(values)
        else:
            values = self.__schema.convert_row(*iterator.values)
            iterator.values = tuple(values)

    def handle(self, iterator):
        pass  # pragma: no cover
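
A minimal sketch of driving this processor, assuming only what the code above relies on: an iterator object exposing a mutable values tuple (StubIterator is hypothetical):

    class StubIterator:
        def __init__(self, values):
            self.values = values

    processor = Schema({'fields': [{'name': 'id', 'type': 'integer'},
                                   {'name': 'score', 'type': 'number'}]})
    it = StubIterator(('42', '3.14'))
    processor.process(it)
    print(it.values)  # values cast per the schema, e.g. (42, Decimal('3.14'))
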
Example #6
def export_package(storage, descriptor, datapackage_name):
    """Export Data Package from storage.

    Parameters
    ----------
    storage: object
        Storage object.
    descriptor: str
        Path where to store descriptor.
    datapackage_name: str
        Name of the exported datapackage.

    """

    # Iterate over tables
    resources = []
    mapping = {}
    for table in storage.tables:

        # Prepare
        schema = storage.describe(table)
        base = os.path.dirname(descriptor)
        path, name = _restore_path(table)
        fullpath = os.path.join(base, path)
        if name is not None:
            mapping[table] = name

        # Write data
        _ensure_dir(fullpath)
        with io.open(fullpath,
                     mode=_write_mode,
                     newline=_write_newline,
                     encoding=_write_encoding) as file:
            model = SchemaModel(deepcopy(schema))
            data = storage.read(table)
            writer = csv.writer(file)
            writer.writerow(model.headers)
            for row in data:
                writer.writerow(row)

        # Add resource
        resource = {'schema': schema, 'path': path}
        if name is not None:
            resource['name'] = name
        resources.append(resource)

    # Write descriptor
    resources = _restore_resources(mapping, resources)
    _ensure_dir(descriptor)
    with io.open(descriptor, mode=_write_mode,
                 encoding=_write_encoding) as file:
        descriptor = {
            'name': datapackage_name,
            'resources': resources,
        }
        json.dump(descriptor, file, indent=4)
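
A hedged call sketch for export_package; MemoryStorage is a made-up stand-in for the storage interface the function actually touches (.tables, .describe(), .read()), and the snippet's module-level helpers (_restore_path, _ensure_dir, and friends) are assumed to be in scope:

    class MemoryStorage:
        # Minimal stand-in: one table with one integer field
        tables = ['test']
        def describe(self, table):
            return {'fields': [{'name': 'id', 'type': 'integer'}]}
        def read(self, table):
            return [(1,), (2,)]

    export_package(MemoryStorage(), descriptor='out/datapackage.json',
                   datapackage_name='mypackage')
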
Example #7
class ResourceIterator(object):
    def __init__(self, infile, spec, orig_spec, validate=False, debug=False):
        self.spec = spec
        self.table_schema = SchemaModel(orig_spec['schema'])
        self.validate = validate
        self.infile = infile
        self.debug = debug
        self.stopped = False

    def __iter__(self):
        return self

    def __next__(self):
        if self.stopped:
            raise StopIteration()
        if self.debug:
            logging.error('WAITING')
        line = self.infile.readline().strip()
        if self.debug:
            logging.error('INGESTING: %r', line)
        if line == '':
            self.stopped = True
            raise StopIteration()
        line = json.loads(line)
        if self.validate:
            for k, v in line.items():
                try:
                    self.table_schema.cast(k, v)
                except (InvalidCastError, TypeError):
                    field = self.table_schema.get_field(k)
                    if field is None:
                        raise ValueError(
                            'Validation failed: No such field %s' % k)
                    else:
                        raise ValueError(
                            'Validation failed: Bad value %r '
                            'for field %s with type %s'
                            % (v, k, field.get('type')))

        return line

    def next(self):
        return self.__next__()
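
A usage sketch for ResourceIterator, assuming only the behavior shown above (newline-delimited JSON, a blank line ending iteration); io.StringIO stands in for the real input stream:

    import io
    import json

    spec = {'schema': {'fields': [{'name': 'id', 'type': 'integer'}]}}
    infile = io.StringIO(json.dumps({'id': 1}) + '\n')  # no second line, so iteration stops
    for line in ResourceIterator(infile, spec, spec, validate=True):
        print(line)  # {'id': 1}
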
Example #8
 def __init__(self, schema):
     self.data = schema
     self.schema_model = SchemaModel(schema)
     self.fields = [SchemaField(f) for f in self.schema_model.fields]
     self.species_fields = self.find_species_fields(self)
Example #9
def assert_conforms_to_schema(schema, doc):
    assert isinstance(doc, dict), "invalid doc: {}".format(doc)
    row = [doc[field["name"]] for field in schema["fields"]]
    try:
        Schema(schema).cast_row(row)
    except Exception as e:
        logging.exception(e)
        raise Exception(
            "row does not conform to schema\nrow='{}'\nschema='{}'".format(
                json.dumps(row), json.dumps(schema)))
    schema_model = SchemaModel(schema)
    res = {}
    for k, v in doc.items():
        try:
            res[k] = schema_model.cast(k, v)
        except Exception as e:
            logging.exception(e)
            raise Exception("doc attribute '{}' with value '{}' "
                            "does not conform to schema '{}'".format(
                                *map(json.dumps, [k, v, schema])))
    return res
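
A usage sketch with a made-up schema/doc pair; on success the function returns the doc with values cast to native types, otherwise it raises with a verbose message:

    schema = {'fields': [{'name': 'year', 'type': 'integer'}]}
    print(assert_conforms_to_schema(schema, {'year': '1999'}))  # e.g. {'year': 1999}
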
Example #10
    def update_sources_period(self, new_sources):
        """Overwrite source_file with the identified period_id"""

        source_resource = utilities.get_datapackage_resource(self.source_file,
                                                             self.datapackage)
        source_idx = self.datapackage.resources.index(source_resource)
        source_schema_dict = self.datapackage.resources[source_idx].descriptor['schema']
        updates = {'fields': [{'name': 'period_id', 'type': 'string',
                               'title': 'The period source data is relevant for.'}]}
        utilities.deep_update_dict(source_schema_dict, updates)
        source_schema = SchemaModel(source_schema_dict)

        with compat.UnicodeWriter(self.source_file) as source_file:
            source_file.writerow(source_schema.headers)
            for row in utilities.dicts_to_schema_rows(new_sources,
                                                      source_schema):
                source_file.writerow(row)
Example #11
def verify_csvimport(args):
    if not os.path.exists(args.csvfile):
        LOG.error('input CSV file %s does not exist' % args.csvfile)
        exit(-1)

    if os.path.exists(args.schema):
        schemafile = args.schema
    else:
        schemafile = os.path.join(schemas_dir, args.schema + '.json')
    if not os.path.exists(schemafile):
        LOG.error("This schema file %s doesn't exist in the current "
                  "directory or the csv_schemas directory" %
                  (args.schema + '.json'))
        exit(-1)

    try:
        schema = SchemaModel(schemafile, case_insensitive_headers=True)

        if 'account' not in schema.headers and args.accountname is None:
            LOG.error('schema headers: %s' % schema.headers)
            LOG.error(
                'This schema does not have an account column and no account name was provided'
            )
            exit(-1)

        with open(schemafile, 'r') as sf:
            schemacontent = json.load(sf)
            try:
                setattr(schema, 'nheaders', schemacontent['nheaders'])
            except KeyError:
                setattr(schema, 'nheaders', 1)

        return schema

    except InvalidSchemaError as e:
        LOG.error('Invalid CSV schema %s' % e)
        exit(-1)
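
A hedged invocation sketch for verify_csvimport; args is whatever the CLI's argparse produces, faked here with SimpleNamespace, and the file paths are hypothetical:

    from types import SimpleNamespace

    args = SimpleNamespace(csvfile='transactions.csv',  # hypothetical input
                           schema='ynab',               # resolved against csv_schemas
                           accountname='Checking')
    schema = verify_csvimport(args)  # exits on missing files or a bad schema
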
Example #12
def do_csvimport(args, client=None):
    if client is None:
        client = clientfromargs(args)
    logger = get_logger(args)

    logger.debug('selected schema %s' % (args.schema,))
    if os.path.exists(args.schema):
        schemafile = args.schema
    else:
        schemafile = os.path.join(schemas_dir, args.schema + '.json')
        if not os.path.exists(schemafile):
            logger.error("This schema doesn't exist in csv_schemas")
            exit(-1)
    try:
        schema = SchemaModel(schemafile, case_insensitive_headers=True)
        with open(schemafile, 'r') as sf:
            schemacontent = json.load(sf)
            try:
                nheaders = schemacontent['nheaders']
            except KeyError:
                nheaders = 1
    except InvalidSchemaError:
        logger.error('Invalid CSV schema')
        raise
    logger.debug('schema headers %s' % schema.headers)

    if 'account' not in schema.headers and args.accountname is None:
        logger.error('This schema does not have an account column and no account name was provided')
        exit(-1)

    accounts = {x.account_name: x for x in client.budget.be_accounts}
    payees = {p.name: p for p in client.budget.be_payees}
    mastercategories_perid = {m.id: m for m in client.budget.be_master_categories}
    subcategories = {}
    for s in client.budget.be_subcategories:
        m = mastercategories_perid[s.entities_master_category_id]
        subcategories[m.name + ':' + s.name] = s

    def getaccount(accountname):
        try:
            logger.debug('searching for account %s' % accountname)
            return accounts[accountname]
        except KeyError:
            logger.error("Couldn't find this account: %s" % accountname)
            exit(-1)

    def getpayee(payeename):
        try:
            logger.debug('searching for payee %s' % payeename)
            return payees[payeename]
        except KeyError:
            logger.debug("Couldn't find this payee: %s" % payeename)
            payee = Payee(name=payeename)
            client.budget.be_payees.append(payee)
            return payee

    def getsubcategory(categoryname):
        try:
            logger.debug('searching for subcategory %s' % categoryname)
            return subcategories[categoryname]
        except KeyError:
            logger.debug("Couldn't find this category: %s" % categoryname)
            exit(-1)

    entities_account_id = None
    if 'account' not in schema.headers:
        entities_account_id = getaccount(args.accountname).id

    amount = None
    if 'inflow' in schema.headers and 'outflow' in schema.headers:
        pass
    elif 'amount' in schema.headers:
        pass
    else:
        logger.error("This schema doesn't provide an amount column or (inflow,outflow) columns")
        exit(-1)

    csvrow = namedtuple('CSVrow', field_names=schema.headers)
    transactions = []

    imported_date = datetime.now().date()

    logger.debug('OK starting the import from %s ' % os.path.abspath(args.csvfile))
    with open(args.csvfile, 'r') as inputfile:
        header = []
        for i in range(0, nheaders):
            header.append(inputfile.readline())
        for row in csv.reader(inputfile):
            if sys.version[0] == '2':
                row = [cell.decode('utf-8') for cell in row]
            if all(map(lambda x: x.strip() == '', row)):
                continue
            logger.debug('read line %s' % row)
            result = csvrow(*list(schema.convert_row(*row, fail_fast=True)))
            if 'account' in schema.headers:
                entities_account_id = getaccount(result.account).id
            if entities_account_id is None:
                logger.error(
                    'No account id, the account %s in the account column was not recognized' % result.account)
                exit(-1)
            if 'inflow' in schema.headers and 'outflow' in schema.headers:
                amount = result.inflow - result.outflow
            elif 'amount' in schema.headers:
                amount = result.amount

            if 'category' in schema.headers and result.category:
                entities_subcategory_id = getsubcategory(result.category).id
            else:
                entities_subcategory_id = None
            if 'payee' in schema.headers:
                imported_payee = result.payee
            else:
                imported_payee = ''
            entities_payee_id = getpayee(imported_payee).id
            if 'memo' in schema.headers:
                memo = result.memo
            else:
                memo = ''

            transaction = Transaction(
                entities_account_id=entities_account_id,
                amount=amount,
                date=result.date,
                entities_payee_id=entities_payee_id,
                entities_subcategory_id=entities_subcategory_id,
                imported_date=imported_date,
                imported_payee=imported_payee,
                memo=memo,
                source="Imported"
            )
            if args.import_duplicates or (transaction not in client.budget.be_transactions):
                logger.debug('Appending transaction %s ' % transaction.get_dict())
                transactions.append(transaction)
            else:
                logger.debug('Duplicate transaction found %s ' % transaction.get_dict())

    client.add_transactions(transactions)
Example #13
 def __init__(self, schema=None):
     self.__schema = None
     if schema is not None:
         self.__schema = SchemaModel(schema)
Example #14
class SchemaField:
    """
    Utility class for a field in a schema.
    It uses the schema types of
    https://github.com/frictionlessdata/jsontableschema-py#types
    for validation.
    """
    # For most of the types we use the jsontableschema ones
    # TODO: SchemaModel is deprecated in favor of
    # jsontableschema.schema.Schema but there's no _type_map!
    BASE_TYPE_MAP = SchemaModel._type_map()
    # except for anything date.
    BASE_TYPE_MAP['date'] = DayFirstDateType
    BASE_TYPE_MAP['datetime'] = DayFirstDateTimeType
    # and string
    BASE_TYPE_MAP['string'] = NotBlankStringType

    WL_TYPE_MAP = {}

    def __init__(self, data):
        self.data = data
        self.name = self.data.get('name')
        # We want to throw an exception if there is no name
        if not self.name:
            raise FieldSchemaError("A field without a name: {}".format(
                json.dumps(data)))
        # wl specific
        self.wl = WLSchema(self.data.get('wl'))
        # set the type: wl type as precedence
        type_class = self.WL_TYPE_MAP.get(
            self.wl.type) or self.BASE_TYPE_MAP.get(self.data.get('type'))
        self.type = type_class(self.data)
        self.constraints = SchemaConstraints(self.data.get('constraints', {}))

    # implement some dict like methods
    def __getitem__(self, item):
        return self.data.__getitem__(item)

    def get(self, k, d=None):
        return self.data.get(k, d)

    @property
    def title(self):
        return self.data.get('title')

    @property
    def column_name(self):
        return self.name

    @property
    def required(self):
        return self.constraints.required

    @property
    def is_species(self):
        return self.wl.is_species_type()

    @property
    def species_type(self):
        result = None
        if self.is_species:
            return self.wl.species_type or 'all'
        return result

    def cast(self, value):
        """
        Returns a native Python object of the expected format. Will throw an exception
        if the value doesn't comply with the constraints. See for details:
        https://github.com/frictionlessdata/jsontableschema-py#types
        This method is mainly a helper for the validation_error
        :param value:
        :return:
        """
        if isinstance(value, six.string_types) and not isinstance(
                value, six.text_type):
            # the StringType accepts only unicode
            value = six.u(value)
        elif isinstance(value, six.integer_types):
            value = '{}'.format(value)
        return self.type.cast(value)

    def validate(self, value):
        return self.validation_error(value)

    def validation_error(self, value):
        """
        Return an error message if the value is not valid according to the schema.
        It relies on the exception thrown by the 'cast' method of the Type class.
        :param value:
        :return: None if value is valid or an error message string
        """
        error = None
        # override the integer validation. The default message is a bit cryptic if there's an error casting a string
        # like '1.2' into an int.
        if isinstance(self.type, types.IntegerType):
            if not is_blank_value(value):
                not_integer = False
                try:
                    casted = self.cast(value)
                    # there's also the case where a float like 1.2 is
                    # successfully cast to 1 (e.g. int(1.2) == 1)
                    if str(casted) != str(value):
                        not_integer = True
                except Exception:
                    not_integer = True
                if not_integer:
                    return 'The field "{}" must be a whole number.'.format(
                        self.name)
        try:
            self.cast(value)
        except Exception as e:
            error = "{}".format(e)
            # Override the default enum exception message to include all
            # possible values
            if error.find('enum array') != -1 and self.constraints.enum:
                values = [str(v) for v in self.constraints.enum]
                error = "The value must be one of the following: {}".format(
                    values)
        return error

    def __str__(self):
        return '{}'.format(self.name)
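
The _type_map() call above leans on a private SchemaModel API, also used in Examples #16 and #18. A minimal sketch of what it yields, under the same assumption those snippets make: each type class is constructed from a field descriptor and exposes cast():

    from jsontableschema.model import SchemaModel  # assumed import path

    type_map = SchemaModel._type_map()  # e.g. {'integer': IntegerType, ...}
    int_type = type_map['integer']({'name': 'n', 'type': 'integer'})
    print(int_type.cast('42'))  # 42
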
Example #15
def do_csvimport(args, client=None):
    if client is None:
        client = clientfromargs(args)
    logger = get_logger(args)

    logger.debug('selected schema %s' % (args.schema,))
    if os.path.exists(args.schema):
        schemafile = args.schema
    else:
        schemafile = os.path.join(schemas_dir, args.schema + '.json')
        if not os.path.exists(schemafile):
            logger.error("This schema doesn't exist in csv_schemas")
            exit(-1)
    try:
        schema = SchemaModel(schemafile, case_insensitive_headers=True)
        with open(schemafile,'r') as sf:
            schemacontent = json.load(sf)
            try:
                nheaders = schemacontent['nheaders']
            except KeyError:
                nheaders = 1
    except InvalidSchemaError:
        logger.error('Invalid CSV schema')
        raise
    logger.debug('schema headers %s' % schema.headers)

    if 'account' not in schema.headers and args.accountname is None:
        logger.error('This schema does not have an account column and no account name was provided')
        exit(-1)

    accounts = {x.account_name: x for x in client.budget.be_accounts}
    payees = {p.name: p for p in client.budget.be_payees}
    mastercategories_perid = {m.id: m for m in client.budget.be_master_categories}
    subcategories = {}
    for s in client.budget.be_subcategories:
        m = mastercategories_perid[s.entities_master_category_id]
        subcategories[m.name + ':' + s.name] = s

    def getaccount(accountname):
        try:
            logger.debug('searching for account %s' % accountname)
            return accounts[accountname]
        except KeyError:
            logger.error("Couldn't find this account: %s" % accountname)
            exit(-1)

    def getpayee(payeename):
        try:
            logger.debug('searching for payee %s' % payeename)
            return payees[payeename]
        except KeyError:
            logger.debug("Couldn't find this payee: %s" % payeename)
            payee = Payee(name=payeename)
            client.budget.be_payees.append(payee)
            return payee

    def getsubcategory(categoryname):
        try:
            logger.debug('searching for subcategory %s' % categoryname)
            return subcategories[categoryname]
        except KeyError:
            get_logger(args).debug("Couldn't find this category: %s" % categoryname)
            exit(-1)

    if 'account' not in schema.headers:
        entities_account_id = getaccount(args.accountname).id

    if 'inflow' in schema.headers and 'outflow' in schema.headers:
        pass
    elif 'amount' in schema.headers:
        pass
    else:
        logger.error("This schema doesn't provide an amount column or (inflow,outflow) columns")
        exit(-1)

    csvrow = namedtuple('CSVrow', field_names=schema.headers)
    transactions = []

    imported_date = datetime.now().date()

    get_logger(args).debug('OK starting the import from %s ' % os.path.abspath(args.csvfile))
    with open(args.csvfile, 'r') as inputfile:
        header = inputfile.readline()
        for row in csv.reader(inputfile):
            if sys.version[0] == '2':
                row = [cell.decode('utf-8') for cell in row]
            get_logger(args).debug('read line %s' % row)
            result = csvrow(*list(schema.convert_row(*row, fail_fast=True)))
            if 'account' in schema.headers:
                entities_account_id = getaccount(result.account).id
            if 'inflow' in schema.headers and 'outflow' in schema.headers:
                amount = result.inflow - result.outflow
            elif 'amount' in schema.headers:
                amount = result.amount
            else:
                get_logger(args).error("This schema doesn't provide an amount column or (inflow,outflow) columns")
                exit(-1)

            if 'category' in schema.headers and result.category:
                entities_subcategory_id = getsubcategory(result.category).id
            else:
                entities_subcategory_id = None
            if 'payee' in schema.headers:
                imported_payee = result.payee
            else:
                imported_payee = ''
            entities_payee_id = getpayee(imported_payee).id
            if 'memo' in schema.headers:
                memo = result.memo
            else:
                memo = ''

            transaction = Transaction(
                entities_account_id=entities_account_id,
                amount=amount,
                date=result.date,
                entities_payee_id=entities_payee_id,
                entities_subcategory_id=entities_subcategory_id,
                imported_date=imported_date,
                imported_payee=imported_payee,
                memo=memo,
                source="Imported"
            )
            if args.import_duplicates or (not client.budget.be_transactions.containsduplicate(transaction)):
                get_logger(args).debug('Appending transaction %s ' % transaction.getdict())
                transactions.append(transaction)
            else:
                get_logger(args).debug('Duplicate transaction found %s ' % transaction.getdict())

    client.add_transactions(transactions)
Example #16
 def __init__(self, data):
     self.data = data
     self.name = data["name"]  # We want to throw an exception if there is no name
     # use of jsontableschema.types to help constraint validation
     self.type = SchemaModel._type_map()[data.get("type")](data)
Example #17
 def __init__(self, schema):
     self.schema_model = SchemaModel(schema)
     self.fields = [SchemaField(f) for f in self.schema_model.fields]
Example #18
 def __init__(self, data):
     self.data = data
     self.name = data['name']  # We want to throw an exception if there is no name
     # use of jsontableschema.types to help constraint validation
     self.type = SchemaModel._type_map()[data.get('type')](data)
Example #19
 def __init__(self, schema):
     self.data = schema
     self.schema_model = SchemaModel(schema)
     self.fields = [SchemaField(f) for f in self.schema_model.fields]
     self.species_fields = self.find_species_fields(self)
Example #20
def pull_datapackage(descriptor, name, backend, **backend_options):
    """Pull Data Package from storage.

    All parameters should be used as keyword arguments.

    Args:
        descriptor (str): path where to store descriptor
        name (str): name of the pulled datapackage
        backend (str): backend name like `sql` or `bigquery`
        backend_options (dict): backend options mentioned in backend docs

    """

    # Save datapackage name
    datapackage_name = name

    # Get storage
    plugin = import_module('jsontableschema.plugins.%s' % backend)
    storage = plugin.Storage(**backend_options)

    # Iterate over tables
    resources = []
    for table in storage.tables:

        # Prepare
        schema = storage.describe(table)
        base = os.path.dirname(descriptor)
        path, name = mappers.restore_path(table)
        fullpath = os.path.join(base, path)

        # Write data
        helpers.ensure_dir(fullpath)
        with io.open(fullpath, 'wb') as file:
            model = SchemaModel(deepcopy(schema))
            data = storage.read(table)
            writer = csv.writer(file, encoding='utf-8')
            writer.writerow(model.headers)
            for row in data:
                writer.writerow(row)

        # Add resource
        resource = {'schema': schema, 'path': path}
        if name is not None:
            resource['name'] = name
        resources.append(resource)

    # Write descriptor
    mode = 'w'
    encoding = 'utf-8'
    if six.PY2:
        mode = 'wb'
        encoding = None
    resources = mappers.restore_resources(resources)
    helpers.ensure_dir(descriptor)
    with io.open(descriptor,
                 mode=mode,
                 encoding=encoding) as file:
        descriptor = {
            'name': datapackage_name,
            'resources': resources,
        }
        json.dump(descriptor, file, indent=4)
    return storage
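
A hedged call sketch for pull_datapackage; the 'sql' backend and its engine option are illustrative and must match an installed jsontableschema.plugins.<backend> module:

    from sqlalchemy import create_engine  # assumed dependency of the sql backend

    storage = pull_datapackage(
        descriptor='datapackage.json',
        name='mypackage',
        backend='sql',  # resolved as jsontableschema.plugins.sql
        engine=create_engine('sqlite:///data.db'),  # hypothetical backend option
    )
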
Example #21
def convert_data(schema, data):
    result = []
    model = SchemaModel(schema)
    for item in data:
        result.append(tuple(model.convert_row(*item)))
    return result
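
A usage sketch for convert_data; convert_row() casts each tuple positionally against the schema, and the exact native types depend on the library version:

    schema = {'fields': [{'name': 'id', 'type': 'integer'},
                         {'name': 'name', 'type': 'string'}]}
    print(convert_data(schema, [('1', 'english'), ('2', 'french')]))
    # e.g. [(1, 'english'), (2, 'french')]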