Example #1
# Assumed imports for this excerpt; get_sheet_files() and sheet_iterator()
# are module-local helpers defined elsewhere in the source.
from openpyxl import load_workbook
from tabulator import Stream


def load_sheets():
    schema = {'fields': []}
    sheets = []
    for sheet_file in get_sheet_files():
        filename = 'data/search_import/{}'.format(sheet_file['name'])
        wb = load_workbook(filename)
        for sheet_number, sheet_name in enumerate(wb.sheetnames, start=1):
            stream = Stream(filename, sheet=sheet_name)
            stream.open()
            stream_iter = stream.iter()
            # Assumes every sheet has at least one (header) row.
            first_row = next(stream_iter)
            # Sheets after the first that lack a 'migdar_id' column reuse
            # the header row captured from the first sheet.
            if 'migdar_id' not in first_row and sheet_number > 1:
                header_row = first_sheet_header_row
            else:
                header_row = first_row
                first_row = None
                if sheet_number == 1:
                    first_sheet_header_row = header_row
            for k in header_row:
                if k and k not in [f['name'] for f in schema['fields']]:
                    field_type = 'string'
                    schema['fields'].append({'name': k, 'type': field_type})
            sheets.append({
                'iterator': sheet_iterator(first_row, header_row, stream_iter,
                                           filename, sheet_name, stream),
                'deleted': 'deleted' in sheet_name.strip().lower(),
            })
    return schema, sheets
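
A minimal consumption sketch for the loader above (a sketch only: it assumes the module-local helpers are available and the data files exist):

schema, sheets = load_sheets()
print('inferred {} fields'.format(len(schema['fields'])))
for sheet in sheets:
    if not sheet['deleted']:
        for row in sheet['iterator']:
            pass  # process each row produced by sheet_iterator()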
Example #2
# Assumed imports for this excerpt; get_migdar_session() and sheet_iterator()
# are module-local helpers defined elsewhere in the source.
import tempfile

from openpyxl import load_workbook
from tabulator import Stream


def load_from_gdrive_files(rows):
    if rows.res.name == 'search_import_index':
        for row_index, row in enumerate(rows, start=1):
            file_url = f"https://migdar-internal-search.odata.org.il/__data/search_import/{row['name']}"
            print(file_url)
            with tempfile.NamedTemporaryFile('w+b',
                                             suffix='.xlsx') as temp_file:
                with get_migdar_session().get(file_url,
                                              stream=True) as response:
                    # Stream the download in reasonably sized chunks
                    # (requests defaults to 1-byte chunks).
                    for chunk in response.iter_content(chunk_size=8192):
                        temp_file.write(chunk)
                temp_file.flush()
                wb = load_workbook(temp_file.name)
                for sheet_number, sheet_name in enumerate(wb.sheetnames,
                                                          start=1):
                    if 'deleted' in sheet_name.strip().lower():
                        continue
                    stream = Stream(temp_file.name, sheet=sheet_name)
                    stream.open()
                    print('#{}.{}/{}: loading sheet'.format(
                        row_index, row['name'], sheet_name))
                    stream_iter = stream.iter()
                    try:
                        first_row = next(stream_iter)
                    except StopIteration:
                        first_row = None
                    if first_row:
                        if 'migdar_id' not in first_row and sheet_number > 1:
                            header_row = first_sheet_header_row
                        else:
                            header_row = first_row
                            first_row = None
                            if sheet_number == 1:
                                first_sheet_header_row = header_row
                        yield from sheet_iterator(first_row, header_row,
                                                  stream_iter, row['name'],
                                                  sheet_name, stream)
                    else:
                        # Drain empty sheets without clobbering the outer
                        # `row` loop variable.
                        for _ in stream_iter:
                            pass
    else:
        yield from rows
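
A rough pass-through check for this processor. The object carrying `rows.res.name` comes from the surrounding pipeline framework; `FakeResource` below is a purely hypothetical stand-in for it:

class FakeResource:
    def __init__(self, name, rows):
        self.res = type('Res', (), {'name': name})()
        self._rows = rows

    def __iter__(self):
        return iter(self._rows)

# Resources other than 'search_import_index' are passed through unchanged:
other = FakeResource('some_other_resource', [{'a': 1}])
assert list(load_from_gdrive_files(other)) == [{'a': 1}]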
Example #3
# This Table class mirrors tableschema-py (see the docstring links); the
# excerpt relies on that module's imports (copy, partial, defaultdict,
# OrderedDict, Stream, Schema, Storage, exceptions) and on private helpers
# (_create_unique_fields_cache, _resolve_relations) defined alongside.
class Table(object):

    # Public

    def __init__(self,
                 source,
                 schema=None,
                 strict=False,
                 post_cast=[],
                 storage=None,
                 **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Set attributes
        self.__source = source
        self.__stream = None
        self.__schema = None
        self.__headers = None
        self.__storage = None
        self.__post_cast = copy(post_cast)

        # Schema
        if isinstance(schema, Schema):
            self.__schema = schema
        elif schema is not None:
            self.__schema = Schema(schema)

        # Stream (tabulator)
        if storage is None:
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)

        # Stream (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            if self.__schema:
                storage.describe(source, self.__schema.descriptor)
            headers = Schema(storage.describe(source)).field_names
            self.__stream = Stream(partial(storage.iter, source),
                                   headers=headers)
            self.__storage = storage

    @property
    def headers(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__headers

    @property
    def schema(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__schema

    def iter(self,
             keyed=False,
             extended=False,
             cast=True,
             relations=False,
             foreign_keys_values=False):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Prepare unique checks
        if cast:
            unique_fields_cache = {}
            if self.schema:
                unique_fields_cache = _create_unique_fields_cache(self.schema)
        # Prepare relation checks
        if relations and not foreign_keys_values:
            # we have to test relations but the index has not been precomputed
            # prepare the index to boost validation process
            foreign_keys_values = self.index_foreign_keys_values(relations)

        # Open/iterate stream
        self.__stream.open()
        iterator = self.__stream.iter(extended=True)
        iterator = self.__apply_processors(iterator, cast=cast)
        for row_number, headers, row in iterator:

            # Get headers
            if not self.__headers:
                self.__headers = headers

            # Check headers
            if cast:
                if self.schema and self.headers:
                    if self.headers != self.schema.field_names:
                        self.__stream.close()
                        message = 'Table headers don\'t match schema field names'
                        raise exceptions.CastError(message)

            # Check unique
            if cast:
                for indexes, cache in unique_fields_cache.items():
                    values = tuple(value for i, value in enumerate(row)
                                   if i in indexes)
                    if not all(map(lambda value: value is None, values)):
                        if values in cache['data']:
                            self.__stream.close()
                            message = 'Field(s) "%s" duplicates in row "%s"'
                            message = message % (cache['name'], row_number)
                            raise exceptions.CastError(message)
                        cache['data'].add(values)

            # Resolve relations
            if relations:
                if self.schema:
                    row_with_relations = dict(zip(headers, copy(row)))
                    for foreign_key in self.schema.foreign_keys:
                        refValue = _resolve_relations(row, headers,
                                                      foreign_keys_values,
                                                      foreign_key)
                        if refValue is None:
                            self.__stream.close()
                            keyed_row = OrderedDict(zip(headers, row))
                            # local values of the FK
                            local_values = tuple(
                                keyed_row[f] for f in foreign_key['fields'])
                            message = 'Foreign key "%s" violation in row "%s": %s not found in %s'
                            message = message % (
                                foreign_key['fields'], row_number,
                                local_values,
                                foreign_key['reference']['resource'])
                            raise exceptions.RelationError(message)
                        elif type(refValue) is dict:
                            for field in foreign_key['fields']:
                                if type(row_with_relations[field]) is not dict:
                                    # no previous refValues injected on this field
                                    row_with_relations[field] = refValue
                                else:
                                    # already one ref on this field; merge them
                                    row_with_relations[field].update(refValue)
                        else:
                            # case when all original values of the FK are empty:
                            # refValue == row and there is nothing to do
                            # (an empty dict might be a better return value here)
                            pass

                    # mutate row now that we are done, in the right order
                    row = [row_with_relations[f] for f in headers]

            # Form row
            if extended:
                yield (row_number, headers, row)
            elif keyed:
                yield dict(zip(headers, row))
            else:
                yield row

        # Close stream
        self.__stream.close()

    def read(self,
             keyed=False,
             extended=False,
             cast=True,
             relations=False,
             limit=None,
             foreign_keys_values=False):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        result = []
        rows = self.iter(keyed=keyed,
                         extended=extended,
                         cast=cast,
                         relations=relations,
                         foreign_keys_values=foreign_keys_values)
        for count, row in enumerate(rows, start=1):
            result.append(row)
            if count == limit:
                break
        return result

    def infer(self, limit=100, confidence=0.75):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        if self.__schema is None or self.__headers is None:

            # Infer (tabulator)
            if not self.__storage:
                with self.__stream as stream:
                    if self.__schema is None:
                        self.__schema = Schema()
                        self.__schema.infer(stream.sample[:limit],
                                            headers=stream.headers,
                                            confidence=confidence)
                    if self.__headers is None:
                        self.__headers = stream.headers

            # Infer (storage)
            else:
                descriptor = self.__storage.describe(self.__source)
                if self.__schema is None:
                    self.__schema = Schema(descriptor)
                if self.__headers is None:
                    self.__headers = self.__schema.field_names

        return self.__schema.descriptor

    def save(self, target, storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Save (tabulator)
        if storage is None:
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)
            return True

        # Save (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            storage.create(target, self.__schema.descriptor, force=True)
            storage.write(target, self.iter(cast=False))
            return storage

    def index_foreign_keys_values(self, relations):
        # We don't need to load the complete reference table to test relations;
        # we can lower the payload AND optimize foreign key testing by
        # preparing the right index based on the foreign key definition.
        # The index maps, per reference and foreign-key fields, every possible
        # value combination in the foreign table to one keyed row:
        # foreign_keys[reference][fk_fields_tuple] = {(fk_values,): keyed_row, ...}
        foreign_keys = defaultdict(dict)
        if self.schema:
            for fk in self.schema.foreign_keys:
                # load relation data
                relation = fk['reference']['resource']

                # To optimize, we prepare an index of existing values.
                # The index is keyed by reference + foreign-key fields,
                # because many foreign keys may use the same reference.
                fk_fields = tuple(fk['reference']['fields'])
                foreign_keys[relation][fk_fields] = {}
                for row in relations[relation]:
                    key = tuple(row[field] for field in fk_fields)
                    # We have to choose which matching row to pick; the
                    # previous implementation picked the first, so be it.
                    if key not in foreign_keys[relation][fk_fields]:
                        foreign_keys[relation][fk_fields][key] = row
        return foreign_keys

    # Private

    def __apply_processors(self, iterator, cast=True):

        # Apply processors to iterator
        def builtin_processor(extended_rows):
            for row_number, headers, row in extended_rows:
                if self.__schema and cast:
                    row = self.__schema.cast_row(row)
                yield (row_number, headers, row)

        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)

        return iterator
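
A short usage sketch for this Table variant, assuming a local data.csv with a header row; infer() fills in the schema before reading:

table = Table('data.csv')
table.infer()                  # builds table.schema from a sample of the data
rows = table.read(keyed=True)  # rows are cast against the inferred schema
print(table.headers, len(rows))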
Example #4
class Table(object):
    """Table Schema table representation.

    Args:
        source (mixed): data source
        schema (Schema/dict/str): schema instance or descriptor/path/url
        backend (None/str): backend name like `sql` or `bigquery`
        options (dict): tabulator options or backend options

    """

    # Public

    def __init__(self,
                 source,
                 schema=None,
                 post_cast=None,
                 backend=None,
                 **options):

        # Defaults
        if post_cast is None:
            post_cast = []

        # Schema
        self.__schema = None
        if isinstance(schema, (compat.str, dict)):
            self.__schema = Schema(schema)

        # Tabulator
        if backend is None:
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)
            self.__stream.open()
            if self.__schema is None:
                self.__schema = Schema(
                    infer(self.__stream.headers, self.__stream.sample))

        # Storage
        else:
            module = 'tableschema.plugins.%s' % backend
            storage = import_module(module).Storage(**options)
            generator = partial(storage.iter, source)
            if self.__schema is None:
                self.__schema = Schema(storage.describe(source))
            storage.describe(source, self.__schema.descriptor)
            self.__stream = Stream(generator, headers=self.__schema.headers)
            self.__stream.open()

        # Attributes
        self.__post_cast = post_cast

    @property
    def stream(self):
        """tabulator.Stream: stream instance
        """
        return self.__stream

    @property
    def schema(self):
        """Schema: schema instance
        """
        return self.__schema

    def iter(self, keyed=False, extended=False):
        """Yields table rows.

        Args:
            keyed (bool): yield keyed rows
            extended (bool): yield extended rows

        Yields:
            mixed[]/mixed{}: row or keyed row or extended row

        """
        self.__stream.reset()
        iterator = self.__stream.iter(extended=True)
        iterator = self.__apply_processors(iterator)
        for number, headers, row in iterator:
            if extended:
                yield (number, headers, row)
            elif keyed:
                yield dict(zip(headers, row))
            else:
                yield row

    def read(self, keyed=False, extended=False, limit=None):
        """Read table rows.

        Args:
            limit (int): maximum number of rows to return

        Returns:
            list[]: table rows

        """
        result = []
        rows = self.iter(keyed=keyed, extended=extended)
        for count, row in enumerate(rows, start=1):
            result.append(row)
            if count == limit:
                break
        return result

    def save(self, target, backend=None, **options):
        """Save table rows.

        NOTE: To save schema use `table.schema.save(target)`

        Args:
            target (str): saving target
            backend (None/str): backend name like `sql` or `bigquery`
            options (dict): tabulator options or backend options

        Returns:
            None/Storage: storage instance if backend used

        """

        # Tabulator
        if backend is None:
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)

        # Storage
        else:
            module = 'tableschema.plugins.%s' % backend
            storage = import_module(module).Storage(**options)
            storage.create(target, self.__schema.descriptor, force=True)
            storage.write(target, self.iter())
            return storage

    # Internal

    def __apply_processors(self, iterator):

        # Apply processors to iterator
        def builtin_processor(extended_rows):
            for number, headers, row in extended_rows:
                headers = self.__schema.headers
                row = self.__schema.cast_row(row)
                yield (number, headers, row)

        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)
        return iterator
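
With this older API the storage backend is selected by name; a sketch of saving through the sql plugin, assuming it is installed and accepts a SQLAlchemy engine:

from sqlalchemy import create_engine

engine = create_engine('sqlite://')  # in-memory database, for the sketch only
table = Table('data.csv')            # schema inferred from the file
storage = table.save('articles', backend='sql', engine=engine)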
Example #5
class Table(object):

    # Public

    def __init__(self,
                 source,
                 schema=None,
                 strict=False,
                 post_cast=[],
                 storage=None,
                 **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Set attributes
        self.__source = source
        self.__stream = None
        self.__schema = None
        self.__headers = None
        self.__storage = None
        self.__post_cast = copy(post_cast)

        # Schema
        if isinstance(schema, Schema):
            self.__schema = schema
        elif schema is not None:
            self.__schema = Schema(schema)

        # Stream (tabulator)
        if storage is None:
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)

        # Stream (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            if self.__schema:
                storage.describe(source, self.__schema.descriptor)
            headers = Schema(storage.describe(source)).field_names
            self.__stream = Stream(partial(storage.iter, source),
                                   headers=headers)
            self.__storage = storage

    @property
    def headers(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__headers

    @property
    def schema(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__schema

    def iter(self, keyed=False, extended=False, cast=True, relations=False):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Prepare unique checks
        if cast:
            unique_fields_cache = {}
            if self.schema:
                unique_fields_cache = _create_unique_fields_cache(self.schema)

        # Open/iterate stream
        self.__stream.open()
        iterator = self.__stream.iter(extended=True)
        iterator = self.__apply_processors(iterator, cast=cast)
        for row_number, headers, row in iterator:

            # Get headers
            if not self.__headers:
                self.__headers = headers

            # Check headers
            if cast:
                if self.schema and self.headers:
                    if self.headers != self.schema.field_names:
                        self.__stream.close()
                        message = 'Table headers don\'t match schema field names'
                        raise exceptions.CastError(message)

            # Check unique
            if cast:
                for indexes, cache in unique_fields_cache.items():
                    values = tuple(value for i, value in enumerate(row)
                                   if i in indexes)
                    if not all(map(lambda value: value is None, values)):
                        if values in cache['data']:
                            self.__stream.close()
                            message = 'Field(s) "%s" duplicates in row "%s"'
                            message = message % (cache['name'], row_number)
                            raise exceptions.CastError(message)
                        cache['data'].add(values)

            # Resolve relations
            if relations:
                if self.schema:
                    for foreign_key in self.schema.foreign_keys:
                        row = _resolve_relations(row, headers, relations,
                                                 foreign_key)
                        if row is None:
                            self.__stream.close()
                            message = 'Foreign key "%s" violation in row "%s"'
                            message = message % (foreign_key['fields'],
                                                 row_number)
                            raise exceptions.RelationError(message)

            # Form row
            if extended:
                yield (row_number, headers, row)
            elif keyed:
                yield dict(zip(headers, row))
            else:
                yield row

        # Close stream
        self.__stream.close()

    def read(self,
             keyed=False,
             extended=False,
             cast=True,
             relations=False,
             limit=None):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        result = []
        rows = self.iter(keyed=keyed,
                         extended=extended,
                         cast=cast,
                         relations=relations)
        for count, row in enumerate(rows, start=1):
            result.append(row)
            if count == limit:
                break
        return result

    def infer(self, limit=100, confidence=0.75):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        if self.__schema is None or self.__headers is None:

            # Infer (tabulator)
            if not self.__storage:
                with self.__stream as stream:
                    if self.__schema is None:
                        self.__schema = Schema()
                        self.__schema.infer(stream.sample[:limit],
                                            headers=stream.headers,
                                            confidence=confidence)
                    if self.__headers is None:
                        self.__headers = stream.headers

            # Infer (storage)
            else:
                descriptor = self.__storage.describe(self.__source)
                if self.__schema is None:
                    self.__schema = Schema(descriptor)
                if self.__headers is None:
                    self.__headers = self.__schema.field_names

        return self.__schema.descriptor

    def save(self, target, storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Save (tabulator)
        if storage is None:
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)
            return True

        # Save (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            storage.create(target, self.__schema.descriptor, force=True)
            storage.write(target, self.iter(cast=False))
            return storage

    # Private

    def __apply_processors(self, iterator, cast=True):

        # Apply processors to iterator
        def builtin_processor(extended_rows):
            for row_number, headers, row in extended_rows:
                if self.__schema and cast:
                    row = self.__schema.cast_row(row)
                yield (row_number, headers, row)

        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)

        return iterator
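
This variant resolves foreign keys against an in-memory relations mapping keyed by resource name; a sketch, assuming schema.json declares a foreign key pointing at a people resource:

relations = {
    'people': [
        {'id': 1, 'name': 'alice'},
        {'id': 2, 'name': 'bob'},
    ],
}
table = Table('data.csv', schema='schema.json')
for row in table.iter(keyed=True, relations=relations):
    print(row)  # FK fields come back resolved against the referenced rows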
Example #6
class Table(object):
    """Table representation

    # Arguments
      source (str/list[]): data source, one of:
        - local file (path)
        - remote file (url)
        - array of arrays representing the rows
      schema (any): data schema in all forms supported by `Schema` class
      strict (bool): strictness option to pass to `Schema` constructor
      post_cast (function[]): list of post cast processors
      storage (None/str): storage name like `sql` or `bigquery`
      options (dict): `tabulator` or storage's options

    # Raises
      TableSchemaException: raises on any error

    """

    # Public

    def __init__(self,
                 source,
                 schema=None,
                 strict=False,
                 post_cast=[],
                 storage=None,
                 **options):

        # Set attributes
        self.__source = source
        self.__stream = None
        self.__schema = None
        self.__headers = None
        self.__storage = None
        self.__post_cast = copy(post_cast)

        # Schema
        if isinstance(schema, Schema):
            self.__schema = schema
        elif schema is not None:
            self.__schema = Schema(schema)

        # Stream (tabulator)
        if storage is None:
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)

        # Stream (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            if self.__schema:
                storage.describe(source, self.__schema.descriptor)
            headers = Schema(storage.describe(source)).field_names
            self.__stream = Stream(partial(storage.iter, source),
                                   headers=headers)
            self.__storage = storage

    @property
    def headers(self):
        """Table's headers is available

        # Returns
            str[]: headers

        """
        return self.__headers

    @property
    def schema(self):
        """Returns schema class instance if available

        # Returns
            Schema: schema

        """
        return self.__schema

    @property
    def size(self):
        """Table's size in BYTES if it's available

        If it's already read using e.g. `table.read`, otherwise returns `None`.
        In the middle of an iteration it returns size of already read contents

        # Returns
            int/None: size in BYTES

        """
        if self.__stream:
            return self.__stream.size

    @property
    def hash(self):
        """Table's SHA256 hash if it's available.

        If it's already read using e.g. `table.read`, otherwise returns `None`.
        In the middle of an iteration it returns hash of already read contents

        # Returns
            str/None: SHA256 hash

        """
        if self.__stream:
            return self.__stream.hash

    def iter(self,
             keyed=False,
             extended=False,
             cast=True,
             integrity=False,
             relations=False,
             foreign_keys_values=False,
             exc_handler=None):
        """Iterates through the table data and emits rows cast based on table schema.

        # Arguments

            keyed (bool):
                yield keyed rows in a form of `{header1\\: value1, header2\\: value2}`
                (default is false; the form of rows is `[value1, value2]`)

            extended (bool):
                yield extended rows in a form of `[rowNumber, [header1, header2], [value1, value2]]`
                (default is false; the form of rows is `[value1, value2]`)

            cast (bool):
                disable data casting if false
                (default is true)

            integrity (dict):
                dictionary in a form of `{'size'\\: <bytes>, 'hash'\\: '<sha256>'}`
                to check integrity of the table when it's read completely.
                Both keys are optional.

            relations (dict):
                dictionary of foreign key references in a form
                of `{resource1\\: [{field1\\: value1, field2\\: value2}, ...], ...}`.
                If provided, foreign key fields will be checked and resolved
                to one of their references (note: one-to-many FKs are not completely resolved).

            foreign_keys_values (dict):
                three-level dictionary of foreign key references optimized
                to speed up validation process in a form of
                `{resource1\\: {(fk_field1, fk_field2)\\: {(value1, value2)\\: {one_keyedrow}, ... }}}`.
                If not provided but relations is true, it will be created
                before the validation process by *index_foreign_keys_values* method

            exc_handler (func):
                optional custom exception handler callable.
                Can be used to defer raising errors (i.e. "fail late"), e.g.
                for data validation purposes. Must support the signature below

        # Custom exception handler

        ```python
        def exc_handler(exc, row_number=None, row_data=None, error_data=None):
            '''Custom exception handler (example)

            # Arguments:
                exc(Exception):
                    Deferred exception instance
                row_number(int):
                    Data row number that triggers exception exc
                row_data(OrderedDict):
                    Invalid data row source data
                error_data(OrderedDict):
                    Data row source data field subset responsible for the error, if
                    applicable (e.g. invalid primary or foreign key fields). May be
                    identical to row_data.
            '''
            # ...
        ```

        # Raises
            TableSchemaException: base class of any error
            CastError: data cast error
            IntegrityError: integrity checking error
            UniqueKeyError: unique key constraint violation
            UnresolvedFKError: unresolved foreign key reference error

        # Returns
            Iterator[list]: yields rows

        """
        # TODO: Use helpers.default_exc_handler instead. Prerequisite: Use
        # stream context manager to make sure the stream gets properly closed
        # in all situations, see comment below.
        if exc_handler is None:
            stream = self.__stream

            def exc_handler(exc, *args, **kwargs):
                stream.close()
                raise exc

        # Prepare unique checks
        if cast:
            unique_fields_cache = {}
            if self.schema:
                unique_fields_cache = _create_unique_fields_cache(self.schema)
        # Prepare relation checks
        if relations and not foreign_keys_values:
            # we have to test relations but the index has not been precomputed
            # prepare the index to boost validation process
            foreign_keys_values = self.index_foreign_keys_values(relations)

        # Open/iterate stream
        # TODO: Use context manager instead to make sure stream gets closed in
        # case of exceptions. Leaving that in for now for the sake of a smaller
        # diff.
        self.__stream.open()
        iterator = self.__stream.iter(extended=True)
        iterator = self.__apply_processors(iterator,
                                           cast=cast,
                                           exc_handler=exc_handler)
        for row_number, headers, row in iterator:

            # Get headers
            if not self.__headers:
                self.__headers = headers

            # Check headers
            if cast:
                if self.schema and self.headers:
                    if self.headers != self.schema.field_names:
                        message = ('Table headers (%r) don\'t match '
                                   'schema field names (%r) in row %s' %
                                   (self.headers, self.schema.field_names,
                                    row_number))
                        keyed_row = OrderedDict(zip(headers, row))
                        exc_handler(exceptions.CastError(message),
                                    row_number=row_number,
                                    row_data=keyed_row,
                                    error_data=keyed_row)
                        continue

            # Check unique
            if cast:
                for indexes, cache in unique_fields_cache.items():
                    keyed_values = OrderedDict((headers[i], value)
                                               for i, value in enumerate(row)
                                               if i in indexes)
                    values = tuple(keyed_values.values())
                    if not all(map(lambda value: value is None, values)):
                        if values in cache['data']:
                            message = ('Field(s) "%s" duplicates in row "%s" '
                                       'for values %r' %
                                       (cache['name'], row_number, values))
                            exc_handler(exceptions.UniqueKeyError(message),
                                        row_number=row_number,
                                        row_data=OrderedDict(zip(headers,
                                                                 row)),
                                        error_data=keyed_values)
                        cache['data'].add(values)

            # Resolve relations
            if relations:
                if self.schema:
                    row_with_relations = dict(zip(headers, copy(row)))
                    for foreign_key in self.schema.foreign_keys:
                        refValue = _resolve_relations(row, headers,
                                                      foreign_keys_values,
                                                      foreign_key)
                        if refValue is None:
                            keyed_row = OrderedDict(zip(headers, row))
                            # local values of the FK
                            local_keyed_values = {
                                key: keyed_row[key]
                                for key in foreign_key['fields']
                            }
                            local_values = tuple(local_keyed_values.values())
                            message = (
                                'Foreign key "%s" violation in row "%s": '
                                '%s not found in %s' %
                                (foreign_key['fields'], row_number,
                                 local_values,
                                 foreign_key['reference']['resource']))
                            exc_handler(exceptions.UnresolvedFKError(message),
                                        row_number=row_number,
                                        row_data=keyed_row,
                                        error_data=local_keyed_values)
                            # If we reach this point we don't fail-early
                            # i.e. no exception has been raised. As the
                            # reference can't be resolved, use empty dict
                            # as the "unresolved result".
                            for field in foreign_key['fields']:
                                if not isinstance(row_with_relations[field],
                                                  dict):
                                    row_with_relations[field] = {}
                        elif type(refValue) is dict:
                            # Substitute resolved referenced object for
                            # original referencing field value.
                            # For a composite foreign key, this substitutes
                            # each part of the composite key with the
                            # referenced object.
                            for field in foreign_key['fields']:
                                if type(row_with_relations[field]) is not dict:
                                    # no previous refValues injected on this field
                                    row_with_relations[field] = refValue
                                else:
                                    # already one ref on this field; merge them
                                    row_with_relations[field].update(refValue)
                        else:
                            # case when all original values of the FK are empty:
                            # refValue == row and there is nothing to do
                            # (an empty dict might be a better return value here)
                            pass

                    # mutate row now that we are done, in the right order
                    row = [row_with_relations[f] for f in headers]

            # Form row
            if extended:
                yield (row_number, headers, row)
            elif keyed:
                yield dict(zip(headers, row))
            else:
                yield row

        # Check integrity
        if integrity:
            violations = []
            size = integrity.get('size')
            hash = integrity.get('hash')
            if size and size != self.__stream.size:
                violations.append('size "%s"' % self.__stream.size)
            if hash and hash != self.__stream.hash:
                violations.append('hash "%s"' % self.__stream.hash)
            if violations:
                message = 'Calculated %s differ(s) from declared value(s)'
                raise exceptions.IntegrityError(message %
                                                ' and '.join(violations))

        # Close stream
        self.__stream.close()

    def read(self,
             keyed=False,
             extended=False,
             cast=True,
             limit=None,
             integrity=False,
             relations=False,
             foreign_keys_values=False,
             exc_handler=None):
        """Read the whole table and return as array of rows

        > It has the same API as `table.iter` except for

        # Arguments
            limit (int): limit count of rows to read and return

        # Returns
            list[]: returns rows

        """
        result = []
        rows = self.iter(keyed=keyed,
                         extended=extended,
                         cast=cast,
                         integrity=integrity,
                         relations=relations,
                         foreign_keys_values=foreign_keys_values,
                         exc_handler=exc_handler)
        for count, row in enumerate(rows, start=1):
            result.append(row)
            if count == limit:
                break
        return result

    def infer(self, limit=100, confidence=0.75):
        """Infer a schema for the table.

        It will infer and set Table Schema to `table.schema` based on table data.

        # Arguments
            limit (int): limit rows sample size
            confidence (float):
                how many casting errors are allowed
                (as a ratio, between 0 and 1)

        # Returns
            dict: Table Schema descriptor

        """
        if self.__schema is None or self.__headers is None:

            # Infer (tabulator)
            if not self.__storage:
                with self.__stream as stream:
                    if self.__schema is None:
                        self.__schema = Schema()
                        self.__schema.infer(stream.sample[:limit],
                                            headers=stream.headers,
                                            confidence=confidence)
                    if self.__headers is None:
                        self.__headers = stream.headers

            # Infer (storage)
            else:
                descriptor = self.__storage.describe(self.__source)
                if self.__schema is None:
                    self.__schema = Schema(descriptor)
                if self.__headers is None:
                    self.__headers = self.__schema.field_names

        return self.__schema.descriptor

    def save(self, target, storage=None, **options):
        """Save data source to file locally in CSV format with `,` (comma) delimiter

        > To save schema use `table.schema.save()`

        # Arguments
            target (str): saving target (e.g. file path)
            storage (None/str): storage name like `sql` or `bigquery`
            options (dict): `tabulator` or storage options

        # Raises
            TableSchemaException: raises an error if there is saving problem

        # Returns
            True/Storage: returns true or storage instance

        """

        # Save (tabulator)
        if storage is None:
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)
            return True

        # Save (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            storage.create(target, self.__schema.descriptor, force=True)
            storage.write(target, self.iter(cast=False))
            return storage

    def index_foreign_keys_values(self, relations):
        """Creates a three-level dictionary of foreign key references

        The index is optimized to speed up the validation process and has the form
        `{resource1: {(fk_field1, fk_field2): {(value1, value2): {one_keyedrow}, ... }}}`.

        For each foreign key of the schema it will iterate through the corresponding
        `relations['resource']` to create an index (i.e. a dict) of existing values
        for the foreign fields and store one keyed row for each value combination.

        The optimization relies on indexing the possible values of one foreign key
        in a hashmap to later speed up resolution.

        This method is public so the index can be created once and applied
        to multiple tables sharing the same schema
        (typically [grouped resources in datapackage](https://github.com/frictionlessdata/datapackage-py#group))

        # Notes

        - the second key of the output is a tuple of the foreign fields,
            a proxy identifier of the foreign key
        - the same relation resource can be indexed multiple times
            as a schema can contain more than one Foreign Keys
            pointing to the same resource

        # Arguments
            relations (dict):
                dict of foreign key references in a form of
                `{resource1\\: [{field1\\: value1, field2\\: value2}, ...], ...}`.
                It must contain all resources pointed in the foreign keys schema definition.

        # Returns
            dict:
                returns a three-level dictionary of foreign key references
                optimized to speed up validation process in a form of
                `{resource1\\: {(fk_field1, fk_field2)\\: {(value1, value2)\\: {one_keyedrow}, ... }}})`

        """

        # We don't need to load the complete reference table to test relations;
        # we can lower the payload AND optimize foreign key testing by
        # preparing the right index based on the foreign key definition.
        # The index maps, per reference and foreign-key fields, every possible
        # value combination in the foreign table to one keyed row:
        # foreign_keys[reference][fk_fields_tuple] = {(fk_values,): keyed_row, ...}
        foreign_keys = defaultdict(dict)
        if self.schema:
            for fk in self.schema.foreign_keys:
                # load relation data
                relation = fk['reference']['resource']

                # To optimize, we prepare an index of existing values.
                # The index is keyed by reference + foreign-key fields,
                # because many foreign keys may use the same reference.
                fk_fields = tuple(fk['reference']['fields'])
                foreign_keys[relation][fk_fields] = {}
                for row in relations[relation]:
                    key = tuple(row[field] for field in fk_fields)
                    # We have to choose which matching row to pick; the
                    # previous implementation picked the first, so be it.
                    if key not in foreign_keys[relation][fk_fields]:
                        foreign_keys[relation][fk_fields][key] = row
        return foreign_keys

    # Private

    def __apply_processors(self, iterator, cast=True, exc_handler=None):

        # Apply processors to iterator
        def builtin_processor(extended_rows):
            for row_number, headers, row in extended_rows:
                if self.__schema and cast:
                    row = self.__schema.cast_row(row,
                                                 row_number=row_number,
                                                 exc_handler=exc_handler)
                yield (row_number, headers, row)

        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)

        return iterator
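
A fail-late sketch built on the handler signature documented above: errors are collected instead of raised, so the iteration can finish:

errors = []

def collecting_handler(exc, row_number=None, row_data=None, error_data=None):
    # Record the problem instead of raising, so iteration continues.
    errors.append((row_number, exc))

table = Table('data.csv', schema='schema.json')
rows = table.read(keyed=True, exc_handler=collecting_handler)
print('{} rows read, {} deferred errors'.format(len(rows), len(errors)))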
Example #7
class Table(object):

    # Public

    def __init__(self, source, schema=None, strict=False,
                 post_cast=[], storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Set attributes
        self.__source = source
        self.__stream = None
        self.__schema = None
        self.__headers = None
        self.__storage = None
        self.__post_cast = copy(post_cast)

        # Schema
        if schema is not None:
            self.__schema = Schema(schema)

        # Stream (tabulator)
        if storage is None:
            options.setdefault('headers', 1)
            self.__stream = Stream(source, **options)

        # Stream (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            if self.__schema:
                storage.describe(source, self.__schema.descriptor)
            headers = Schema(storage.describe(source)).field_names
            self.__stream = Stream(partial(storage.iter, source), headers=headers)
            self.__storage = storage

    @property
    def headers(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__headers

    @property
    def schema(self):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        return self.__schema

    def iter(self, keyed=False, extended=False, cast=True, relations=False):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Prepare unique checks
        if cast:
            unique_fields_cache = {}
            if self.schema:
                unique_fields_cache = _create_unique_fields_cache(self.schema)

        # Open/iterate stream
        self.__stream.open()
        iterator = self.__stream.iter(extended=True)
        iterator = self.__apply_processors(iterator, cast=cast)
        for row_number, headers, row in iterator:

            # Get headers
            if not self.__headers:
                self.__headers = headers

            # Check headers
            if cast:
                if self.schema and self.headers:
                    if self.headers != self.schema.field_names:
                        self.__stream.close()
                        message = 'Table headers don\'t match schema field names'
                        raise exceptions.CastError(message)

            # Check unique
            if cast:
                for indexes, cache in unique_fields_cache.items():
                    values = tuple(value for i, value in enumerate(row) if i in indexes)
                    if not all(map(lambda value: value is None, values)):
                        if values in cache['data']:
                            self.__stream.close()
                            message = 'Field(s) "%s" duplicates in row "%s"'
                            message = message % (cache['name'], row_number)
                            raise exceptions.CastError(message)
                        cache['data'].add(values)

            # Resolve relations
            if relations:
                if self.schema:
                    for foreign_key in self.schema.foreign_keys:
                        row = _resolve_relations(row, headers, relations, foreign_key)
                        if row is None:
                            self.__stream.close()
                            message = 'Foreign key "%s" violation in row "%s"'
                            message = message % (foreign_key['fields'], row_number)
                            raise exceptions.RelationError(message)

            # Form row
            if extended:
                yield (row_number, headers, row)
            elif keyed:
                yield dict(zip(headers, row))
            else:
                yield row

        # Close stream
        self.__stream.close()

    def read(self, keyed=False, extended=False, cast=True, relations=False, limit=None):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        result = []
        rows = self.iter(keyed=keyed, extended=extended, cast=cast, relations=relations)
        for count, row in enumerate(rows, start=1):
            result.append(row)
            if count == limit:
                break
        return result

    def infer(self, limit=100):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """
        if self.__schema is None or self.__headers is None:

            # Infer (tabulator)
            if not self.__storage:
                with self.__stream as stream:
                    if self.__schema is None:
                        self.__schema = Schema()
                        self.__schema.infer(stream.sample[:limit], headers=stream.headers)
                    if self.__headers is None:
                        self.__headers = stream.headers

            # Infer (storage)
            else:
                descriptor = self.__storage.describe(self.__source)
                if self.__schema is None:
                    self.__schema = Schema(descriptor)
                if self.__headers is None:
                    self.__headers = self.__schema.field_names

        return self.__schema.descriptor

    def save(self, target, storage=None, **options):
        """https://github.com/frictionlessdata/tableschema-py#schema
        """

        # Save (tabulator)
        if storage is None:
            with Stream(self.iter, headers=self.__schema.headers) as stream:
                stream.save(target, **options)
            return True

        # Save (storage)
        else:
            if not isinstance(storage, Storage):
                storage = Storage.connect(storage, **options)
            storage.create(target, self.__schema.descriptor, force=True)
            storage.write(target, self.iter(cast=False))
            return storage

    # Private

    def __apply_processors(self, iterator, cast=True):

        # Apply processors to iterator
        def builtin_processor(extended_rows):
            for row_number, headers, row in extended_rows:
                if self.__schema and cast:
                    row = self.__schema.cast_row(row)
                yield (row_number, headers, row)
        processors = [builtin_processor] + self.__post_cast
        for processor in processors:
            iterator = processor(iterator)

        return iterator
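
Every entry in post_cast has the same contract as builtin_processor above: it receives the extended-row iterator and yields (row_number, headers, row) tuples. A sketch that strips whitespace from string values:

def strip_strings(extended_rows):
    # post-cast processor: same (row_number, headers, row) contract
    for row_number, headers, row in extended_rows:
        row = [value.strip() if isinstance(value, str) else value
               for value in row]
        yield (row_number, headers, row)

table = Table('data.csv', post_cast=[strip_strings])
rows = table.read(keyed=True)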