def test_indexes_fields(dialect, database_url):
    engine = create_engine(database_url)
    storage = Storage(engine=engine, prefix='test_indexes_fields_')
    storage.delete()
    storage.create(['articles'], [ARTICLES['schema']],
                   indexes_fields=[[['rating'], ['name']]])
    storage.write('articles', ARTICLES['data'])
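    # Reflect the database and check that the index requested via
    # indexes_fields was actually created on the table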
    inspector = reflection.Inspector.from_engine(engine)
    indexes = [
        inspector.get_indexes(table_name)
        for table_name in inspector.get_table_names()
    ][0]
    assert indexes
    def process_resource(self, resource: ResourceWrapper):
        resource_name = resource.res.name
        if resource_name not in self.converted_resources:
            return resource
        else:
            converted_resource = self.converted_resources[resource_name]
            mode = converted_resource.get('mode', 'rewrite')
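            # 'rewrite' drops and recreates the target table before writing;
            # 'update' upserts rows using the configured update keys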
            table_name = converted_resource['table-name']
            storage = Storage(self.engine, prefix=table_name)
            if mode == 'rewrite' and '' in storage.buckets:
                storage.delete('')
            schema_descriptor = resource.res.descriptor['schema']
            schema = self.normalize_schema_for_engine(self.engine.dialect.name,
                                                      schema_descriptor)
            if '' not in storage.buckets:
                logging.info('Creating DB table %s', table_name)
                try:
                    storage.create('', schema)
                except ValidationError as e:
                    logging.error('Error validating schema %r',
                                  schema_descriptor)
                    for err in e.errors:
                        logging.error('Error validating schema: %s', err)
                    raise
            else:
                storage.describe('', schema)

            update_keys = None
            if mode == 'update':
                update_keys = converted_resource.get('update_keys')
                if update_keys is None:
                    update_keys = schema_descriptor.get('primaryKey', [])
            logging.info('Writing to DB %s -> %s (mode=%s, keys=%s)',
                         resource_name, table_name, mode, update_keys)
            return map(
                self.get_output_row,
                storage.write(
                    '',
                    self.normalize_for_engine(self.engine.dialect.name,
                                              resource, schema_descriptor),
                    keyed=True,
                    as_generator=True,
                    update_keys=update_keys,
                    buffer_size=self.batch_size,
                    use_bloom_filter=self.use_bloom_filter,
                ))
def test_storage_only_parameter():
    RESOURCE = {
        'schema': {
            'fields': [
                {
                    'name': 'person_id',
                    'type': 'integer',
                    'constraints': {
                        'required': True
                    }
                },
                {
                    'name': 'name',
                    'type': 'string'
                },
            ],
            'primaryKey': 'person_id',
        },
        'data': []
    }

    # Create storage
    engine = create_engine(os.environ['POSTGRES_URL'], echo=True)
    storage = Storage(engine=engine, prefix='test_only_')

    # Delete buckets
    storage.delete()

    # Create buckets
    storage.create('names', RESOURCE['schema'], indexes_fields=[['person_id']])

    # Recreate storage limiting reflection
    only = lambda table: 'name' not in table
    engine = create_engine(os.environ['POSTGRES_URL'], echo=True)
    storage = Storage(engine=engine, prefix='test_only_', reflect_only=only)

    # Delete non existent bucket
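    # ('names' exists in the database, but reflect_only filters out tables
    # whose name contains 'name', so this storage does not see that bucket)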
    with pytest.raises(tableschema.exceptions.StorageError):
        storage.delete('names')
    def handle_resource(self, resource, spec, parameters, datapackage):
        resource_name = spec['name']
        if resource_name not in self.converted_resources:
            return resource
        else:
            converted_resource = self.converted_resources[resource_name]
            mode = converted_resource.get('mode', 'rewrite')
            table_name = converted_resource['table-name']
            storage = Storage(self.engine, prefix=table_name)
            if mode == 'rewrite' and '' in storage.buckets:
                storage.delete('')
            schema = self.normalise_schema_for_engine(self.engine.dialect.name,
                                                      spec['schema'])
            if '' not in storage.buckets:
                logging.info('Creating DB table %s', table_name)
                try:
                    storage.create('', schema)
                except ValidationError as e:
                    logging.error('Error validating schema %r', spec['schema'])
                    for err in e.errors:
                        logging.error('Error validating schema: %s', err)
                    raise
            else:
                storage.describe('', schema)

            update_keys = None
            if mode == 'update':
                update_keys = converted_resource.get('update_keys')
                if update_keys is None:
                    update_keys = spec['schema'].get('primaryKey', [])
            logging.info('Writing to DB %s -> %s (mode=%s, keys=%s)',
                         resource_name, table_name, mode, update_keys)
            return map(
                self.get_output_row,
                storage.write('',
                              self.normalise_for_engine(
                                  self.engine.dialect.name, resource, spec),
                              keyed=True,
                              as_generator=True,
                              update_keys=update_keys))
def test_storage_update(use_bloom_filter, buffer_size):
    RESOURCE = {
        'schema': {
            'fields': [
                {
                    'name': 'person_id',
                    'type': 'integer',
                    'constraints': {
                        'required': True
                    }
                },
                {
                    'name': 'name',
                    'type': 'string',
                    'constraints': {
                        'required': True
                    }
                },
                {
                    'name': 'favorite_color',
                    'type': 'string'
                },
            ],
            'primaryKey': 'person_id',
        },
        'data': [
            ['1', 'ulysses', 'blue'],
            ['2', 'theseus', 'green'],
            ['3', 'perseus', 'red'],
            ['4', 'dedalus', 'yellow'],
        ],
        'updateData': [
            ['5', 'apollo', 'orange'],
            ['3', 'perseus', 'magenta'],
            ['6', 'zeus', 'grey'],
            ['4', 'dedalus', 'sunshine'],
            ['5', 'apollo', 'peach'],
        ],
    }

    # Create storage
    update_keys = ['person_id', 'name']
    engine = create_engine(os.environ['POSTGRES_URL'])
    storage = Storage(engine=engine,
                      prefix='test_update_',
                      autoincrement='__id')

    # Delete buckets
    storage.delete()

    # Create buckets
    storage.create('colors', RESOURCE['schema'])

    # Write data to buckets
    storage.write('colors', RESOURCE['data'], update_keys=update_keys)
    gen = storage.write('colors',
                        RESOURCE['updateData'],
                        update_keys=update_keys,
                        as_generator=True,
                        use_bloom_filter=use_bloom_filter,
                        buffer_size=buffer_size)
    gen = list(gen)
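    # ids 3 and 4 already exist and the second 'apollo' row updates the id 5
    # inserted earlier in the same write, so 3 of the 5 rows count as updated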
    assert len(gen) == 5
    assert len(list(filter(lambda i: i.updated, gen))) == 3
    assert list(map(lambda i: i.updated_id, gen)) == [5, 3, 6, 4, 5]

    # Reflect storage
    storage = Storage(engine=engine,
                      prefix='test_update_',
                      autoincrement='__id')
    gen = storage.write('colors',
                        RESOURCE['updateData'],
                        update_keys=update_keys,
                        as_generator=True,
                        use_bloom_filter=use_bloom_filter,
                        buffer_size=buffer_size)
    gen = list(gen)
    assert len(gen) == 5
    assert len(list(filter(lambda i: i.updated, gen))) == 5
    assert list(map(lambda i: i.updated_id, gen)) == [5, 3, 6, 4, 5]

    # Create new storage to use reflection only
    storage = Storage(engine=engine, prefix='test_update_')

    # Assert data
    rows = list(storage.iter('colors'))
    assert len(rows) == 6
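    # The table was created with autoincrement='__id', so each reflected row is
    # (__id, person_id, name, favorite_color); hence indexes 1 and 3 below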
    color_by_person = {row[1]: row[3] for row in rows}
    assert color_by_person == {
        1: 'blue',
        2: 'green',
        3: 'magenta',
        4: 'sunshine',
        5: 'peach',
        6: 'grey'
    }

    # Storage without autoincrement
    storage = Storage(engine=engine, prefix='test_update_')
    storage.delete()
    storage.create('colors', RESOURCE['schema'])
    storage.write('colors',
                  RESOURCE['data'],
                  update_keys=update_keys,
                  use_bloom_filter=use_bloom_filter,
                  buffer_size=buffer_size)
    gen = storage.write('colors',
                        RESOURCE['updateData'],
                        update_keys=update_keys,
                        as_generator=True,
                        use_bloom_filter=use_bloom_filter,
                        buffer_size=buffer_size)
    gen = list(gen)
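    # Without an autoincrement column the writer cannot report row ids:
    # three rows are still updated, but every updated_id is None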
    assert len(gen) == 5
    assert len(list(filter(lambda i: i.updated, gen))) == 3
    assert list(map(lambda i: i.updated_id,
                    gen)) == [None, None, None, None, None]
def test_storage_limited_databases(dialect, database_url):

    # Create storage
    engine = create_engine(database_url)
    storage = Storage(engine=engine, prefix='test_storage_')

    # Delete buckets
    storage.delete()

    # Create buckets
    storage.create(
        ['articles', 'comments'],
        [remove_fk(ARTICLES['schema']),
         remove_fk(COMMENTS['schema'])],
        indexes_fields=[[['rating'], ['name']], []])
    storage.create('comments', remove_fk(COMMENTS['schema']), force=True)
    storage.create('temporal', TEMPORAL['schema'])
    storage.create('location', LOCATION['schema'])
    storage.create('compound', COMPOUND['schema'])

    # Write data
    storage.write('articles', ARTICLES['data'])
    storage.write('comments', COMMENTS['data'])
    storage.write('temporal', TEMPORAL['data'])
    storage.write('location', LOCATION['data'])
    storage.write('compound', COMPOUND['data'])

    # Create new storage to use reflection only
    storage = Storage(engine=engine, prefix='test_storage_')

    # Create existent bucket
    with pytest.raises(tableschema.exceptions.StorageError):
        storage.create('articles', ARTICLES['schema'])

    # Assert buckets
    assert storage.buckets == [
        'articles', 'comments', 'compound', 'location', 'temporal'
    ]

    # Assert schemas
    assert storage.describe('articles') == {
        'fields': [
            {
                'name': 'id',
                'type': 'integer',
                'constraints': {
                    'required': True
                }
            },
            {
                'name': 'parent',
                'type': 'integer'
            },
            {
                'name': 'name',
                'type': 'string'
            },
            {
                'name': 'current',
                'type': 'boolean' if dialect == 'sqlite' else 'integer'
            },
            {
                'name': 'rating',
                'type': 'number'
            },
        ],
        'primaryKey': 'id',
        # foreignKeys not supported
    }
    assert storage.describe('comments') == {
        'fields': [
            {
                'name': 'entry_id',
                'type': 'integer',
                'constraints': {
                    'required': True
                }
            },
            {
                'name': 'comment',
                'type': 'string'
            },
            {
                'name': 'note',
                'type': 'string'
            },  # type downgrade
        ],
        'primaryKey': 'entry_id',
        # foreignKeys not supported
    }
    assert storage.describe('temporal') == {
        'fields': [
            {
                'name': 'date',
                'type': 'date'
            },
            {
                'name': 'date_year',
                'type': 'date'
            },  # format removal
            {
                'name': 'datetime',
                'type': 'datetime'
            },
            {
                'name': 'duration',
                'type': 'string'
            },  # type fallback
            {
                'name': 'time',
                'type': 'time'
            },
            {
                'name': 'year',
                'type': 'integer'
            },  # type downgrade
            {
                'name': 'yearmonth',
                'type': 'string'
            },  # type fallback
        ],
    }
    assert storage.describe('location') == {
        'fields': [
            {
                'name': 'location',
                'type': 'string'
            },  # type fallback
            {
                'name': 'geopoint',
                'type': 'string'
            },  # type fallback
        ],
    }
    assert storage.describe('compound') == {
        'fields': [
            {
                'name': 'stats',
                'type': 'string'
            },  # type fallback
            {
                'name': 'persons',
                'type': 'string'
            },  # type fallback
        ],
    }

    # Assert data
    assert storage.read('articles') == cast(ARTICLES)['data']
    assert storage.read('comments') == cast(COMMENTS)['data']
    assert storage.read('temporal') == cast(TEMPORAL,
                                            skip=['duration',
                                                  'yearmonth'])['data']
    assert storage.read('location') == cast(LOCATION,
                                            skip=['geojson',
                                                  'geopoint'])['data']
    assert storage.read('compound') == cast(COMPOUND, skip=['array',
                                                            'object'])['data']

    # Assert data with forced schema
    storage.describe('compound', COMPOUND['schema'])
    assert storage.read('compound') == cast(COMPOUND)['data']

    # Delete non existent bucket
    with pytest.raises(tableschema.exceptions.StorageError):
        storage.delete('non_existent')

    # Delete buckets
    storage.delete()
# Get resources
articles_schema = json.load(io.open('data/articles.json', encoding='utf-8'))
comments_schema = json.load(io.open('data/comments.json', encoding='utf-8'))
articles_data = topen('data/articles.csv', with_headers=True).read()
comments_data = topen('data/comments.csv', with_headers=True).read()

# Engine
engine = create_engine(os.environ['POSTGRES_URL'])

# Storage
storage = Storage(engine=engine, prefix='prefix_')

# Delete tables
for table in reversed(storage.tables):
    storage.delete(table)

# Create tables
storage.create(['articles', 'comments'], [articles_schema, comments_schema])

# Write data to tables
storage.write('articles', articles_data)
storage.write('comments', comments_data)

# List tables
print(storage.tables)

# Describe tables
print(storage.describe('articles'))
print(storage.describe('comments'))
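
# Read data from tables (a minimal sketch; assumes the same read() call used
# by the tests above)
print(storage.read('articles'))
print(storage.read('comments'))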
def test_storage():

    # Create storage
    engine = create_engine(os.environ['POSTGRES_URL'])
    storage = Storage(engine=engine, prefix='test_storage_')

    # Delete buckets
    storage.delete()

    # Create buckets
    storage.create(['articles', 'comments'],
                   [ARTICLES['schema'], COMMENTS['schema']],
                   indexes_fields=[[['rating'], ['name']], []])
    storage.create('comments', COMMENTS['schema'], force=True)
    storage.create('temporal', TEMPORAL['schema'])
    storage.create('location', LOCATION['schema'])
    storage.create('compound', COMPOUND['schema'])

    # Write data
    storage.write('articles', ARTICLES['data'])
    storage.write('comments', COMMENTS['data'])
    storage.write('temporal', TEMPORAL['data'])
    storage.write('location', LOCATION['data'])
    storage.write('compound', COMPOUND['data'])

    # Create new storage to use reflection only
    storage = Storage(engine=engine, prefix='test_storage_')

    # Create existent bucket
    with pytest.raises(tableschema.exceptions.StorageError):
        storage.create('articles', ARTICLES['schema'])

    # Assert buckets
    assert storage.buckets == [
        'articles', 'compound', 'location', 'temporal', 'comments'
    ]

    # Assert schemas
    assert storage.describe('articles') == ARTICLES['schema']
    assert storage.describe('comments') == {
        'fields': [
            {
                'name': 'entry_id',
                'type': 'integer',
                'constraints': {
                    'required': True
                }
            },
            {
                'name': 'comment',
                'type': 'string'
            },
            {
                'name': 'note',
                'type': 'string'
            },  # type downgrade
        ],
        'primaryKey': 'entry_id',
        'foreignKeys': [
            {
                'fields': 'entry_id',
                'reference': {
                    'resource': 'articles',
                    'fields': 'id'
                }
            },
        ],
    }
    assert storage.describe('temporal') == {
        'fields': [
            {
                'name': 'date',
                'type': 'date'
            },
            {
                'name': 'date_year',
                'type': 'date'
            },  # format removal
            {
                'name': 'datetime',
                'type': 'datetime'
            },
            {
                'name': 'duration',
                'type': 'string'
            },  # type fallback
            {
                'name': 'time',
                'type': 'time'
            },
            {
                'name': 'year',
                'type': 'integer'
            },  # type downgrade
            {
                'name': 'yearmonth',
                'type': 'string'
            },  # type fallback
        ],
    }
    assert storage.describe('location') == {
        'fields': [
            {
                'name': 'location',
                'type': 'object'
            },  # type downgrade
            {
                'name': 'geopoint',
                'type': 'string'
            },  # type fallback
        ],
    }
    assert storage.describe('compound') == {
        'fields': [
            {
                'name': 'stats',
                'type': 'object'
            },
            {
                'name': 'persons',
                'type': 'object'
            },  # type downgrade
        ],
    }

    # Assert data
    assert storage.read('articles') == cast(ARTICLES)['data']
    assert storage.read('comments') == cast(COMMENTS)['data']
    assert storage.read('temporal') == cast(TEMPORAL,
                                            skip=['duration',
                                                  'yearmonth'])['data']
    assert storage.read('location') == cast(LOCATION,
                                            skip=['geopoint'])['data']
    assert storage.read('compound') == cast(COMPOUND)['data']

    # Assert data with forced schema
    storage.describe('compound', COMPOUND['schema'])
    assert storage.read('compound') == cast(COMPOUND)['data']

    # Delete non existent bucket
    with pytest.raises(tableschema.exceptions.StorageError):
        storage.delete('non_existent')

    # Delete buckets
    storage.delete()
    def load_fdp_to_db(self, package, callback=noop):
        """
        Load an FDP to the database, create a babbage model and save it as well
        :param package: URL for the datapackage.json
        :param callback: callback to use to send progress updates
        """

        self.callback = callback
        self.package = package

        # Load and validate the datapackage
        self.status_update(status=STATUS_LOADING_DATAPACKAGE)
        self.dpo = DataPackage(package)
        self.status_update(status=STATUS_VALIDATING_DATAPACKAGE)
        self.dpo.validate()
        self.status_update(status=STATUS_LOADING_RESOURCE)
        resource = self.dpo.resources[0]
        schema = resource.descriptor['schema']

        # Use the cube manager to get the table name
        self.datapackage_name = self.dpo.descriptor['name']
        datapackage_owner = self.dpo.descriptor['owner']
        datapackage_author = self.dpo.descriptor['author']

        # Get the full name from the author field, and rewrite it without the email
        self.fullname, email_addr = email.utils.parseaddr(datapackage_author)
        email_addr = email_addr.split('@')[0] + '@not.shown'
        self.dpo.descriptor['author'] = '{0} <{1}>'.format(
            self.fullname, email_addr)
        self.dpo.descriptor.setdefault('private', True)

        self.model_name = "{0}:{1}".format(datapackage_owner,
                                           self.datapackage_name)
        table_name = table_name_for_package(datapackage_owner,
                                            self.datapackage_name)

        try:
            all_fields = set()
            field_translation = {}
            # Process schema - slugify field names
            for field in schema['fields']:
                name = database_name(field['name'], all_fields)
                all_fields.add(name)
                translated_field = {'name': name, 'type': field['type']}
                field_translation[field['name']] = translated_field

            storage_schema = {
                'fields': [{
                    'type': f['type'],
                    'name': field_translation[f['name']]['name'],
                    'format': f.get('format', 'default')
                } for f in schema['fields']],
                # Babbage likes just one primary key
                'primaryKey': '_id'
            }

            # Add Primary key to schema
            storage_schema['fields'].insert(0, {
                'name': '_id',
                'type': 'integer'
            })

            # Create Babbage Model
            self.status_update(status=STATUS_CREATING_BABBAGE_MODEL)
            self.model = fdp_to_model(self.dpo, table_name, resource,
                                      field_translation)

            if self.check_hashes(resource):
                # Create indexes
                indexes = set()
                primary_keys = schema.get('primaryKey', [])
                for dim in self.dpo.descriptor.get('model',
                                                   {}).get('dimensions',
                                                           {}).values():
                    attributes = dim.get('attributes', {})
                    for attribute in attributes.values():
                        source = attribute.get('source')
                        if source in primary_keys:
                            indexes.add((field_translation[source]['name'], ))
                        labelfor = attribute.get('labelfor')
                        if labelfor is not None:
                            labelfor = attributes.get(labelfor, {})
                            labelfor_source = labelfor.get('source')
                            if labelfor_source in primary_keys:
                                indexes.add((
                                    field_translation[labelfor_source]['name'],
                                    field_translation[source]['name'],
                                ))
                indexes = list(indexes)
                logging.error('INDEXES: %r', indexes)
                #
                # if dim['label'] in primary_keys:
                #     key_field = dim['attributes'][dim['key_attribute']]['label']
                #     key_field = field_translation[key_field]['name']
                #     indexes.append((key_field,))
                #
                #     label_field = dim['attributes'].get(dim.get('label_attribute'), {}).get('label')
                #     if label_field is not None:
                #         label_field = field_translation[label_field]['name']
                #         if label_field != key_field:
                #             indexes.append((key_field, label_field))

                # Load 1st resource data into DB
                # We use the prefix name so that JTS-SQL doesn't load all table data into memory
                storage = Storage(self.engine, prefix=table_name)
                faux_table_name = ''
                if faux_table_name in storage.buckets:
                    self.status_update(status=STATUS_DELETING_TABLE)
                    storage.delete(faux_table_name)
                self.status_update(status=STATUS_CREATING_TABLE)
                indexes_fields = None
                if indexes:
                    indexes_fields = [indexes]
                storage.create(faux_table_name,
                               storage_schema,
                               indexes_fields=indexes_fields)

                self.status_update(status=STATUS_LOADING_DATA_READY)
                row_processor = RowProcessor(resource.iter(keyed=True),
                                             self.status_update, schema,
                                             self.dpo.descriptor)
                storage.write(faux_table_name, row_processor.iter())

                cache = get_os_cache()
                if cache is not None:
                    logging.info('Clearing cache for context=%s',
                                 self.model_name)
                    cache.clear(self.model_name)

            response = {
                'model_name': self.model_name,
                'babbage_model': self.model,
                'package': self.dpo.descriptor
            }
            self.status_update(status=STATUS_DONE, data=response)

        except Exception:
            logging.exception('LOADING FAILED')
            self.status_update(status=STATUS_FAIL,
                               error=traceback.format_exc())
            return False

        return True
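
# A hypothetical invocation sketch (the `loader` name and the URL below are
# assumptions for illustration, not part of the original code):
#
#     loader.load_fdp_to_db('https://example.com/datapackage.json',
#                           callback=noop)
#
# On success the method returns True and the final status update carries the
# babbage model, the model name and the sanitised datapackage descriptor.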