def olap2sql(ctx, connection):
    """
    Automatically generates OLAP to SQL mappings using a normalized approach.

    Reuses the 'olap2sql.olapmapper' OlapMapper already registered in the
    context when present, creating and registering a fresh one otherwise.
    For every Fact found in the context, a star-schema mapper entity is
    generated; its SQL table is registered in the context and the mapper is
    appended to the shared OlapMapper.
    """
    mapper_registry = ctx.get('olap2sql.olapmapper', fail=False)
    if not mapper_registry:
        # First call: create and register the shared mapper.
        mapper_registry = olap.OlapMapper()
        ctx.add('olap2sql.olapmapper', mapper_registry)

    for fact in ctx.find(type=cubetl.olap.Fact):
        entity_mapper = OLAPToSQL.generate_star_schema_mapper_entity(
            ctx, connection, mapper_registry, fact)
        # Register the generated star-schema table so other components can find it.
        ctx.add(entity_mapper.sqltable.urn, entity_mapper.sqltable)
        mapper_registry.mappers.append(entity_mapper)
def sql2olap(ctx, debug=False, prefix="sql2olap"):
    """
    Generates a CubETL OLAP schema from an SQL schema defined by CubETL SQL
    components (such a schema can automatically be generated from an existing
    SQL database using the `sql2cubetl` function).

    The process can be controlled via context properties (`ctx.props`):

    * `sql2olap.table.<table_name>.type=ignore` ignores the given SQL table.
    * `sql2olap.<column_urn>.type=ignore` ignores the given SQL column.
    * `sql2olap.<column_urn>.type=attribute` forces the SQL column to be used as fact attribute.
    * `sql2olap.<column_urn>.type=dimension` forces the SQL column to be used as dimension.
    * `sql2olap.<column_urn>.type=measure` forces the SQL column to be used as measure.

    Details: this method works by walking objects of class SQLTable in the
    context and generating a cubetl.olap.Fact for each. Tables referenced via
    foreign keys are included as dimensions.

    Returns the context, with the generated OLAP entities added to it along
    with a combined `<prefix>.olapmapper` OlapMapper.
    """

    # TODO: New generation refactor:
    # - Create a new Dimension for each found field, unless configuration says
    #   they are the same dimension (or it can be deduced: same column name +
    #   size + same user type: raw dates, 0/1 boolean...).
    # - Implement querying; move these SQL/OLAP methods to CubETL components.
    # - Normalize/formalize column/name/id/schema/database usage.
    # - (Optionally, at the end, export to cubes; should theoretically be able
    #   to create olap-2-star-schema mappings, then create tables and load.)

    # Load datetime library definitions (provides "cubetl.datetime.date").
    ctx.include(ctx.library_path + "/datetime.py")

    facts = {}        # Generated facts, indexed by SQL table name.
    olapmappers = []  # One OlapMapper per generated fact.

    logger.info("Generating CubETL Olap schema from SQL schema.")

    sqltables = ctx.find(type=cubetl.sql.sql.SQLTable)
    for sqltable in sqltables:

        # Per-table forced OLAP handling, if any (see docstring options).
        olap_type_table = _match_config(
            ctx.props, 'sql2olap.table.%s.type' % sqltable.name, None)

        logger.info("Fact: %s" % sqltable.name)

        if olap_type_table == 'ignore':
            logger.info("SQL2OLAP ignoring SQL table: %s", sqltable)
            continue

        factmappings = []
        factattributes = []

        key_count = 0
        for dbcol in sqltable.columns:

            # Per-column forced OLAP type, if any (see docstring options).
            olap_type = _match_config(ctx.props, 'sql2olap.%s.type' % dbcol.urn, None)
            if olap_type:
                logger.info("Column: %s (forced type: %s)" % (dbcol, olap_type))
            else:
                logger.info("Column: %s" % (dbcol))

            if olap_type == 'ignore':
                logger.info("SQL2OLAP ignoring SQL column: %s", dbcol)
                continue

            # Primary key columns become the fact Key.
            if dbcol.pk:
                key_urn = "%s.fact.%s.key.%s" % (prefix, sqltable.name, dbcol.name)
                key = ctx.add(
                    key_urn,
                    Key(name=dbcol.name, type=dbcol.type, label=dbcol.label))
                factattributes.append(key)
                factmapping = OlapMapping(path=[key], sqlcolumn=dbcol)
                factmappings.append(factmapping)
                key_count += 1

            # Foreign key columns become dimension references to the related fact.
            if isinstance(dbcol, cubetl.sql.sql.SQLColumnFK):

                related_fact_name = dbcol.fk_sqlcolumn.sqltable.name
                if related_fact_name == sqltable.name:
                    # Reference to self.
                    # TODO: This does not account for circular dependencies across other entities
                    logger.warning("Ignoring foreign key reference to self: %s", dbcol.name)
                    continue

                related_fact = facts.get(related_fact_name, None)
                if related_fact is None:
                    # The referenced table was ignored or not (yet) processed.
                    logger.warning(
                        "Ignoring foreign key reference from %s.%s to not available entity: %s",
                        dbcol.sqltable.name, dbcol.name, related_fact_name)
                    continue

                # Create dimension attribute.
                dimension_attribute = olap.DimensionAttribute(
                    related_fact, name=dbcol.name, label=dbcol.label)
                factattributes.append(dimension_attribute)

                # Create a mapping.
                factdimensionmapping = OlapMapping(
                    path=[dimension_attribute], sqlcolumn=dbcol)
                factmappings.append(factdimensionmapping)

            if not dbcol.pk and not isinstance(dbcol, cubetl.sql.sql.SQLColumnFK) and (
                    olap_type == 'dimension' or
                    (olap_type is None and dbcol.type == "String")):
                # Embedded dimension (single column, string or integer, treated as a dimension).
                dimension_attribute = olap.Attribute(
                    name=dbcol.name, type=dbcol.type, label=dbcol.label)
                dimension = olap.Dimension(
                    name=dbcol.name, label=dbcol.label,
                    attributes=[dimension_attribute])
                factattributes.append(
                    DimensionAttribute(dimension, dimension.name, dimension.label))
                # This dimension is mapped in the parent table.
                factmapping = OlapMapping(
                    path=[dimension, dimension_attribute], sqlcolumn=dbcol)
                factmappings.append(factmapping)

            if not dbcol.pk and not isinstance(dbcol, cubetl.sql.sql.SQLColumnFK) and (
                    olap_type == 'attribute'):
                # Attribute (detail).
                attribute = Attribute(name=dbcol.name, type=dbcol.type, label=dbcol.label)
                factattributes.append(attribute)
                factmapping = OlapMapping(path=[attribute], sqlcolumn=dbcol)
                factmappings.append(factmapping)

            if not dbcol.pk and not isinstance(dbcol, cubetl.sql.sql.SQLColumnFK) and (
                    olap_type == 'measure' or
                    (olap_type is None and dbcol.type in ("Float", "Integer"))):
                # Measure.
                measure = Measure(name=dbcol.name, type=dbcol.type, label=dbcol.label)
                factattributes.append(measure)
                factmapping = OlapMapping(path=[measure], sqlcolumn=dbcol)
                factmappings.append(factmapping)

            # FIX: was `dbcol.type in ("DateTime")` — `("DateTime")` is a plain
            # string, so `in` performed an accidental substring test (matching
            # e.g. "Date", "Time" or ""). Tuple membership was intended, as in
            # the Float/Integer check above.
            elif dbcol.type == "DateTime":
                # Date dimension: map this column onto the shared datetime
                # dimension loaded from the datetime library.
                datedimension = ctx.get("cubetl.datetime.date")

                # Create dimension attribute.
                dimension_attribute = olap.DimensionAttribute(
                    datedimension, name=dbcol.name, label=dbcol.label)
                factattributes.append(dimension_attribute)

                # TODO: This shall be common.
                # Map year/month/day/week extractions of the column to the
                # corresponding datetime dimension attributes (quarter was
                # deliberately left out in the original implementation).
                for attr_name, extract_function in (
                        ('year', OlapMapping.FUNCTION_YEAR),
                        ('month', OlapMapping.FUNCTION_MONTH),
                        ('day', OlapMapping.FUNCTION_DAY),
                        ('week', OlapMapping.FUNCTION_WEEK)):
                    mapping = OlapMapping(
                        path=[dimension_attribute,
                              dimension_attribute.dimension.attribute(attr_name)],
                        sqlcolumn=dbcol,
                        function=extract_function)
                    factmappings.append(mapping)

        # Ignore table if more than one primary key was found.
        if key_count > 1:
            logger.warning(
                "Multiple primary key found in table %s (not supported, ignoring table)",
                sqltable.name)
            continue

        # Ignore table if it contains no primary key.
        if key_count == 0:
            logger.warning(
                "No primary key found in table %s (not supported, ignoring table)",
                sqltable.name)
            continue

        # Define fact.
        fact_urn = "%s.fact.%s" % (prefix, sqltable.name)
        fact = ctx.add(
            fact_urn,
            olap.Fact(name=sqltable.name, label=sqltable.label,
                      attributes=factattributes))
        facts[fact.name] = fact

        # Create an olapmapper for this fact.
        # TODO: review whether this is necessary or we could use a single mapper.
        olapmapper = olap.OlapMapper()
        mapper = olap.sql.TableMapper(entity=fact, sqltable=sqltable,
                                      mappings=factmappings)
        olapmapper.mappers.append(mapper)
        olapmappers.append(olapmapper)

    # Register a single combined mapper that includes all per-fact mappers.
    olapmapper = olap.OlapMapper()
    olapmapper.include = list(olapmappers)
    olapmapper_urn = "%s.olapmapper" % (prefix)
    ctx.add(olapmapper_urn, olapmapper)

    return ctx
def sql2cubes(db_url, model_path=None, tables=None, dimensions=None, debug=False):
    """
    Reflects an SQL database and exports a Cubes (v1.0) OLAP model for it.

    Walks every table in the database at `db_url`, building a cubetl Fact per
    table: string columns become embedded dimensions, numeric columns become
    measures, foreign keys become alias dimensions over the related fact, and
    DATETIME columns become aliases of the shared datetime dimension.

    Args:
        db_url: SQLAlchemy database URL to reflect.
        model_path: File to write the generated Cubes JSON model to; when
            None, a temporary file is created instead.
        tables: Currently unused (kept for interface compatibility).
        dimensions: Optional list of column names to force as dimensions.
        debug: Passed to the cubetl Bootstrap initialization.

    Returns:
        The path of the written model file.

    Raises:
        Exception: On a primary key of unknown type, or a column with
            multiple foreign keys.
    """
    exclude_columns = ['key']
    force_dimensions = dimensions if dimensions else []

    engine = create_engine(db_url)
    engine_connection = engine.connect()  # NOTE(review): opened but never used/closed — TODO confirm needed
    metadata = sqlalchemy.MetaData()
    metadata.reflect(engine)

    connection = sql.Connection()
    connection.id = "cubesutils.connection"
    connection.url = engine.url

    # Create Cubetl context.
    cubesbootstrap = Bootstrap()
    ctx = cubesbootstrap.init(debug=debug)
    ctx.debug = True

    # Load yaml library definitions that are dependencies.
    cubetlconfig.load_config(
        ctx, os.path.dirname(__file__) + "/cubetl-datetime.yaml")

    olapmappers = {}  # Indexed by table name
    factdimensions = {}  # Indexed by table_name
    facts = {}  # Indexed by table name

    def coltype(dbcol):
        # Map reflected SQL column types to cubetl attribute types.
        if str(dbcol.type) in ("FLOAT", "REAL", "DECIMAL"):
            return "Float"
        elif str(dbcol.type) in ("INTEGER", "BIGINT"):
            return "Integer"
        elif str(dbcol.type) in ("BOOLEAN", "TEXT") or str(
                dbcol.type).startswith("VARCHAR"):
            return "String"
        return None

    for dbtable in metadata.sorted_tables:

        # Skip SQLite internal tables.
        if dbtable.name.startswith('sqlite_'):
            continue

        print("Table: %s" % dbtable.name)

        tablename = slugify.slugify(dbtable.name, separator="_")

        # Define fact.
        fact = olap.Fact()
        fact.id = "cubesutils.%s.fact" % (tablename)
        fact.name = slugify.slugify(dbtable.name, separator="_")
        fact.label = dbtable.name
        fact.dimensions = []
        fact.measures = []
        fact.attributes = []
        facts[dbtable.name] = fact

        olapmapper = olap.OlapMapper()
        olapmapper.id = "cubesutils.%s.olapmapper" % (tablename)
        olapmapper.mappers = []
        olapmapper.include = []

        factmappings = []

        for dbcol in dbtable.columns:

            if dbcol.name in exclude_columns:
                continue

            print(" Column: %s [type=%s, null=%s, pk=%s, fk=%s]" %
                  (dbcol.name, dbcol.type, dbcol.nullable, dbcol.primary_key,
                   dbcol.foreign_keys))

            if dbcol.primary_key:
                # Primary keys become the fact mapping key (Integer or String only).
                if (str(dbcol.type) == "INTEGER"):
                    factmappings.append({
                        'name': slugify.slugify(dbcol.name, separator="_"),
                        'pk': True,
                        'type': 'Integer'
                    })
                elif str(dbcol.type) == "TEXT" or str(
                        dbcol.type).startswith("VARCHAR"):
                    factmappings.append({
                        'name': slugify.slugify(dbcol.name, separator="_"),
                        'pk': True,
                        'type': 'String'
                    })
                else:
                    raise Exception(
                        "Unknown column type (%s) for primary key column: %s" %
                        (dbcol.type, dbcol.name))

            elif dbcol.foreign_keys and len(dbcol.foreign_keys) > 0:
                if len(dbcol.foreign_keys) > 1:
                    raise Exception(
                        "Multiple foreign keys found for column: %s" % (dbcol.name))

                related_fact = list(dbcol.foreign_keys)[0].column.table.name
                if related_fact == dbtable.name:
                    # Reference to self.
                    # TODO: This does not account for circular dependencies across other entities
                    continue

                # Reuse (or create) the FactDimension for the referenced table.
                # Relies on metadata.sorted_tables yielding referenced tables first.
                factdimension = None
                if related_fact in factdimensions:
                    factdimension = factdimensions[related_fact]
                else:
                    factdimension = olap.FactDimension()
                    factdimension.id = "cubesutils.%s.dim.%s" % (
                        tablename, slugify.slugify(related_fact, separator="_"))
                    factdimension.name = slugify.slugify(related_fact, separator="_")
                    factdimension.label = related_fact
                    factdimension.fact = facts[related_fact]
                    cubetl.container.add_component(factdimension)
                    factdimensions[related_fact] = factdimension

                # Create an alias, seen from this foreign key column's point of view.
                aliasdimension = olap.AliasDimension()
                aliasdimension.dimension = factdimension
                aliasdimension.id = "cubesutils.%s.dim.%s.%s" % (
                    tablename,
                    slugify.slugify(related_fact, separator="_"),
                    slugify.slugify(dbcol.name, separator="_"))
                aliasdimension.name = tablename + "_" + related_fact + "_" + slugify.slugify(
                    dbcol.name, separator="_").replace("_id", "")
                aliasdimension.label = tablename + " " + related_fact + " " + slugify.slugify(
                    dbcol.name, separator="_").replace("_id", "")
                cubetl.container.add_component(aliasdimension)

                fact.dimensions.append(aliasdimension)

                mapper = olap.sql.FactDimensionMapper()
                mapper.entity = aliasdimension
                mapper.mappings = [
                    {
                        'name': tablename + "_" + related_fact + "_" + slugify.slugify(
                            dbcol.name, separator="_").replace("_id", ""),
                        'column': dbcol.name,
                        'pk': True
                    }
                ]
                olapmapper.include.append(olapmappers[related_fact])
                olapmapper.mappers.append(mapper)

            elif (dbcol.name in force_dimensions) or coltype(dbcol) == "String":
                # Create dimension (embedded, single string column).
                dimension = olap.Dimension()
                dimension.id = "cubesutils.%s.dim.%s" % (
                    tablename, slugify.slugify(dbcol.name, separator="_"))
                dimension.name = slugify.slugify(
                    dbtable.name, separator="_") + "_" + slugify.slugify(
                        dbcol.name, separator="_")
                dimension.label = dbcol.name
                dimension.attributes = [{
                    "pk": True,
                    "name": slugify.slugify(dbtable.name, separator="_") + "_" +
                            slugify.slugify(dbcol.name, separator="_"),
                    "type": coltype(dbcol)
                }]
                cubetl.container.add_component(dimension)

                fact.dimensions.append(dimension)

                mapper = olap.sql.EmbeddedDimensionMapper()
                mapper.entity = dimension
                mapper.mappings = [{
                    'name': slugify.slugify(dbtable.name, separator="_") + "_" +
                            slugify.slugify(dbcol.name, separator="_"),
                    'column': slugify.slugify(dbcol.name, separator="_")
                }]
                olapmapper.mappers.append(mapper)

            elif str(dbcol.type) in ("FLOAT", "REAL", "DECIMAL", "INTEGER"):
                measure = {
                    "name": dbcol.name,
                    "label": dbcol.name,
                    "type": "Integer" if str(dbcol.type) in ["INTEGER"] else "Float"
                }
                fact.measures.append(measure)

                # Also add dimension if integer, but not too many.
                # FIX: was `in ("INTEGER")` — accidental substring test on a
                # plain string; equality was intended.
                if str(dbcol.type) == "INTEGER":
                    # TODO
                    pass

            # FIX: was `in ("DATETIME")` — `("DATETIME")` is a plain string, so
            # `in` performed a substring test; equality was intended.
            elif str(dbcol.type) == "DATETIME":
                factdimension = cubetl.container.get_component_by_id(
                    "cubetl.datetime.date")

                # Create an alias to a datetime dimension.
                aliasdimension = olap.AliasDimension()
                aliasdimension.dimension = factdimension
                aliasdimension.id = "cubesutils.%s.dim.%s.%s" % (
                    slugify.slugify(dbtable.name, separator="_"),
                    "datetime",
                    slugify.slugify(dbcol.name, separator="_"))
                aliasdimension.name = slugify.slugify(
                    dbtable.name, separator="_") + "_" + slugify.slugify(
                        dbcol.name, separator="_").replace("_id", "")
                aliasdimension.label = slugify.slugify(
                    dbtable.name, separator="_") + " " + slugify.slugify(
                        dbcol.name, separator="_").replace("_id", "")
                cubetl.container.add_component(aliasdimension)

                fact.dimensions.append(aliasdimension)

                mapper = olap.sql.EmbeddedDimensionMapper()
                mapper.entity = aliasdimension
                mapper.mappings = [{
                    'name': 'year',
                    'column': dbcol.name,
                    'extract': 'year'
                }, {
                    'name': 'quarter',
                    'column': dbcol.name,
                    'extract': 'quarter'
                }, {
                    'name': 'month',
                    'column': dbcol.name,
                    'extract': 'month'
                }, {
                    'name': 'week',
                    'column': dbcol.name,
                    'extract': 'week'
                }, {
                    'name': 'day',
                    'column': dbcol.name,
                    'extract': 'day'
                }]
                olapmapper.mappers.append(mapper)

            else:
                print(" Cannot map column '%s' (type: %s)" %
                      (dbcol.name, dbcol.type))

        # Fact mapper for the table itself; fall back to a synthetic index
        # key when no primary key mapping was found.
        mapper = olap.sql.FactMapper()
        mapper.entity = fact
        mapper.table = dbtable.name
        mapper.connection = connection
        if len(factmappings) > 0:
            mapper.mappings = factmappings
        else:
            mapper.mappings = [{
                'name': 'index',
                'pk': True,
                'type': 'Integer'
            }]
        olapmapper.mappers.append(mapper)

        cubetl.container.add_component(fact)

        olapmappers[dbtable.name] = olapmapper

    # Export process.
    modelwriter = cubes.Cubes10ModelWriter()
    modelwriter.id = "cubesutils.export-cubes"
    modelwriter.olapmapper = olap.OlapMapper()
    modelwriter.olapmapper.include = [i for i in olapmappers.values()]
    cubetl.container.add_component(modelwriter)

    # Launch process.
    ctx.start_node = "cubesutils.export-cubes"
    result = cubesbootstrap.run(ctx)
    model_json = result["cubesmodel_json"]

    # Write model to the requested path, or to a temporary file.
    if model_path:
        with open(model_path, "w") as tmpfile:
            tmpfile.write(model_json)
    else:
        (tmpfile, model_path) = tempfile.mkstemp(suffix='.json',
                                                 prefix='cubesext-model-')
        os.write(tmpfile, model_json.encode("utf-8"))
        os.close(tmpfile)

    return (model_path)