def euro_exchange_rates_pipeline(db_alias: str):
    pipeline = Pipeline(
        id="load_euro_exchange_rates",
        description="Loads daily Euro exchange rates since 1999 from the European central bank",
        base_path=pathlib.Path(__file__).parent)

    pipeline.add(
        Task(id="create_schema_and_table",
             description="Re-creates currency exchange rate schema",
             commands=[
                 ExecuteSQL(sql_file_name='create_schema_and_table.sql', echo_queries=False)
             ]))

    pipeline.add(
        Task(id='load_exchange_rate',
             description='Loads exchange rates from the European central bank',
             commands=[
                 ReadScriptOutput(file_name='load_exchange_rate.py',
                                  target_table='euro_fx.exchange_rate',
                                  db_alias=db_alias)
             ]),
        upstreams=['create_schema_and_table'])

    pipeline.add(
        Task(id="postprocess_exchange_rate",
             description="Adds values for missing days",
             commands=[
                 ExecuteSQL(sql_file_name='postprocess_exchange_rate.sql', echo_queries=False)
             ]),
        upstreams=['load_exchange_rate'])

    return pipeline
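# Hedged usage sketch: the factory above returns a Pipeline that can be nested into a
# parent pipeline. The parent pipeline object and the 'dwh' db alias below are
# assumptions for illustration, not part of the snippet above.
parent_pipeline.add(euro_exchange_rates_pipeline(db_alias='dwh'))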
def utils_pipeline(with_hll=False, with_cstore_fdw=False):
    pipeline = Pipeline(
        id="initialize_utils",
        description="Creates a utils schema with a number of functions around the ETL best practices of Project A",
        base_path=pathlib.Path(__file__).parent)

    pipeline.add_initial(
        Task(id="create_utils_schema",
             description="Re-creates the utils schema",
             commands=[
                 ExecuteSQL(sql_statement="DROP SCHEMA IF EXISTS util CASCADE; CREATE SCHEMA util;")
             ]))

    pipeline.add(
        Task(id='chunking',
             description='Runs file chunking.sql',
             commands=[
                 ExecuteSQL(sql_file_name='chunking.sql', echo_queries=False,
                            replace={'number_of_chunks': lambda: config.number_of_chunks()})
             ]))

    def add_task_for_file(file_name_without_extension):
        pipeline.add(
            Task(id=file_name_without_extension,
                 description=f'Runs file "{file_name_without_extension}.sql"',
                 commands=[
                     ExecuteSQL(sql_file_name=file_name_without_extension + '.sql', echo_queries=False)
                 ]))

    for file_name_without_extension in ['consistency_checks', 'data_sets', 'partitioning',
                                        'indexes_and_constraints', 'schema_switching', 'enums']:
        add_task_for_file(file_name_without_extension)

    if with_hll:
        add_task_for_file('hll')

    if with_cstore_fdw:
        add_task_for_file('cstore_fdw')

    return pipeline
def create_cstore_table_for_query(sql_select_statement, database_schema, table_name, db_alias):
    """
    Creates a cstore foreign table that can take the output of a select statement.
    This function is needed because PostgreSQL does not have 'CREATE FOREIGN TABLE AS ...'
    """
    import mara_db.postgresql

    with mara_db.postgresql.postgres_cursor_context('dwh') as cursor:
        # map PostgreSQL type oids to type names so that the cursor description can be resolved
        cursor.execute('SELECT oid, typname FROM pg_type;')
        db_types = {}
        for oid, type_name in cursor.fetchall():
            db_types[oid] = type_name

        # run the query with LIMIT 0 to obtain column names and types without fetching rows
        cursor.execute(sql_select_statement + ' LIMIT 0')
        column_specs = []
        for column in cursor.description:
            column_specs.append(f'"{column.name}" {db_types[column.type_code]}')

    ddl = f"""
DROP FOREIGN TABLE IF EXISTS "{database_schema}"."{table_name}";

CREATE FOREIGN TABLE "{database_schema}"."{table_name}" (
    """
    ddl += ',\n '.join(column_specs)
    ddl += "\n) SERVER cstore_server OPTIONS (compression 'pglz');"

    return ExecuteSQL(sql_statement=ddl, echo_queries=True, db_alias=db_alias).run()
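# A hypothetical usage sketch for create_cstore_table_for_query. The query, schema
# and table name below are made-up examples; only the function signature and the
# 'metabase-data-write' alias appear elsewhere in this document.
create_cstore_table_for_query(
    sql_select_statement='SELECT order_id, order_date, revenue FROM ec_dim.order_facts',
    database_schema='metabase_next',
    table_name='order_facts',
    db_alias='metabase-data-write')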
description="Create flattened data set tables for various front-ends", base_path=pathlib.Path(__file__).parent) from .metabase import pipeline as metabase_pipeline pipeline.add(metabase_pipeline) from .mara_data_explorer import pipeline as mara_data_explorer_pipeline pipeline.add(mara_data_explorer_pipeline) from .mondrian import pipeline as mondrian_pipeline pipeline.add(mondrian_pipeline) pipeline.add_final( Task(id='replace_schemas', description='Replaces the frontend schemas with their next versions', commands=[ ExecuteSQL(sql_file_name='switch_metabase_schema.sql', db_alias='metabase-data-write'), ExecuteSQL( sql_statement= f"SELECT util.replace_schema('data_sets', 'data_sets_next')", db_alias='dwh'), ExecuteSQL( sql_statement= f"SELECT util.replace_schema('mondrian', 'mondrian_next')", db_alias='dwh') ]))
from mara_pipelines.parallel_tasks.sql import ParallelExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task

from etl_tools.create_attributes_table import CreateAttributesTable

pipeline = Pipeline(
    id="github",
    description="Builds a Github activity cube using the public Github archive data set",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "gh_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[
             ExecuteSQL(sql_file_name='recreate_schemas.sql'),
             ExecuteSQL(sql_file_name="create_data_schema.sql",
                        file_dependencies=["create_data_schema.sql"])
         ]))

read_repo_activity_file_dependencies = ["create_repo_activity_data_table.sql", "create_data_schema.sql"]

pipeline.add(
    ParallelReadFile(
        id="read_repo_activity",
        description="Loads Github repo activities from pre-downloaded csv files",
        file_pattern="*/*/*/github/repo-activity-v1.csv.gz",
        read_mode=ReadMode.ONLY_NEW,
pipeline = Pipeline(
    id="load_marketing_data",
    description="Jobs related to loading marketing leads data from the backend database",
    max_number_of_parallel_tasks=5,
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "m_data"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the marketing data schema",
         commands=[
             ExecuteSQL(sql_file_name='../recreate_marketing_data_schema.sql',
                        file_dependencies=[pathlib.Path(__file__).parent.parent
                                           / 'recreate_marketing_data_schema.sql'])
         ]))

tables = ['closed_deal', 'marketing_qualified_lead']

for table in tables:
    pipeline.add(
        Task(id=f"load_{table}",
             description=f'Loads the {table}s from the backend database',
             commands=[
                 ExecuteSQL(sql_file_name=f'{table}/create_{table}_table.sql'),
                 Copy(sql_statement=f"""
SELECT *
FROM marketing.{table}s;
import pathlib

from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task

pipeline = Pipeline(
    id="marketing",
    description="Builds the Leads cube based on marketing and e-commerce data",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "m_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[ExecuteSQL(sql_file_name="recreate_schemas.sql")]))

pipeline.add(
    Task(id="preprocess_lead",
         description="Preprocess the marketing leads",
         commands=[ExecuteSQL(sql_file_name="preprocess_lead.sql")]))

pipeline.add(
    Task(id="transform_smaller_dimensions",
         description="Transform smaller marketing dimensions",
         commands=[ExecuteSQL(sql_file_name="transform_smaller_dimensions.sql")]),
    upstreams=["preprocess_lead"])

pipeline.add(
    Task(id="transform_lead",
         description="Creates the lead dim table",
         commands=[
             ExecuteSQL(sql_file_name="transform_lead.sql",
pipeline = Pipeline(
    id="flatten_data_sets_for_metabase",
    description="Creates data set tables for Metabase (completely flattened, without composed metrics, without personal data)",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": 'metabase'})

pipeline.add_initial(
    Task(id="initialize_schema",
         description="Recreates the metabase schema",
         commands=[
             ExecuteSQL(sql_statement="""
DROP SCHEMA IF EXISTS util CASCADE;
CREATE SCHEMA util;
DROP SCHEMA IF EXISTS metabase_next CASCADE;
CREATE SCHEMA metabase_next;
""",
                        echo_queries=False,
                        db_alias='metabase-data-write'),
             ExecuteSQL(sql_file_name=str(initialize_db.pipeline.nodes['initialize_utils'].base_path()
                                          / 'schema_switching.sql'),
                        db_alias='metabase-data-write'),
             ExecuteSQL(sql_file_name=str(initialize_db.pipeline.nodes['initialize_utils'].base_path()
                                          / 'cstore_fdw.sql'),
                        db_alias='metabase-data-write')
         ]))

for data_set in data_sets():
from mara_pipelines.parallel_tasks.sql import ParallelExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task

from etl_tools.create_attributes_table import CreateAttributesTable

pipeline = Pipeline(
    id="pypi",
    description="Builds a PyPI downloads cube using the public PyPI BigQuery data set",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "pypi_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[
             ExecuteSQL(sql_file_name='recreate_schemas.sql'),
             ExecuteSQL(sql_file_name="create_data_schema.sql",
                        file_dependencies=["create_data_schema.sql"])
         ]))

read_download_file_dependencies = ["create_download_counts_data_table.sql", "create_data_schema.sql"]

pipeline.add(
    ParallelReadFile(
        id="read_download_counts",
        description="Loads PyPI downloads from pre-downloaded csv files",
        file_pattern="*/*/*/pypi/downloads-v2.csv.gz",
        read_mode=ReadMode.ONLY_NEW,
        compression=Compression.GZIP,
from mara_schema.sql_generation import data_set_sql_query, database_identifier
from mara_schema.config import data_sets

pipeline = Pipeline(
    id="flatten_data_sets_for_mondrian",
    description="Creates data set tables for Mondrian (star schema, without composed metrics, without personal data)",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": 'mondrian'})

pipeline.add_initial(
    Task(id="initialize_schema",
         description="Recreates the mondrian schema",
         commands=[
             ExecuteSQL(sql_statement="""
DROP SCHEMA IF EXISTS mondrian_next CASCADE;
CREATE SCHEMA mondrian_next;
""",
                        echo_queries=False)]))

for data_set in data_sets():
    pipeline.add(
        Task(id=f"flatten_{data_set.id()}_for_mondrian",
             description=f'Flattens the "{data_set.name}" data set for best use in Mondrian',
             commands=[
                 ExecuteSQL(f"""
CREATE TABLE mondrian_next.{database_identifier(data_set.name)} AS
{data_set_sql_query(data_set=data_set, pre_computed_metrics=False, human_readable_columns=False,
                    star_schema=True, personal_data=False, high_cardinality_attributes=False)};
""",
                            echo_queries=False)]))
import pathlib

from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task

pipeline = Pipeline(
    id="e_commerce",
    description="Builds the e-commerce cubes and datasets",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "ec_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[
             ExecuteSQL(sql_file_name="recreate_schemas.sql")
         ]))

pipeline.add(
    Task(id="preprocess_customer",
         description="Preprocess customers and consolidate the data to a single record per customer with "
                     "a unique ID",
         commands=[
             ExecuteSQL(sql_file_name="preprocess_customer.sql")
         ]))

pipeline.add(
    Task(id="preprocess_order",
         description="Preprocess orders to get correct unique customer ID",
         commands=[
             ExecuteSQL(sql_file_name="preprocess_order.sql")
def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None:
    source_db = mara_db.dbs.db(self.source_db_alias)
    target_db = mara_db.dbs.db(self.target_db_alias)
    assert (isinstance(source_db, mara_db.dbs.PostgreSQLDB))
    assert (isinstance(target_db, mara_db.dbs.PostgreSQLDB))

    with mara_db.postgresql.postgres_cursor_context(self.source_db_alias) as cursor:
        pg_version = cursor.connection.server_version

    ddl_task = Task(
        id='create_tables_and_functions',
        description='Re-creates the schema, tables structure and functions on the target db',
        commands=[
            # schema and table structure
            bash.RunBash(
                command="(echo 'DROP SCHEMA IF EXISTS " + self.schema_name + " CASCADE;';\\\n"
                        + " pg_dump --username=" + source_db.user  # the user name was redacted ("******") in the source; source_db.user is assumed here
                        + " --host=" + source_db.host
                        + " --schema=" + self.schema_name
                        + " --section=pre-data --no-owner --no-privileges "
                        + source_db.database + ") \\\n"
                        + " | " + mara_db.shell.query_command(self.target_db_alias, echo_queries=False)
                        + ' --quiet'),
            # function definitions
            bash.RunBash(command=f'''echo "
SELECT CONCAT(pg_get_functiondef(pg_proc.oid),';') AS def
FROM (SELECT oid, prokind, pronamespace
      FROM pg_proc p
      WHERE {"p.prokind in ('p','f')" if pg_version >= 110000 else "NOT p.proisagg"}) pg_proc, pg_namespace
WHERE pg_proc.pronamespace = pg_namespace.oid
      AND nspname = '{self.schema_name}'" \\\n'''
                                 + " | " + mara_db.shell.copy_to_stdout_command(self.source_db_alias) + ' \\\n'
                                 + " | " + mara_db.shell.query_command(self.target_db_alias, echo_queries=False))
        ])
    sub_pipeline.add(ddl_task)

    # copy content of tables
    number_of_chunks = self.max_number_of_parallel_tasks * 3
    table_copy_chunks = {i: [] for i in range(0, number_of_chunks)}
    current_size_per_table_copy_chunk = [0] * number_of_chunks
    table_types = {}

    with mara_db.postgresql.postgres_cursor_context(
            self.source_db_alias) as cursor:  # type: psycopg2.extensions.cursor
        cursor.execute("""
SELECT
    pg_class.relname AS table,
    relkind,
    CASE WHEN relkind = 'f'
        THEN cstore_table_size(nspname || '.' || relname) * 10 -- cstore tables with similar size take longer to copy
        ELSE pg_total_relation_size(pg_class.oid)
    END / 1000000.0 AS size
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
WHERE nspname = '""" + self.schema_name + """' AND relkind IN ('r', 'f') AND relhassubclass = 'f'
ORDER BY size DESC""")
        for table_name, type, size in cursor.fetchall():
            smallest_chunk_index = min(range(len(current_size_per_table_copy_chunk)),
                                       key=current_size_per_table_copy_chunk.__getitem__)
            current_size_per_table_copy_chunk[smallest_chunk_index] += size
            table_copy_chunks[smallest_chunk_index].append(table_name)
            table_types[table_name] = type

    copy_tasks = []
    for i, tables in table_copy_chunks.items():
        if tables:
            task = Task(
                id=f'copy_tables_{i}',
                description='Copies table content to the frontend db',
                commands=[
                    RunBash(command=f'echo {shlex.quote(f"COPY {self.schema_name}.{table_name} TO STDOUT")} \\\n'
                                    + ' | ' + mara_db.shell.copy_to_stdout_command(self.source_db_alias) + ' \\\n'
                                    + ' | ' + mara_db.shell.copy_from_stdin_command(
                                        self.target_db_alias,
                                        target_table=f'{self.schema_name}.{table_name}'))
                    for table_name in tables
                ])
            copy_tasks.append(task)
            sub_pipeline.add(task, upstreams=[ddl_task])

    # create indexes
    index_chunks = {i: [] for i in range(0, number_of_chunks)}
    current_size_per_index_chunk = [0] * number_of_chunks

    with mara_db.postgresql.postgres_cursor_context(self.source_db_alias) as cursor:
        cursor.execute("""
SELECT indexdef AS ddl, pg_total_relation_size(pg_class.oid) AS size
FROM pg_class
JOIN pg_namespace ON pg_namespace.oid = pg_class.relnamespace
JOIN pg_indexes ON pg_indexes.indexname = pg_class.relname AND pg_indexes.schemaname = nspname
WHERE nspname = '""" + self.schema_name + """' AND relkind = 'i'
ORDER BY size DESC;""")
        for ddl, size in cursor.fetchall():
            smallest_chunk_index = min(range(len(current_size_per_index_chunk)),
                                       key=current_size_per_index_chunk.__getitem__)
            current_size_per_index_chunk[smallest_chunk_index] += size
            index_chunks[smallest_chunk_index].append(ddl)

    for i, index_statements in index_chunks.items():
        if index_statements:
            index_task = Task(id=f'add_indexes_{i}',
                              description='Re-creates indexes on frontend db',
                              commands=[ExecuteSQL(sql_statement=statement, db_alias=self.target_db_alias)
                                        for statement in index_statements])
            sub_pipeline.add(index_task, upstreams=copy_tasks)
pipeline = Pipeline(
    id="load_ecommerce_data",
    description="Jobs related to loading e-commerce data from the backend database",
    max_number_of_parallel_tasks=5,
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "ec_data"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the e-commerce data schema",
         commands=[
             ExecuteSQL(sql_file_name='../recreate_ecommerce_data_schema.sql',
                        file_dependencies=[pathlib.Path(__file__).parent.parent
                                           / 'recreate_ecommerce_data_schema.sql'])
         ]))

tables = ['customer', 'order', 'order_item', 'product', 'product_category_name_translation', 'seller']

for table in tables:
    pipeline.add(
        Task(id=f"load_{table}",
             description=f'Loads the {table}s from the backend database',
             commands=[
                 ExecuteSQL(sql_file_name=f'{table}/create_{table}_table.sql'),
from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.parallel_tasks.sql import ParallelExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task

from etl_tools.create_attributes_table import CreateAttributesTable

pipeline = Pipeline(
    id="python_projects",
    description="Combines PyPI downloads and github activities into a Python project activity cube",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "pp_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[ExecuteSQL(sql_file_name='recreate_schemas.sql')]))

pipeline.add(
    Task(id="extract_python_repo_activity",
         description='Extracts activity metrics for github repos that have a corresponding pypi package (by name)',
         commands=[
             ExecuteSQL(sql_file_name="extract_python_repo_activity.sql")
         ]))

pipeline.add(
    ParallelExecuteSQL(
        id="transform_python_project_activity",
        description="Aggregates downloads at project level and combines them with github activity metrics",
        commands_before=[
from mara_pipelines.pipelines import Pipeline, Task
from mara_pipelines.config import default_db_alias

pipeline = Pipeline(
    id="facebook",
    description="Processes the data downloaded from the FacebookAds API",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "fb_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the tmp and dim_next schemas",
         commands=[
             ExecuteSQL(sql_statement="DROP SCHEMA IF EXISTS fb_dim_next CASCADE; CREATE SCHEMA fb_dim_next;"),
             ExecuteSQL(sql_file_name="create_data_schema.sql", echo_queries=False,
                        file_dependencies=["create_data_schema.sql"]),
             ExecuteSQL(sql_file_name="recreate_schemas.sql", echo_queries=False)
         ]))

pipeline.add(
    Task(id="read_campaign_structure",
         description="Loads the Facebook campaign structure",
         commands=[
             ExecuteSQL(sql_file_name='create_campaign_structure_data_table.sql', echo_queries=False),
import pathlib

from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task

from etl_tools import config

pipeline = Pipeline(id="create_time_dimensions",
                    description="Creates a day and a duration dimension table",
                    labels={"Schema": "time"},
                    base_path=pathlib.Path(__file__).parent)

pipeline.add(
    Task(id="create_tables",
         description="Re-creates the day and duration table and their schema",
         commands=[
             ExecuteSQL(sql_file_name='create_tables.sql', echo_queries=False,
                        file_dependencies=['create_tables.sql'])
         ]))

pipeline.add(
    Task(id="populate_time_dimensions",
         description="fills the time dimensions for a configured time range",
         commands=[
             ExecuteSQL(sql_statement=lambda: "SELECT time.populate_time_dimensions('"
                                              + config.first_date_in_time_dimensions().isoformat() + "'::DATE, '"
                                              + config.last_date_in_time_dimensions().isoformat() + "'::DATE);")
         ]),
    upstreams=['create_tables'])
import pathlib

from etl_tools import initialize_utils, create_time_dimensions
from mara_pipelines.pipelines import Pipeline, Task
from mara_pipelines.commands.sql import ExecuteSQL

pipeline = Pipeline(
    id="initialize_db",
    description="Adds a number of utility functions & creates time dimensions",
    base_path=pathlib.Path(__file__).parent)

pipeline.add(initialize_utils.utils_pipeline(with_cstore_fdw=True))

pipeline.add(create_time_dimensions.pipeline, upstreams=['initialize_utils'])

pipeline.add(
    Task(id='create_read_only_user',
         description='Creates a read-only user "dwh_read_only", useful for giving analysts direct access to DWH',
         commands=[
             ExecuteSQL(sql_file_name='create_read_only_user.sql')
         ]),
    upstreams=['initialize_utils'])
def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None:
    attributes_table_name = f'{self.source_schema_name}.{self.source_table_name}{self.attributes_table_suffix}'

    ddl = f'''
DROP TABLE IF EXISTS {attributes_table_name};

CREATE TABLE {attributes_table_name} (
    attribute TEXT NOT NULL,
    value     TEXT NOT NULL,
    row_count BIGINT NOT NULL
) PARTITION BY LIST (attribute);
'''

    commands = []

    with mara_db.postgresql.postgres_cursor_context(self.db_alias) as cursor:  # type: psycopg2.extensions.cursor
        # find all text, varchar and enum columns of the source table
        cursor.execute(f'''
WITH enums AS (
    SELECT DISTINCT typname, nspname
    FROM pg_type
    JOIN pg_enum ON pg_type.oid = pg_enum.enumtypid
    JOIN pg_namespace ON pg_type.typnamespace = pg_namespace.oid
)
SELECT column_name
FROM information_schema.columns
LEFT JOIN enums ON udt_schema = enums.nspname AND udt_name = enums.typname
WHERE table_schema = {'%s'}
      AND table_name = {'%s'}
      AND (data_type IN ('text', 'varchar') OR enums.typname IS NOT NULL);
''', (self.source_schema_name, self.source_table_name))

        i = 0
        for column_name, in cursor.fetchall():
            i += 1
            # one partition per attribute column
            ddl += f"""
CREATE TABLE {attributes_table_name}_{i} PARTITION OF {attributes_table_name} FOR VALUES IN ('{column_name}');
"""
            commands.append(
                ExecuteSQL(sql_statement=f'''
INSERT INTO {attributes_table_name}_{i}
SELECT '{column_name}', "{column_name}", count(*)
FROM {self.source_schema_name}.{self.source_table_name}
WHERE "{column_name}" IS NOT NULL
GROUP BY "{column_name}"
ORDER BY "{column_name}";

CREATE INDEX {self.source_table_name}_{self.attributes_table_suffix}_{i}__value
    ON {attributes_table_name}_{i} USING GIN (value gin_trgm_ops);
''', echo_queries=False))

    sub_pipeline.add_initial(
        Task(id='create_table',
             description='Creates the attributes table',
             commands=[ExecuteSQL(sql_statement=ddl, echo_queries=False)]))

    chunk_size = math.ceil(len(commands) / (2 * mara_pipelines.config.max_number_of_parallel_tasks()))
    for n, chunk in enumerate(more_itertools.chunked(commands, chunk_size)):
        task = Task(id=str(n), description='Process a portion of the attributes')
        task.add_commands(chunk)
        sub_pipeline.add(task)