def root_pipeline(): import app.pipelines.initialize_db import app.pipelines.load_data.load_ecommerce_data import app.pipelines.load_data.load_marketing_data import app.pipelines.e_commerce import app.pipelines.marketing import app.pipelines.generate_artifacts import app.pipelines.update_frontends import app.pipelines.consistency_checks import app.pipelines.update_frontends pipeline = Pipeline( id='mara_example_project_1', description='An example pipeline that integrates e-commerce and marketing funnel data') pipeline.add_initial(app.pipelines.initialize_db.pipeline) pipeline.add(app.pipelines.load_data.load_ecommerce_data.pipeline) pipeline.add(app.pipelines.load_data.load_marketing_data.pipeline) pipeline.add(app.pipelines.e_commerce.pipeline, upstreams=['load_ecommerce_data']) pipeline.add(app.pipelines.marketing.pipeline, upstreams=['load_marketing_data', 'e_commerce']) pipeline.add(app.pipelines.generate_artifacts.pipeline, upstreams=['marketing']) pipeline.add(app.pipelines.update_frontends.pipeline, upstreams=['generate_artifacts']) pipeline.add(app.pipelines.consistency_checks.pipeline, upstreams=['generate_artifacts']) return pipeline
def utils_pipeline(with_hll=False, with_cstore_fdw=False): pipeline = Pipeline( id="initialize_utils", description= "Creates an utils schema with a number of functions around the ETL best practices of Project A", base_path=pathlib.Path(__file__).parent) pipeline.add_initial( Task(id="create_utils_schema", description="Re-creates the utils schema", commands=[ ExecuteSQL( sql_statement= "DROP SCHEMA IF EXISTS util CASCADE; CREATE SCHEMA util;") ])) pipeline.add( Task(id='chunking', description='Runs file chunking.sql', commands=[ ExecuteSQL(sql_file_name='chunking.sql', echo_queries=False, replace={ 'number_of_chunks': lambda: config.number_of_chunks() }) ])) def add_task_for_file(file_name_without_extension): pipeline.add( Task(id=file_name_without_extension, description=f'Runs file "{file_name_without_extension}.sql"', commands=[ ExecuteSQL(sql_file_name=file_name_without_extension + '.sql', echo_queries=False) ])) for file_name_without_extension in [ 'consistency_checks', 'data_sets', 'partitioning', 'indexes_and_constraints', 'schema_switching', 'enums' ]: add_task_for_file(file_name_without_extension) if with_hll: add_task_for_file('hll') if with_cstore_fdw: add_task_for_file('cstore_fdw') return pipeline
from mara_pipelines.parallel_tasks.files import ParallelReadFile, ReadMode from mara_pipelines.parallel_tasks.sql import ParallelExecuteSQL from mara_pipelines.pipelines import Pipeline, Task from etl_tools.create_attributes_table import CreateAttributesTable pipeline = Pipeline( id="pypi", description= "Builds a PyPI downloads cube using the public PyPi BigQuery data set", base_path=pathlib.Path(__file__).parent, labels={"Schema": "pypi_dim"}) pipeline.add_initial( Task(id="initialize_schemas", description="Recreates the schemas of the pipeline", commands=[ ExecuteSQL(sql_file_name='recreate_schemas.sql'), ExecuteSQL(sql_file_name="create_data_schema.sql", file_dependencies=["create_data_schema.sql"]) ])) read_download_file_dependencies = [ "create_download_counts_data_table.sql", "create_data_schema.sql" ] pipeline.add( ParallelReadFile( id="read_download_counts", description="Loads PyPI downloads from pre_downloaded csv files", file_pattern="*/*/*/pypi/downloads-v2.csv.gz", read_mode=ReadMode.ONLY_NEW, compression=Compression.GZIP,
from mara_pipelines.config import default_db_alias pipeline = Pipeline( id="facebook", description="Processes the data downloaded from the FacebookAds API", base_path=pathlib.Path(__file__).parent, labels={"Schema": "fb_dim"}) pipeline.add_initial( Task( id="initialize_schemas", description="Recreates the tmp and dim_next schemas", commands=[ ExecuteSQL( sql_statement= "DROP SCHEMA IF EXISTS fb_dim_next CASCADE; CREATE SCHEMA fb_dim_next;" ), ExecuteSQL(sql_file_name="create_data_schema.sql", echo_queries=False, file_dependencies=["create_data_schema.sql"]), ExecuteSQL(sql_file_name="recreate_schemas.sql", echo_queries=False) ])) pipeline.add( Task(id="read_campaign_structure", description="Loads the adwords campaign structure", commands=[ ExecuteSQL( sql_file_name='create_campaign_structure_data_table.sql', echo_queries=False),
import etl_tools.utils from mara_pipelines.commands.sql import ExecuteSQL from mara_pipelines.parallel_tasks.sql import ParallelExecuteSQL from mara_pipelines.pipelines import Pipeline, Task from etl_tools.create_attributes_table import CreateAttributesTable pipeline = Pipeline( id="python_projects", description= "Combines PyPI downloads and github activities to a Python project activity cube", base_path=pathlib.Path(__file__).parent, labels={"Schema": "pp_dim"}) pipeline.add_initial( Task(id="initialize_schemas", description="Recreates the schemas of the pipeline", commands=[ExecuteSQL(sql_file_name='recreate_schemas.sql')])) pipeline.add( Task( id="extract_python_repo_activity", description= 'Extracts activity metrics for github repos that have a corresponding pypi package (by name)', commands=[ ExecuteSQL(sql_file_name="extract_python_repo_activity.sql") ])) pipeline.add(ParallelExecuteSQL( id="transform_python_project_activity", description= "Aggregates downloads at project level and combines them with github activity metrics",
from mara_pipelines.pipelines import Pipeline, Task from mara_pipelines import config pipeline = Pipeline( id="load_marketing_data", description= "Jobs related with loading marketing leads data from the backend database", max_number_of_parallel_tasks=5, base_path=pathlib.Path(__file__).parent, labels={"Schema": "m_data"}) pipeline.add_initial( Task(id="initialize_schemas", description="Recreates the marketing data schema", commands=[ ExecuteSQL(sql_file_name='../recreate_marketing_data_schema.sql', file_dependencies=[ pathlib.Path(__file__).parent.parent / 'recreate_marketing_data_schema.sql' ]) ])) tables = ['closed_deal', 'marketing_qualified_lead'] for table in tables: pipeline.add( Task(id=f"load_{table}", description=f'Loads the {table}s from the backend database', commands=[ ExecuteSQL(sql_file_name=f'{table}/create_{table}_table.sql'), Copy(sql_statement=f""" SELECT *
description= "Creates data set tables for Metabase (completely flattened, without composed metrics, without personal data)", base_path=pathlib.Path(__file__).parent, labels={"Schema": 'metabase'}) pipeline.add_initial( Task(id="initialize_schema", description="Recreates the metabase schema", commands=[ ExecuteSQL(sql_statement=f""" DROP SCHEMA IF EXISTS util CASCADE; CREATE SCHEMA util; DROP SCHEMA IF EXISTS metabase_next CASCADE; CREATE SCHEMA metabase_next; """, echo_queries=False, db_alias='metabase-data-write'), ExecuteSQL(sql_file_name=str( initialize_db.pipeline.nodes['initialize_utils'].base_path() / 'schema_switching.sql'), db_alias='metabase-data-write'), ExecuteSQL(sql_file_name=str( initialize_db.pipeline.nodes['initialize_utils'].base_path() / 'cstore_fdw.sql'), db_alias='metabase-data-write') ])) for data_set in data_sets(): def query(data_set): return data_set_sql_query(data_set=data_set,
from mara_pipelines.commands.sql import ExecuteSQL from mara_pipelines.pipelines import Pipeline, Task from mara_schema.sql_generation import data_set_sql_query, database_identifier from mara_schema.config import data_sets pipeline = Pipeline( id="flatten_data_sets_for_mondrian", description="Creates data set tables for Mondrian (star schema, without composed metrics, without personal data)", base_path=pathlib.Path(__file__).parent, labels={"Schema": 'mondrian'}) pipeline.add_initial( Task( id="initialize_schema", description="Recreates the mondrian schema", commands=[ ExecuteSQL(sql_statement=f""" DROP SCHEMA IF EXISTS mondrian_next CASCADE; CREATE SCHEMA mondrian_next; """, echo_queries=False)])) for data_set in data_sets(): pipeline.add( Task(id=f"flatten_{data_set.id()}_for_mondrian", description=f'Flattens the "{data_set.name}" data set for best use in Mondrian', commands=[ ExecuteSQL(f""" CREATE TABLE mondrian_next.{database_identifier(data_set.name)} AS {data_set_sql_query(data_set=data_set, pre_computed_metrics=False, human_readable_columns=False, star_schema=True, personal_data=False, high_cardinality_attributes=False)}; """,
def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None: attributes_table_name = f'{self.source_schema_name}.{self.source_table_name}{self.attributes_table_suffix}' ddl = f''' DROP TABLE IF EXISTS {attributes_table_name}; CREATE TABLE {attributes_table_name} ( attribute TEXT NOT NULL, value TEXT NOT NULL, row_count BIGINT NOT NULL ) PARTITION BY LIST (attribute); ''' commands = [] with mara_db.postgresql.postgres_cursor_context( self.db_alias) as cursor: # type: psycopg2.extensions.cursor cursor.execute( f''' WITH enums AS ( SELECT DISTINCT typname, nspname FROM pg_type JOIN pg_enum ON pg_type.oid = pg_enum.enumtypid JOIN pg_namespace ON pg_type.typnamespace = pg_namespace.oid ) SELECT column_name FROM information_schema.columns LEFT JOIN enums ON udt_schema = enums.nspname AND udt_name = enums.typname WHERE table_schema = {'%s'} AND table_name = {'%s'} AND (data_type IN ('text', 'varchar') OR enums.typname IS NOT NULL); ''', (self.source_schema_name, self.source_table_name)) i = 0 for column_name, in cursor.fetchall(): i += 1 ddl += f""" CREATE TABLE {attributes_table_name}_{i} PARTITION OF {attributes_table_name} FOR VALUES IN ('{column_name}'); """ commands.append( ExecuteSQL(sql_statement=f''' INSERT INTO {attributes_table_name}_{i} SELECT '{column_name}', "{column_name}", count(*) FROM {self.source_schema_name}.{self.source_table_name} WHERE "{column_name}" IS NOT NULL GROUP BY "{column_name}" ORDER BY "{column_name}"; CREATE INDEX {self.source_table_name}_{self.attributes_table_suffix}_{i}__value ON {attributes_table_name}_{i} USING GIN (value gin_trgm_ops); ''', echo_queries=False)) sub_pipeline.add_initial( Task(id='create_table', description='Creates the attributes table', commands=[ExecuteSQL(sql_statement=ddl, echo_queries=False)])) chunk_size = math.ceil( len(commands) / (2 * mara_pipelines.config.max_number_of_parallel_tasks())) for n, chunk in enumerate(more_itertools.chunked(commands, chunk_size)): task = Task(id=str(n), description='Process a portion of the attributes') task.add_commands(chunk) sub_pipeline.add(task)