Example #1
from mara_pipelines.pipelines import Pipeline


def root_pipeline():
    import app.pipelines.initialize_db
    import app.pipelines.load_data.load_ecommerce_data
    import app.pipelines.load_data.load_marketing_data
    import app.pipelines.e_commerce
    import app.pipelines.marketing
    import app.pipelines.generate_artifacts
    import app.pipelines.update_frontends
    import app.pipelines.consistency_checks

    pipeline = Pipeline(
        id='mara_example_project_1',
        description='An example pipeline that integrates e-commerce and marketing funnel data')

    pipeline.add_initial(app.pipelines.initialize_db.pipeline)
    pipeline.add(app.pipelines.load_data.load_ecommerce_data.pipeline)
    pipeline.add(app.pipelines.load_data.load_marketing_data.pipeline)
    pipeline.add(app.pipelines.e_commerce.pipeline, upstreams=['load_ecommerce_data'])
    pipeline.add(app.pipelines.marketing.pipeline,
                 upstreams=['load_marketing_data', 'e_commerce'])

    pipeline.add(app.pipelines.generate_artifacts.pipeline, upstreams=['marketing'])
    pipeline.add(app.pipelines.update_frontends.pipeline, upstreams=['generate_artifacts'])
    pipeline.add(app.pipelines.consistency_checks.pipeline, upstreams=['generate_artifacts'])

    return pipeline
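
By convention, a mara project makes this function the root of the whole ETL tree by overriding mara_pipelines.config.root_pipeline. A minimal sketch, assuming the mara_app monkey-patching mechanism and assuming that root_pipeline() lives in app.pipelines (both inferred from the imports above, not confirmed by this snippet):

import mara_pipelines.config
from mara_app import monkey_patch

import app.pipelines

# replace the framework's default root_pipeline() config function
# with this project's pipeline tree
monkey_patch.patch(mara_pipelines.config.root_pipeline)(app.pipelines.root_pipeline)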
Example #2
import pathlib

from etl_tools import config  # assumed location of number_of_chunks()
from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task


def utils_pipeline(with_hll=False, with_cstore_fdw=False):
    pipeline = Pipeline(
        id="initialize_utils",
        description=
        "Creates an utils schema with a number of functions around the ETL best practices of Project A",
        base_path=pathlib.Path(__file__).parent)

    pipeline.add_initial(
        Task(id="create_utils_schema",
             description="Re-creates the utils schema",
             commands=[
                 ExecuteSQL(
                     sql_statement=
                     "DROP SCHEMA IF EXISTS util CASCADE; CREATE SCHEMA util;")
             ]))

    pipeline.add(
        Task(id='chunking',
             description='Runs file chunking.sql',
             commands=[
                 ExecuteSQL(sql_file_name='chunking.sql',
                            echo_queries=False,
                            replace={
                                'number_of_chunks':
                                lambda: config.number_of_chunks()
                            })
             ]))

    def add_task_for_file(file_name_without_extension):
        pipeline.add(
            Task(id=file_name_without_extension,
                 description=f'Runs file "{file_name_without_extension}.sql"',
                 commands=[
                     ExecuteSQL(sql_file_name=file_name_without_extension +
                                '.sql',
                                echo_queries=False)
                 ]))

    for file_name_without_extension in [
            'consistency_checks', 'data_sets', 'partitioning',
            'indexes_and_constraints', 'schema_switching', 'enums'
    ]:
        add_task_for_file(file_name_without_extension)

    if with_hll:
        add_task_for_file('hll')

    if with_cstore_fdw:
        add_task_for_file('cstore_fdw')

    return pipeline
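
Because utils_pipeline() builds a fresh Pipeline on each call, the optional HLL and cstore_fdw tasks can be toggled per project. A short usage sketch, assuming the function is exposed as etl_tools.initialize_utils.utils_pipeline and nested into the initialize_db pipeline referenced in example #1 (both module paths are assumptions):

from etl_tools.initialize_utils import utils_pipeline
from mara_pipelines.pipelines import Pipeline

pipeline = Pipeline(
    id='initialize_db',
    description='Creates the database schemas and utility functions')

# nest the utils pipeline with both optional task groups enabled
pipeline.add(utils_pipeline(with_hll=True, with_cstore_fdw=True))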
Example #3
import pathlib

from mara_pipelines.commands.files import Compression
from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.parallel_tasks.files import ParallelReadFile, ReadMode
from mara_pipelines.parallel_tasks.sql import ParallelExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task
from etl_tools.create_attributes_table import CreateAttributesTable

pipeline = Pipeline(
    id="pypi",
    description=
    "Builds a PyPI downloads cube using the public PyPi BigQuery data set",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "pypi_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[
             ExecuteSQL(sql_file_name='recreate_schemas.sql'),
             ExecuteSQL(sql_file_name="create_data_schema.sql",
                        file_dependencies=["create_data_schema.sql"])
         ]))

read_download_file_dependencies = [
    "create_download_counts_data_table.sql", "create_data_schema.sql"
]

pipeline.add(
    ParallelReadFile(
        id="read_download_counts",
        description="Loads PyPI downloads from pre_downloaded csv files",
        file_pattern="*/*/*/pypi/downloads-v2.csv.gz",
        read_mode=ReadMode.ONLY_NEW,
        compression=Compression.GZIP,
Example #4
import pathlib

from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.config import default_db_alias
from mara_pipelines.pipelines import Pipeline, Task

pipeline = Pipeline(
    id="facebook",
    description="Processes the data downloaded from the FacebookAds API",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "fb_dim"})

pipeline.add_initial(
    Task(
        id="initialize_schemas",
        description="Recreates the tmp and dim_next schemas",
        commands=[
            ExecuteSQL(
                sql_statement=
                "DROP SCHEMA IF EXISTS fb_dim_next CASCADE; CREATE SCHEMA fb_dim_next;"
            ),
            ExecuteSQL(sql_file_name="create_data_schema.sql",
                       echo_queries=False,
                       file_dependencies=["create_data_schema.sql"]),
            ExecuteSQL(sql_file_name="recreate_schemas.sql",
                       echo_queries=False)
        ]))

pipeline.add(
    Task(id="read_campaign_structure",
         description="Loads the adwords campaign structure",
         commands=[
             ExecuteSQL(
                 sql_file_name='create_campaign_structure_data_table.sql',
                 echo_queries=False),
Example #5
import pathlib

import etl_tools.utils
from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.parallel_tasks.sql import ParallelExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task
from etl_tools.create_attributes_table import CreateAttributesTable

pipeline = Pipeline(
    id="python_projects",
    description=
    "Combines PyPI downloads and github activities to a Python project activity cube",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "pp_dim"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the schemas of the pipeline",
         commands=[ExecuteSQL(sql_file_name='recreate_schemas.sql')]))

pipeline.add(
    Task(
        id="extract_python_repo_activity",
        description=
        'Extracts activity metrics for GitHub repos that have a corresponding PyPI package (by name)',
        commands=[
            ExecuteSQL(sql_file_name="extract_python_repo_activity.sql")
        ]))

pipeline.add(ParallelExecuteSQL(
    id="transform_python_project_activity",
    description=
    "Aggregates downloads at project level and combines them with github activity metrics",
Example #6
import pathlib

from mara_pipelines import config
from mara_pipelines.commands.sql import Copy, ExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task

pipeline = Pipeline(
    id="load_marketing_data",
    description=
    "Jobs related with loading marketing leads data from the backend database",
    max_number_of_parallel_tasks=5,
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": "m_data"})

pipeline.add_initial(
    Task(id="initialize_schemas",
         description="Recreates the marketing data schema",
         commands=[
             ExecuteSQL(sql_file_name='../recreate_marketing_data_schema.sql',
                        file_dependencies=[
                            pathlib.Path(__file__).parent.parent /
                            'recreate_marketing_data_schema.sql'
                        ])
         ]))

tables = ['closed_deal', 'marketing_qualified_lead']

for table in tables:
    pipeline.add(
        Task(id=f"load_{table}",
             description=f'Loads the {table}s from the backend database',
             commands=[
                 ExecuteSQL(sql_file_name=f'{table}/create_{table}_table.sql'),
                 Copy(sql_statement=f"""
                 SELECT *
Example #7
import pathlib

from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task
from mara_schema.config import data_sets
from mara_schema.sql_generation import data_set_sql_query

from app.pipelines import initialize_db

pipeline = Pipeline(
    id="flatten_data_sets_for_metabase",
    description=
    "Creates data set tables for Metabase (completely flattened, without composed metrics, without personal data)",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": 'metabase'})

pipeline.add_initial(
    Task(id="initialize_schema",
         description="Recreates the metabase schema",
         commands=[
             ExecuteSQL(sql_statement=f"""
DROP SCHEMA IF EXISTS util CASCADE;
CREATE SCHEMA util;            

DROP SCHEMA IF EXISTS metabase_next CASCADE;
CREATE SCHEMA metabase_next;
""",
                        echo_queries=False,
                        db_alias='metabase-data-write'),
             ExecuteSQL(sql_file_name=str(
                 initialize_db.pipeline.nodes['initialize_utils'].base_path() /
                 'schema_switching.sql'),
                        db_alias='metabase-data-write'),
             ExecuteSQL(sql_file_name=str(
                 initialize_db.pipeline.nodes['initialize_utils'].base_path() /
                 'cstore_fdw.sql'),
                        db_alias='metabase-data-write')
         ]))

for data_set in data_sets():

    def query(data_set):
        return data_set_sql_query(data_set=data_set,
Example #8
import pathlib

from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.pipelines import Pipeline, Task
from mara_schema.sql_generation import data_set_sql_query, database_identifier
from mara_schema.config import data_sets

pipeline = Pipeline(
    id="flatten_data_sets_for_mondrian",
    description="Creates data set tables for Mondrian (star schema, without composed metrics, without personal data)",
    base_path=pathlib.Path(__file__).parent,
    labels={"Schema": 'mondrian'})

pipeline.add_initial(
    Task(
        id="initialize_schema",
        description="Recreates the mondrian schema",
        commands=[
            ExecuteSQL(sql_statement=f"""
DROP SCHEMA IF EXISTS mondrian_next CASCADE;
CREATE SCHEMA mondrian_next;
""", echo_queries=False)]))

for data_set in data_sets():
    pipeline.add(
        Task(id=f"flatten_{data_set.id()}_for_mondrian",
             description=f'Flattens the "{data_set.name}" data set for best use in Mondrian',
             commands=[
                 ExecuteSQL(f"""
CREATE TABLE mondrian_next.{database_identifier(data_set.name)} AS
{data_set_sql_query(data_set=data_set, pre_computed_metrics=False, human_readable_columns=False, star_schema=True,
                    personal_data=False, high_cardinality_attributes=False)};
""",
Example #9
import math

import more_itertools
import psycopg2.extensions

import mara_db.postgresql
import mara_pipelines.config
from mara_pipelines.commands.sql import ExecuteSQL
from mara_pipelines.pipelines import ParallelTask, Pipeline, Task


class CreateAttributesTable(ParallelTask):
    def add_parallel_tasks(self, sub_pipeline: Pipeline) -> None:
        attributes_table_name = f'{self.source_schema_name}.{self.source_table_name}{self.attributes_table_suffix}'

        ddl = f'''
DROP TABLE IF EXISTS {attributes_table_name};

CREATE TABLE {attributes_table_name} (
    attribute TEXT NOT NULL,
    value     TEXT NOT NULL,
    row_count BIGINT NOT NULL
) PARTITION BY LIST (attribute);
'''

        commands = []

        with mara_db.postgresql.postgres_cursor_context(
                self.db_alias) as cursor:  # type: psycopg2.extensions.cursor
            # find all text-like columns (plain text, varchar, or enum) of the source table
            cursor.execute(
                '''
WITH enums AS (
    SELECT DISTINCT
      typname,
      nspname
    FROM pg_type
      JOIN pg_enum ON pg_type.oid = pg_enum.enumtypid
      JOIN pg_namespace ON pg_type.typnamespace = pg_namespace.oid
  )
SELECT column_name
FROM information_schema.columns
  LEFT JOIN enums ON udt_schema = enums.nspname AND udt_name = enums.typname
  WHERE table_schema = %s
      AND table_name = %s
      AND (data_type IN ('text', 'varchar') OR enums.typname IS NOT NULL);
''', (self.source_schema_name, self.source_table_name))

            # one list partition plus one insert command per text column
            for i, (column_name,) in enumerate(cursor.fetchall(), start=1):
                ddl += f"""
CREATE TABLE {attributes_table_name}_{i} PARTITION OF {attributes_table_name} FOR VALUES IN ('{column_name}');
"""
                commands.append(
                    ExecuteSQL(sql_statement=f'''
INSERT INTO {attributes_table_name}_{i} 
SELECT '{column_name}', "{column_name}", count(*)
FROM {self.source_schema_name}.{self.source_table_name}
WHERE "{column_name}" IS NOT NULL
GROUP BY "{column_name}"
ORDER BY "{column_name}";

CREATE INDEX {self.source_table_name}_{self.attributes_table_suffix}_{i}__value 
   ON {attributes_table_name}_{i} USING GIN (value gin_trgm_ops);
''',
                               echo_queries=False))

        sub_pipeline.add_initial(
            Task(id='create_table',
                 description='Creates the attributes table',
                 commands=[ExecuteSQL(sql_statement=ddl, echo_queries=False)]))

        chunk_size = math.ceil(
            len(commands) /
            (2 * mara_pipelines.config.max_number_of_parallel_tasks()))
        for n, chunk in enumerate(more_itertools.chunked(commands,
                                                         chunk_size)):
            task = Task(id=str(n),
                        description='Process a portion of the attributes')
            task.add_commands(chunk)
            sub_pipeline.add(task)
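
add_parallel_tasks() is invoked by the ParallelTask machinery when the pipeline is expanded at run time, so client code only instantiates the class. A hedged sketch, assuming the constructor accepts the attribute names used in the method body (source_schema_name, source_table_name, db_alias); the task id and the pp_dim_next table are illustrative values borrowed from example #5, not taken from real project code:

from etl_tools.create_attributes_table import CreateAttributesTable

# build one attributes table per fact table, after the transform step
pipeline.add(
    CreateAttributesTable(id='create_project_activity_attributes_table',
                          source_schema_name='pp_dim_next',
                          source_table_name='project_activity'),
    upstreams=['transform_python_project_activity'])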