# Example #1
        file_dependencies=read_download_file_dependencies,
        date_regex="^(?P<year>\d{4})\/(?P<month>\d{2})\/(?P<day>\d{2})/",
        partition_target_table_by_day_id=True,
        timezone="UTC",
        commands_before=[
            ExecuteSQL(sql_file_name="create_download_data_table.sql",
                       file_dependencies=read_download_file_dependencies)
        ]))

# Assign unique ids to projects and versions; the per-chunk SQL function is
# run in parallel, bracketed by a setup and a finalization script.
pipeline.add(
    ParallelExecuteSQL(
        id="preprocess_project_version",
        description='Assigns unique ids to projects and versions',
        commands_before=[ExecuteSQL(sql_file_name="preprocess_project_version_1.sql")],
        sql_statement="SELECT pypi_tmp.preprocess_project_version_1(@chunk@::SMALLINT);",
        parameter_function=etl_tools.utils.chunk_parameter_function,
        parameter_placeholders=["@chunk@"],
        commands_after=[ExecuteSQL(sql_file_name="preprocess_project_version_2.sql")]),
    upstreams=['read_download'])

# Build the "project" and "project_version" dimension tables; both depend on
# the id-preprocessing step.
for dimension in ('project', 'project_version'):
    transform_task = Task(
        id=f"transform_{dimension}",
        description=f'Creates the "{dimension}" dimension',
        commands=[ExecuteSQL(sql_file_name=f"transform_{dimension}.sql")])
    pipeline.add(transform_task, upstreams=['preprocess_project_version'])
# Example #2
# Pull github activity for repos that match a pypi package by name.
pipeline.add(
    Task(id="extract_python_repo_activity",
         description='Extracts activity metrics for github repos that have a corresponding pypi package (by name)',
         commands=[ExecuteSQL(sql_file_name="extract_python_repo_activity.sql")]))

# Combine per-project download aggregates with github activity metrics,
# inserting chunk-by-chunk in parallel after the setup script has run.
pipeline.add(
    ParallelExecuteSQL(
        id="transform_python_project_activity",
        description="Aggregates downloads at project level and combines them with github activity metrics",
        commands_before=[ExecuteSQL(sql_file_name="transform_python_project_activity.sql")],
        sql_statement="SELECT pp_tmp.insert_python_project_activity(@chunk@::SMALLINT);",
        parameter_function=etl_tools.utils.chunk_parameter_function,
        parameter_placeholders=["@chunk@"]),
    upstreams=["extract_python_repo_activity"])

# Add foreign-key constraints between the dimension tables once the facts exist.
# echo_queries=False keeps the (noisy) constraint DDL out of the run log.
pipeline.add(
    Task(id="constrain_tables",
         description="Adds foreign key constrains between the dim tables",
         commands=[ExecuteSQL(sql_file_name="constrain_tables.sql", echo_queries=False)]),
    upstreams=["transform_python_project_activity"])
# Example #3
    ["preprocess_ad"])


def index_ad_parameters():
    """Return the '%_name' columns of fb_dim_next.ad as parameter rows.

    Each fetched row feeds one parallel `index_ad` invocation below.
    """
    columns_query = '''select util.get_columns('fb_dim_next', 'ad', '%_name');'''
    with mara_db.postgresql.postgres_cursor_context(default_db_alias()) as cursor:
        cursor.execute(columns_query)
        return cursor.fetchall()


# Index every name column of the ad dimension; one parallel branch per column
# returned by index_ad_parameters().
pipeline.add(
    ParallelExecuteSQL(
        id="index_ad",
        description="Adds indexes to all columns of the ad dimension",
        sql_statement='''SELECT util.add_index('fb_dim_next', 'ad', column_names:=ARRAY[''@@param_1@@'']);''',
        parameter_function=index_ad_parameters,
        parameter_placeholders=["'@@param_1@@'"]),
    ["transform_ad"])

# Build the fact table of the facebook cube from the raw performance data.
pipeline.add(
    Task(
        id="transform_ad_performance",
        description="Creates the fact table of the facebook cube",
        commands=[ExecuteSQL(sql_file_name="transform_ad_performance.sql")]),
    ["read_ad_performance"])


def index_ad_performance_parameters():
    with mara_db.postgresql.postgres_cursor_context(
            default_db_alias()) as cursor:
        cursor.execute(
# Example #4
    ))

# Build the "repo" dimension from the downloaded repo activity.
pipeline.add(
    Task(id="transform_repo",
         description='Creates the "repo" dimension',
         commands=[ExecuteSQL(sql_file_name="transform_repo.sql")]),
    upstreams=['read_repo_activity'])

# Map repo activities to their dimensions; inserts run per chunk in parallel
# after the setup script.
pipeline.add(
    ParallelExecuteSQL(
        id="transform_repo_activity",
        description="Maps repo activites to their dimensions",
        commands_before=[ExecuteSQL(sql_file_name="transform_repo_activity.sql")],
        sql_statement="SELECT gh_tmp.insert_repo_activity(@chunk@::SMALLINT);",
        parameter_function=etl_tools.utils.chunk_parameter_function,
        parameter_placeholders=["@chunk@"]),
    upstreams=["transform_repo"])

# Build a flat data-set table for Github repo activities, chunk-by-chunk.
# Fix: the original was a syntax error — the `commands_before=[...]` list was
# never closed before `commands_after=` started.
pipeline.add(
    ParallelExecuteSQL(
        id="create_repo_activity_data_set",
        description="Creates a flat data set table for Github repo activities",
        sql_statement="SELECT gh_tmp.insert_repo_activity_data_set(@chunk@::SMALLINT);",
        parameter_function=etl_tools.utils.chunk_parameter_function,
        parameter_placeholders=["@chunk@"],
        commands_before=[
            ExecuteSQL(sql_file_name="create_repo_activity_data_set.sql")
        ],
        commands_after=[
            # NOTE(review): upserting gads ad performance looks unrelated to a
            # Github repo-activity data set — verify this statement belongs here.
            ExecuteSQL(sql_statement='SELECT gads_data.upsert_ad_performance()')
        ]))

# Build the ad dimension once both the campaign structure and the raw
# performance data have been read.
pipeline.add(
    Task(id="transform_ad",
         description="Creates the ad dimension table",
         commands=[ExecuteSQL(sql_file_name="transform_ad.sql")]),
    upstreams=["read_campaign_structure", "read_ad_performance"])

# Index each name column of the ad dimension; the parameter function yields
# one single-column tuple per parallel branch.
pipeline.add(
    ParallelExecuteSQL(
        id="index_ad",
        description="Adds indexes to all name columns of the ad dimension",
        sql_statement="SELECT util.add_index('gads_dim_next', 'ad', column_names := ARRAY ['@column@']);",
        parameter_function=lambda: [
            ('ad_name',),
            ('ad_group_name',),
            ('campaign_name',),
            ('account_name',),
        ],
        parameter_placeholders=["@column@"]),
    upstreams=["transform_ad"])

# Build the fact table of the google ads performance cube.
pipeline.add(
    Task(id="transform_ad_performance",
         description="Creates the fact table of the google ads performance cube",
         commands=[ExecuteSQL(sql_file_name="transform-ad-performance.sql")]),
    upstreams=["read_ad_performance", "read_campaign_structure"])

pipeline.add(
    ParallelExecuteSQL(
        id="index_ad_performance",