# Exemplo n.º 1 — 0  (scraped example marker; commented out so the file parses)
def create_table_lineage_from_metadata(job_metadata_id,
                                       query_language=None,
                                       session=None):
    """Derive and persist table lineage from a job's stored query text.

    Parses the query saved on the DataJobMetadata row, resolves every
    source/target table pair against the metastore, and records a lineage
    edge for each pair where both endpoints resolve.

    Args:
        job_metadata_id: Primary key of the DataJobMetadata row to process.
        query_language: Optional language hint forwarded to process_query.
        session: SQLAlchemy session used for all lookups and writes.

    Returns:
        List of the created lineage record ids, or None when the
        DataJobMetadata row does not exist.
    """
    job_metadata = session.query(DataJobMetadata).get(job_metadata_id)
    if job_metadata is None:
        return

    _, lineage_per_statement = process_query(job_metadata.query_text,
                                             query_language)

    lineage_ids = []
    for statement_lineage in lineage_per_statement:
        # NOTE: the original `if len(statement_lineage):` guard was removed —
        # iterating an empty list is already a no-op.
        for lineage in statement_lineage:
            # Guard clause instead of nesting: entries without a "source"
            # key carry no edge to record.
            if "source" not in lineage:
                continue

            # Names are "schema.table"; index [0]/[1] tolerates extra dots
            # the same way the original code did.
            source_parts = lineage["source"].split(".")
            parent_table = metastore.get_table_by_name(
                source_parts[0],
                source_parts[1],
                job_metadata.metastore_id,
                session=session,
            )

            target_parts = lineage["target"].split(".")
            child_table = metastore.get_table_by_name(
                target_parts[0],
                target_parts[1],
                job_metadata.metastore_id,
                session=session,
            )

            # Skip edges where either endpoint is unknown to the metastore.
            if parent_table and child_table:
                lineage_ids.append(
                    add_table_lineage(
                        child_table.id,
                        parent_table.id,
                        job_metadata_id,
                        session=session,
                    ).id)

    return lineage_ids
def sync_table_to_metastore(table_per_statement,
                            statement_types,
                            metastore_id,
                            session=None):
    """Reconcile the metastore with the tables touched by a query.

    Walks the statements in order, accumulating which tables must be
    (re)synced and which must be deleted, then applies both sets through
    the metastore loader. A later statement can override an earlier one
    (e.g. CREATE after DROP cancels the pending delete, and vice versa).
    """
    loader = get_metastore_loader(metastore_id, session=session)
    assert loader is not None

    pending_sync = set()
    pending_delete = set()
    for statement_tables, stmt_type in zip(table_per_statement, statement_types):
        if stmt_type is None:
            continue

        if stmt_type == "DROP":
            for full_name in statement_tables:
                pending_sync.discard(full_name)
                pending_delete.add(full_name)
            continue

        # Any other DML/DDL statement
        for full_name in statement_tables:
            pending_delete.discard(full_name)

            if full_name in pending_sync:
                # Already queued — skip the (possibly expensive) checks.
                continue

            if stmt_type in ("CREATE", "ALTER"):
                # Created or altered tables always need a metastore refresh.
                pending_sync.add(full_name)
            else:
                # For things like insert/select we only sync tables that
                # are not yet known to the metastore.
                schema_name, table_name = full_name.split(".")
                existing = m_logic.get_table_by_name(
                    schema_name,
                    table_name,
                    metastore_id=metastore_id,
                    session=session,
                )
                if not existing:
                    pending_sync.add(full_name)

    for full_name in pending_delete:
        schema_name, table_name = full_name.split(".")
        loader.sync_delete_table(schema_name, table_name, session=session)

    for full_name in pending_sync:
        schema_name, table_name = full_name.split(".")
        loader.sync_create_or_update_table(schema_name,
                                           table_name,
                                           session=session)
# Exemplo n.º 3 — 0  (scraped example marker; commented out so the file parses)
def get_table_by_name(
    schema_name,
    table_name,
    metastore_id,
    with_schema=True,
    with_column=True,
    with_warnings=True,
):
    """Look up a table by schema/table name and serialize it for the API.

    Raises an API error when the table does not exist, and verifies the
    caller may read the table's schema before serializing it to a dict.
    """
    with DBSession() as session:
        data_table = logic.get_table_by_name(
            schema_name, table_name, metastore_id, session=session
        )
        api_assert(
            data_table,
            "{}.{} does not exist".format(schema_name, table_name),
        )
        # Permission check must happen before any data leaves the session.
        verify_data_schema_permission(data_table.schema_id, session=session)
        serialized = data_table.to_dict(with_schema, with_column, with_warnings)

    return serialized
def log_table_per_statement(
    table_per_statement,
    statement_types,
    query_execution_id,
    metastore_id,
    cell_id,
    session=None,
):
    """Record which tables a query execution referenced.

    Only tables from SELECT and INSERT statements are considered. For each
    referenced table that exists in the metastore, stale execution logs for
    the cell are deleted and a fresh table/query-execution link is created.
    """
    metastore_loader = get_metastore_loader(metastore_id, session=session)
    assert metastore_loader is not None

    # Collect the distinct tables touched by SELECT/INSERT statements.
    referenced = {
        table
        for tables, stmt_type in zip(table_per_statement, statement_types)
        if stmt_type in ("SELECT", "INSERT")
        for table in tables
    }

    for full_name in referenced:
        schema_name, table_name = full_name.split(".")
        table_row = m_logic.get_table_by_name(
            schema_name,
            table_name,
            metastore_id=metastore_id,
            session=session,
        )

        if not table_row:
            # Table unknown to the metastore — nothing to log against.
            continue

        m_logic.delete_old_able_query_execution_log(
            cell_id=cell_id,
            query_execution_id=query_execution_id,
            commit=False,
            session=session,
        )
        m_logic.create_table_query_execution_log(
            table_id=table_row.id,
            cell_id=cell_id,
            query_execution_id=query_execution_id,
            session=session,
        )
# Exemplo n.º 5 — 0  (scraped example marker; commented out so the file parses)
def exec_demo_set_up():
    """Bootstrap a self-contained demo environment.

    Creates a demo environment backed by a local SQLite database: a
    metastore, a query engine, a scheduled metastore-update task (run
    synchronously here), demo stats/lineage for a "golden" table, and a
    demo data doc. Most objects are created with commit=False so the
    whole setup is committed atomically at the end — only if the data
    doc was created.

    Returns:
        A dict with the environment name and data doc id on success,
        otherwise None (implicit).
    """
    with DBSession() as session:
        environment = environment_logic.create_environment(
            name="demo_environment",
            description="Demo environment",
            image="",
            public=True,
            commit=False,
            session=session,
        )

        # Both the metastore and the engine point at the same local
        # SQLite file shipped with the demo.
        local_db_conn = "sqlite:///demo/demo_data.db"
        metastore_id = QueryMetastore.create(
            {
                "name": "demo_metastore",
                "metastore_params": {"connection_string": local_db_conn,},
                "loader": "SqlAlchemyMetastoreLoader",
                "acl_control": {},
            },
            commit=False,
            session=session,
        ).id

        engine_id = QueryEngine.create(
            {
                "name": "sqlite",
                "description": "SQLite Engine",
                "language": "sqlite",
                "executor": "sqlalchemy",
                "executor_params": {"connection_string": local_db_conn,},
                "environment_id": environment.id,
                "metastore_id": metastore_id,
            },
            commit=False,
            session=session,
        ).id

        logic.add_query_engine_to_environment(
            environment.id, engine_id, commit=False, session=session
        )

        # Nightly schedule that keeps the demo metastore in sync.
        task_schedule_id = TaskSchedule.create(
            {
                "name": "update_metastore_{}".format(metastore_id),
                "task": "tasks.update_metastore.update_metastore",
                "cron": "0 0 * * *",
                "args": [metastore_id],
                "task_type": "prod",
                "enabled": True,
            },
            # commit=False,
            # NOTE(review): unlike the creates above, this one commits
            # immediately (commit=False is commented out) — presumably so
            # the scheduled task below can see the row; confirm intent.
            session=session,
        ).id
        # Run the metastore sync synchronously so the table lookups
        # below can find the demo tables.
        schedule_logic.run_and_log_scheduled_task(
            scheduled_task_id=task_schedule_id, wait_to_finish=True, session=session
        )

        # Mark the happiness table as "golden" and attach demo metadata.
        golden_table = metastore_logic.get_table_by_name(
            schema_name="main",
            name="world_happiness_2019",
            metastore_id=metastore_id,
            session=session,
        )
        if golden_table:
            metastore_logic.update_table(
                id=golden_table.id, golden=True, session=session
            )
            metastore_logic.update_table_information(
                data_table_id=golden_table.id,
                description="The World Happiness Report is a landmark survey of the state of global happiness. The first report was published in 2012, the second in 2013, the third in 2015, and the fourth in the 2016 Update. The World Happiness 2017, which ranks 155 countries by their happiness levels, was released at the United Nations at an event celebrating International Day of Happiness on March 20th. The report continues to gain global recognition as governments, organizations and civil society increasingly use happiness indicators to inform their policy-making decisions. Leading experts across fields – economics, psychology, survey analysis, national statistics, health, public policy and more – describe how measurements of well-being can be used effectively to assess the progress of nations. The reports review the state of happiness in the world today and show how the new science of happiness explains personal and national variations in happiness.",
                session=session,
            )
            demo_logic.create_demo_table_stats(
                table_id=golden_table.id, uid=current_user.id, session=session
            )
            score_column = metastore_logic.get_column_by_name(
                name="Score", table_id=golden_table.id, session=session
            )
            demo_logic.create_demo_table_column_stats(
                column_id=score_column.id, uid=current_user.id, session=session
            )

        # Second run of the sync task; this time without waiting.
        schedule_logic.run_and_log_scheduled_task(
            scheduled_task_id=task_schedule_id, session=session
        )

        demo_logic.create_demo_lineage(metastore_id, current_user.id, session=session)

        data_doc_id = demo_logic.create_demo_data_doc(
            environment_id=environment.id,
            engine_id=engine_id,
            uid=current_user.id,
            session=session,
        )

        # Commit everything created with commit=False only if the data
        # doc was created; otherwise the uncommitted work is discarded
        # when the session closes.
        if data_doc_id:
            session.commit()

            return {
                "environment": environment.name,
                "data_doc_id": data_doc_id,
            }