Example #1
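A hedged note on shared imports: these snippets come from the synch replication codebase, so names like get_reader, get_writer, Settings, and logger resolve elsewhere in the project. A minimal sketch of what Example #1 alone needs; only the synch.factory path for get_writer is confirmed (by Example #2), the rest are assumptions:

import logging
from typing import Dict

from synch.factory import get_reader, get_writer  # get_writer path confirmed in Example #2
from synch.settings import Settings  # assumed module path

logger = logging.getLogger(__name__)  # assumption: the project wires up its own logger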
def etl_full(
    alias: str,
    schema: str,
    tables_pk: Dict,
    renew=False,
):
    """
    full etl
    """
    reader = get_reader(alias)
    source_db_database = Settings.get_source_db_database(alias, schema)
    schema = source_db_database.get("database")
    writer = get_writer()
    if not writer.check_database_exists(schema):
        if source_db_database.get("auto_create") is not False:
            writer.create_database(schema)
        else:
            logger.warning(
                f"Can't run ETL: database {schema} not found in ClickHouse and auto_create=false"
            )
            exit(-1)
    for table in source_db_database.get("tables"):
        if table.get("auto_full_etl") is False:
            continue
        table_name = table.get("table")
        pk = tables_pk.get(table_name)
        writer = get_writer(table.get("clickhouse_engine"))
        if not pk:
            logger.warning(f"No pk found in {schema}.{table_name}, skip")
            continue
        elif isinstance(pk, tuple):
            pk = f"({','.join(pk)})"
        if renew:
            drop_sql = f"drop table if exists {schema}.{table_name}"
            writer.execute(drop_sql)
            logger.info(f"drop table success:{schema}.{table_name}")
        if not writer.check_table_exists(schema, table_name):
            sign_column = table.get("sign_column")
            version_column = table.get("version_column")
            writer.execute(
                writer.get_table_create_sql(
                    reader,
                    schema,
                    table_name,
                    pk,
                    table.get("partition_by"),
                    table.get("engine_settings"),
                    sign_column=sign_column,
                    version_column=version_column,
                ))
            if reader.fix_column_type and not table.get("skip_decimal"):
                writer.fix_table_column_type(reader, schema, table_name)
            full_insert_sql = writer.get_full_insert_sql(
                reader, schema, table_name, sign_column)
            writer.execute(full_insert_sql)
            logger.info(f"full data etl for {schema}.{table_name} success")
        else:
            logger.debug(
                f"{schema}.{table_name} already exists, skipping; use --renew to drop the old table and force a full ETL"
            )
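etl_full accepts either a single-column or a composite primary key per table: a tuple is joined into a "(col1,col2)" string, presumably used as the MergeTree sorting key. A minimal invocation sketch with illustrative alias and table names:

etl_full(
    "mysql_db",                           # alias from the synch config (illustrative)
    "test",                               # source schema
    {"orders": ("id", "created_at")},     # tuple pk is rendered as "(id,created_at)"
    renew=True,                           # drop and recreate existing target tables
)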
Example #2
def insert_log(
    alias: str,
    schema: str,
    table: str,
    num: int,
    type_: int,
):
    if not Settings.monitoring():
        return
    from synch.factory import get_writer

    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    sql = f"""INSERT INTO synch.log (alias, schema, table, num, type, created_at) VALUES ('{alias}', '{schema}', '{table}', {num}, {type_}, '{now}')"""
    get_writer().execute(sql)
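insert_log interpolates values straight into the SQL string, which is fine for trusted internal values but injection-prone in general. A hedged alternative, assuming direct access to a clickhouse_driver.Client, whose execute accepts an "INSERT ... VALUES" statement plus a list of row tuples and escapes them itself:

# client is an assumed clickhouse_driver.Client instance
client.execute(
    "INSERT INTO synch.log (alias, schema, table, num, type, created_at) VALUES",
    [(alias, schema, table, num, type_, now)],  # the driver handles quoting
)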
Example #3
def test_full_etl_postgres():
    database = get_postgres_database()

    sql = "insert into test(id,amount) values(1,1)"
    get_reader(alias_postgres).execute(sql)

    etl_full(alias_postgres, database, {"test": "id"}, True)

    sql = f"select * from {database}.test"
    ret = get_writer().execute(sql)
    assert ret == [(1, Decimal("1"))]
Example #4
def test_full_etl_mysql():
    database = get_mysql_database()

    sql = f"insert into {database}.test(amount) values(1.00)"
    get_reader(alias_mysql).execute(sql)

    etl_full(alias_mysql, database, {"test": "id"}, True)

    sql = f"select * from {database}.test"

    ret = get_writer().execute(sql)
    assert ret == [(1, Decimal("1"))]
Example #5
def check(ctx: Context, schema: str):
    alias = ctx.obj["alias"]
    reader = get_reader(alias)
    writer = get_writer()
    tables = Settings.get_source_db_database_tables_name(alias, schema)
    for table in tables:
        source_table_count = reader.get_count(schema, table)
        target_table_count = writer.get_count(schema, table)
        if source_table_count == target_table_count:
            logger.info(f"{schema}.{table} row counts match, count={source_table_count}")
        else:
            logger.warning(
                f"{schema}.{table} row counts differ, source_table_count={source_table_count}, target_table_count={target_table_count}"
            )
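check assumes both the reader and the writer expose get_count with the same signature. A plausible sketch of such a method, assuming execute returns rows as sequences and a plain count(*) is issued (valid on MySQL, PostgreSQL, and ClickHouse alike):

def get_count(self, schema: str, table: str) -> int:
    # first row, first column of the count(*) result
    return self.execute(f"select count(*) from {schema}.{table}")[0][0]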
Example #6
def test_delete_events(mocker):
    writer = get_writer()
    mocker.patch.object(writer, "execute", return_value=True, autospec=True)

    database = get_mysql_database()
    sql, params = writer.delete_events(database, "test", "id", ["1", "2"])
    assert sql == "alter table synch_mysql_test.test delete where id in %(pks)s"
    assert params == {"pks": ("1", "2")}

    sql, params = writer.delete_events(database, "test", "id", ["2"])
    assert sql == "alter table synch_mysql_test.test delete where id in %(pks)s"
    assert params == {"pks": ("2",)}

    sql, params = writer.delete_events(database, "test", "id", [1, 2])
    assert sql == "alter table synch_mysql_test.test delete where id in %(pks)s"
    assert params == {"pks": (1, 2)}

    sql, params = writer.delete_events(database, "test", "id", [2])
    assert sql == "alter table synch_mysql_test.test delete where id in %(pks)s"
    assert params == {"pks": (2,)}

    sql, params = writer.delete_events(database, "test", ("id", "id2"),
                                       [(1, 2), (2, 3)])
    assert (
        sql ==
        "alter table synch_mysql_test.test delete where (id=1 and id2=2) or (id=2 and id2=3)"
    )
    assert params is None

    sql, params = writer.delete_events(database, "test", ("id", "id2"),
                                       [("1", "2"), ("2", "3")])
    assert (
        sql ==
        "alter table synch_mysql_test.test delete where (id='1' and id2='2') or (id='2' and id2='3')"
    )
    assert params is None
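Two SQL shapes fall out of these assertions: a single-column key yields a %(pks)s placeholder plus a params dict for driver-side binding, while a composite key inlines the literals and returns params=None. Consuming the pair is assumed to look like this (hedging on writer.execute forwarding params to the driver):

sql, params = writer.delete_events(database, "test", "id", [1, 2])
writer.execute(sql, params)  # params is None for the composite-key form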
Example #7
def continuous_etl(
    alias: str,
    schema: str,
    tables_pk: Dict,
    tables_dict: Dict,
    last_msg_id,
    skip_error: bool,
):
    """
    continuous etl from broker and insert into clickhouse
    """
    global len_event
    global event_list
    global is_insert
    global last_insert_time
    broker = get_broker(alias)

    insert_interval = Settings.insert_interval()
    insert_num = Settings.insert_num()
    logger.info(f"start consumer for {alias}.{schema} success")

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

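    # each read blocks for up to insert_interval seconds; on timeout the broker
    # yields an empty (msg_id, msg) pair, which flushes the buffered events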
    for msg_id, msg in broker.msgs(schema,
                                   last_msg_id=last_msg_id,
                                   count=insert_num,
                                   block=insert_interval * 1000):
        if not msg_id and not msg:
            logger.info(
                f"block timed out after {insert_interval} seconds, inserting the {len_event} buffered events"
            )
            if len_event > 0:
                is_insert = True
                alter_table = False
                query = None
            else:
                if is_stop:
                    finish_continuous_etl(broker)
                continue
        else:
            alter_table = False
            query = None
            logger.debug(f"msg_id:{msg_id}, msg:{msg}")
            len_event += 1
            event = msg
            table = event["table"]
            schema = event["schema"]
            action = event["action"]
            values = event["values"]

            if action == "query":
                alter_table = True
                query = values["query"]
            else:
                engine = tables_dict.get(table).get("clickhouse_engine")
                writer = get_writer(engine)
                event_list = writer.handle_event(
                    tables_dict,
                    tables_pk.get(table),
                    schema,
                    table,
                    action,
                    event_list,
                    event,
                )

            if (len_event == insert_num
                    or time.time() - last_insert_time >= insert_interval):
                is_insert = True

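        # flush phase: write the buffered events per table, apply any pending DDL,
        # then commit the broker offset and reset the buffers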
        if is_insert or alter_table:
            for table, v in event_list.items():
                table_event_num = 0
                pk = tables_pk.get(table)
                if isinstance(v, dict):
                    writer = get_writer(ClickHouseEngine.merge_tree)
                    insert = v.get("insert")
                    delete = v.get("delete")
                    if delete:
                        delete_pks = list(delete.keys())
                    else:
                        delete_pks = []
                    if insert:
                        insert_events = list(
                            sorted(insert.values(),
                                   key=lambda x: x.get("event_unixtime")))
                    else:
                        insert_events = []
                    if skip_error:
                        try:
                            if delete_pks:
                                writer.delete_events(schema, table, pk,
                                                     delete_pks)
                            if insert_events:
                                writer.insert_events(schema, table,
                                                     insert_events)

                        except Exception as e:
                            logger.error(f"insert events failed: {e}",
                                         exc_info=True,
                                         stack_info=True)
                    else:
                        if delete_pks:
                            writer.delete_events(schema, table, pk, delete_pks)
                        if insert_events:
                            writer.insert_events(schema, table, insert_events)

                    table_event_num += len(delete_pks)
                    table_event_num += len(insert_events)

                elif isinstance(v, list):
                    table_event_num += len(v)
                    writer = get_writer(ClickHouseEngine.collapsing_merge_tree)
                    if v:
                        if skip_error:
                            try:
                                writer.insert_events(schema, table, v)
                            except Exception as e:
                                logger.error(f"insert events failed: {e}",
                                             exc_info=True,
                                             stack_info=True)
                        else:
                            writer.insert_events(schema, table, v)

                insert_log(alias, schema, table, table_event_num, 2)

            if alter_table:
                try:
                    get_writer().execute(query)
                except Exception as e:
                    logger.error(f"alter table error: {e}",
                                 exc_info=True,
                                 stack_info=True)
                    if not skip_error:
                        exit(-1)
            broker.commit(schema)
            logger.info(f"success commit {len_event} events")
            event_list = {}
            is_insert = False
            len_event = 0
            last_insert_time = time.time()
            if is_stop:
                finish_continuous_etl(broker)
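The loop above reads table, schema, action, and values from every message and orders buffered inserts by event_unixtime, so the broker is assumed to deliver events shaped roughly like this (field values are illustrative):

event = {
    "table": "test",
    "schema": "synch_mysql_test",
    "action": "insert",             # or "update" / "delete" / "query"
    "values": {"id": 1, "amount": "1.00"},
    "event_unixtime": 1600000000,   # used to order buffered inserts
}
# for action == "query", values carries the DDL instead: {"query": "alter table ..."}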
Example #8
def finalizer():
    # teardown helper, presumably registered via request.addfinalizer in the original fixture
    reader.execute(sql)
    get_writer().execute(f"truncate table if exists {database}.test")
Example #9
def finalizer():
    # same teardown pattern as Example #8, against the PostgreSQL-sourced database
    reader.execute(sql)
    get_writer().execute(f"truncate table if exists {postgres}.test")
Example #10
def test_table_exists():
    writer = get_writer()
    database = get_mysql_database()
    assert writer.check_table_exists(database, "test") is True
    assert writer.check_table_exists(database, "aaa") is False
Example #11
def test_database_exists():
    writer = get_writer()
    database = get_mysql_database()
    assert writer.check_database_exists(database) is True
    assert writer.check_database_exists("aaa") is False