def etl_full(
    alias: str, schema: str, tables_pk: Dict, renew=False,
):
    """
    full etl
    """
    reader = get_reader(alias)
    source_db_database = Settings.get_source_db_database(alias, schema)
    schema = source_db_database.get("database")
    writer = get_writer()
    if not writer.check_database_exists(schema):
        if source_db_database.get("auto_create") is not False:
            writer.create_database(schema)
        else:
            logger.warning(
                f"Can't etl since no database {schema} found in ClickHouse and auto_create=false"
            )
            exit(-1)
    for table in source_db_database.get("tables"):
        if table.get("auto_full_etl") is False:
            continue
        table_name = table.get("table")
        pk = tables_pk.get(table_name)
        writer = get_writer(table.get("clickhouse_engine"))
        if not pk:
            logger.warning(f"No pk found in {schema}.{table_name}, skip")
            continue
        elif isinstance(pk, tuple):
            pk = f"({','.join(pk)})"
        if renew:
            drop_sql = f"drop table if exists {schema}.{table_name}"
            writer.execute(drop_sql)
            logger.info(f"drop table success: {schema}.{table_name}")
        if not writer.check_table_exists(schema, table_name):
            sign_column = table.get("sign_column")
            version_column = table.get("version_column")
            writer.execute(
                writer.get_table_create_sql(
                    reader,
                    schema,
                    table_name,
                    pk,
                    table.get("partition_by"),
                    table.get("engine_settings"),
                    sign_column=sign_column,
                    version_column=version_column,
                )
            )
            if reader.fix_column_type and not table.get("skip_decimal"):
                writer.fix_table_column_type(reader, schema, table_name)
            full_insert_sql = writer.get_full_insert_sql(reader, schema, table_name, sign_column)
            writer.execute(full_insert_sql)
            logger.info(f"full data etl for {schema}.{table_name} success")
        else:
            logger.debug(
                f"{schema}.{table_name} exists, skip, or use --renew to force etl by dropping the old table"
            )
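# A minimal usage sketch (an assumption for illustration, mirroring the tests
# further below): run a one-off full ETL for a configured source. The alias
# "mysql_db" is a hypothetical entry from the settings file; tables_pk maps
# each source table to its primary key column (a tuple for composite keys),
# and renew=True drops existing ClickHouse tables before re-creating them.
def example_etl_full():
    tables_pk = {"test": "id"}  # hypothetical table/pk mapping
    etl_full("mysql_db", "test", tables_pk, renew=True)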
def insert_log(
    alias: str, schema: str, table: str, num: int, type_: int,
):
    if not Settings.monitoring():
        return
    from synch.factory import get_writer

    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    sql = f"""INSERT INTO synch.log (alias, schema, table, num, type, created_at) VALUES ('{alias}', '{schema}', '{table}', {num}, {type_}, '{now}')"""
    get_writer().execute(sql)
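# A hedged usage sketch: record that 100 events were written for the
# hypothetical alias "mysql_db". type_=2 follows the convention used by
# continuous_etl below for consumed-event batches; the call is a no-op when
# monitoring is disabled in settings.
def example_insert_log():
    insert_log("mysql_db", "test", "test", 100, 2)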
def test_full_etl_postgres():
    database = get_postgres_database()
    sql = "insert into test(id,amount) values(1,1)"
    get_reader(alias_postgres).execute(sql)
    etl_full(alias_postgres, database, {"test": "id"}, True)
    sql = f"select * from {database}.test"
    ret = get_writer().execute(sql)
    assert ret == [(1, Decimal("1"))]
def test_full_etl_mysql():
    database = get_mysql_database()
    sql = f"insert into {database}.test(amount) values(1.00)"
    get_reader(alias_mysql).execute(sql)
    etl_full(alias_mysql, database, {"test": "id"}, True)
    sql = f"select * from {database}.test"
    ret = get_writer().execute(sql)
    assert ret == [(1, Decimal("1"))]
def check(ctx: Context, schema: str):
    alias = ctx.obj["alias"]
    reader = get_reader(alias)
    writer = get_writer()
    tables = Settings.get_source_db_database_tables_name(alias, schema)
    for table in tables:
        source_table_count = reader.get_count(schema, table)
        target_table_count = writer.get_count(schema, table)
        if source_table_count == target_table_count:
            logger.info(f"{schema}.{table} is equal, count={source_table_count}")
        else:
            logger.warning(
                f"{schema}.{table} is not equal, source_table_count={source_table_count}, "
                f"target_table_count={target_table_count}"
            )
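# A hedged sketch of invoking check() outside the CLI wiring: check() only
# reads ctx.obj["alias"], so a bare Click context with obj set is enough.
# The command object and the alias "mysql_db" are assumptions for illustration.
def example_check():
    from click import Command, Context

    ctx = Context(Command("check"))
    ctx.obj = {"alias": "mysql_db"}
    check(ctx, "test")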
def test_delete_events(mocker):
    writer = get_writer()
    mocker.patch.object(writer, "execute", return_value=True, autospec=True)
    database = get_mysql_database()

    sql, params = writer.delete_events(database, "test", "id", ["1", "2"])
    assert sql == "alter table synch_mysql_test.test delete where id in %(pks)s"
    assert params == {"pks": ("1", "2")}

    sql, params = writer.delete_events(database, "test", "id", ["2"])
    assert sql == "alter table synch_mysql_test.test delete where id in %(pks)s"
    assert params == {"pks": ("2",)}

    sql, params = writer.delete_events(database, "test", "id", [1, 2])
    assert sql == "alter table synch_mysql_test.test delete where id in %(pks)s"
    assert params == {"pks": (1, 2)}

    sql, params = writer.delete_events(database, "test", "id", [2])
    assert sql == "alter table synch_mysql_test.test delete where id in %(pks)s"
    assert params == {"pks": (2,)}

    sql, params = writer.delete_events(database, "test", ("id", "id2"), [(1, 2), (2, 3)])
    assert (
        sql
        == "alter table synch_mysql_test.test delete where (id=1 and id2=2) or (id=2 and id2=3)"
    )
    assert params is None

    sql, params = writer.delete_events(database, "test", ("id", "id2"), [("1", "2"), ("2", "3")])
    assert (
        sql
        == "alter table synch_mysql_test.test delete where (id='1' and id2='2') or (id='2' and id2='3')"
    )
    assert params is None
def continuous_etl(
    alias: str, schema: str, tables_pk: Dict, tables_dict: Dict, last_msg_id, skip_error: bool,
):
    """
    continuous etl from broker and insert into clickhouse
    """
    global len_event
    global event_list
    global is_insert
    global last_insert_time
    broker = get_broker(alias)
    insert_interval = Settings.insert_interval()
    insert_num = Settings.insert_num()
    logger.info(f"start consumer for {alias}.{schema} successfully")
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)
    for msg_id, msg in broker.msgs(
        schema, last_msg_id=last_msg_id, count=insert_num, block=insert_interval * 1000
    ):
        if not msg_id and not msg:
            # broker read timed out: flush whatever is buffered
            logger.info(
                f"Block {insert_interval} seconds timeout, insert current {len_event} events"
            )
            if len_event > 0:
                is_insert = True
                alter_table = False
                query = None
            else:
                if is_stop:
                    finish_continuous_etl(broker)
                continue
        else:
            alter_table = False
            query = None
            logger.debug(f"msg_id:{msg_id}, msg:{msg}")
            len_event += 1
            event = msg
            table = event["table"]
            schema = event["schema"]
            action = event["action"]
            values = event["values"]
            if action == "query":
                # DDL events are executed directly rather than buffered
                alter_table = True
                query = values["query"]
            else:
                engine = tables_dict.get(table).get("clickhouse_engine")
                writer = get_writer(engine)
                event_list = writer.handle_event(
                    tables_dict, tables_pk.get(table), schema, table, action, event_list, event,
                )
            if len_event == insert_num or time.time() - last_insert_time >= insert_interval:
                is_insert = True
        if is_insert or alter_table:
            for table, v in event_list.items():
                table_event_num = 0
                pk = tables_pk.get(table)
                if isinstance(v, dict):
                    # MergeTree tables buffer inserts and deletes separately
                    writer = get_writer(ClickHouseEngine.merge_tree)
                    insert = v.get("insert")
                    delete = v.get("delete")
                    if delete:
                        delete_pks = list(delete.keys())
                    else:
                        delete_pks = []
                    if insert:
                        insert_events = list(
                            sorted(insert.values(), key=lambda x: x.get("event_unixtime"))
                        )
                    else:
                        insert_events = []
                    if skip_error:
                        try:
                            if delete_pks:
                                writer.delete_events(schema, table, pk, delete_pks)
                            if insert_events:
                                writer.insert_events(schema, table, insert_events)
                        except Exception as e:
                            logger.error(
                                f"insert event error: {e}", exc_info=True, stack_info=True
                            )
                    else:
                        if delete_pks:
                            writer.delete_events(schema, table, pk, delete_pks)
                        if insert_events:
                            writer.insert_events(schema, table, insert_events)
                    table_event_num += len(delete_pks)
                    table_event_num += len(insert_events)
                elif isinstance(v, list):
                    # CollapsingMergeTree tables take the event list as-is
                    table_event_num += len(v)
                    writer = get_writer(ClickHouseEngine.collapsing_merge_tree)
                    if v:
                        if skip_error:
                            try:
                                writer.insert_events(schema, table, v)
                            except Exception as e:
                                logger.error(
                                    f"insert event error: {e}", exc_info=True, stack_info=True
                                )
                        else:
                            writer.insert_events(schema, table, v)
                insert_log(alias, schema, table, table_event_num, 2)
            if alter_table:
                try:
                    get_writer().execute(query)
                except Exception as e:
                    logger.error(f"alter table error: {e}", exc_info=True, stack_info=True)
                    if not skip_error:
                        exit(-1)
            broker.commit(schema)
            logger.info(f"successfully committed {len_event} events")
            event_list = {}
            is_insert = False
            len_event = 0
            last_insert_time = time.time()
        if is_stop:
            finish_continuous_etl(broker)
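# A hedged sketch of starting the consumer loop. tables_dict reuses the
# per-table settings consumed by etl_full above; reader.get_primary_key is an
# assumed helper on the reader interface for building tables_pk.
# last_msg_id=None resumes from the broker's committed offset, and
# skip_error=True logs insert failures instead of aborting.
def example_continuous_etl(alias: str = "mysql_db", schema: str = "test"):
    reader = get_reader(alias)
    source_db_database = Settings.get_source_db_database(alias, schema)
    tables_dict = {}
    tables_pk = {}
    for table in source_db_database.get("tables"):
        table_name = table.get("table")
        tables_dict[table_name] = table
        tables_pk[table_name] = reader.get_primary_key(schema, table_name)  # assumed helper
    continuous_etl(alias, schema, tables_pk, tables_dict, None, True)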
def finalizer():
    reader.execute(sql)
    get_writer().execute(f"truncate table if exists {database}.test")
def finalizer():
    reader.execute(sql)
    get_writer().execute(f"truncate table if exists {postgres}.test")
def test_table_exists():
    writer = get_writer()
    database = get_mysql_database()
    assert writer.check_table_exists(database, "test") is True
    assert writer.check_table_exists(database, "aaa") is False
def test_database_exists():
    writer = get_writer()
    database = get_mysql_database()
    assert writer.check_database_exists(database) is True
    assert writer.check_database_exists("aaa") is False