Example #1
def get_writer(
    engine: Optional[ClickHouseEngine] = None, choice: bool = True
) -> Union[ClickHouse, List[ClickHouse]]:
    """
    Get a ClickHouse writer for the given engine, creating and caching one
    writer per configured host on first use. With choice=True (the default)
    a randomly chosen writer is returned; with choice=False the cached list
    is returned as-is.
    """
    writers = _writers.get(engine)
    if not choice:
        # note: returns the cached list as-is (None if no writer was created yet)
        return writers
    if not writers:
        settings = Settings.get("clickhouse")
        hosts = settings.get("hosts")
        if Settings.is_cluster() and len(hosts) <= 1:
            raise ConfigurationError("more than one host is required in cluster mode")
        for host in hosts:
            args = [host, settings.get("user"), settings.get("password"), Settings.cluster_name()]
            # ClickHouseEngine is assumed to be a str-backed enum, so comparing
            # against the member (not .value) keeps these branches consistent
            if engine == ClickHouseEngine.merge_tree:
                w = ClickHouseMergeTree(*args)
            elif engine == ClickHouseEngine.collapsing_merge_tree:
                w = ClickHouseCollapsingMergeTree(*args)
            elif engine == ClickHouseEngine.versioned_collapsing_merge_tree:
                w = ClickHouseVersionedCollapsingMergeTree(*args)
            elif engine == ClickHouseEngine.replacing_merge_tree or engine is None:
                w = ClickHouseReplacingMergeTree(*args)
            else:
                w = ClickHouse(*args)
            _writers.setdefault(engine, []).append(w)
    return random.choice(_writers.get(engine))  # nosec:B311
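A minimal usage sketch (hypothetical; it assumes Settings has already been initialized and reuses the ClickHouseEngine enum referenced above):

# hypothetical usage; Settings must already be loaded
writer = get_writer(ClickHouseEngine.replacing_merge_tree)  # one randomly chosen writer
writers = get_writer(ClickHouseEngine.replacing_merge_tree, choice=False)  # full cached list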
Example #2
def init(config_file):
    """
    Initialize settings and logging, then optionally set up Sentry error
    reporting and the monitoring database.
    """
    Settings.init(config_file)
    init_logging()
    dsn = Settings.get("sentry", "dsn")
    if dsn:
        import sentry_sdk
        from sentry_sdk.integrations.redis import RedisIntegration

        sentry_sdk.init(
            dsn,
            environment=Settings.get("sentry", "environment"),
            integrations=[RedisIntegration()],
        )
    if Settings.monitoring():
        init_monitor_db(Settings.cluster_name())
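A hypothetical bootstrap call (the config filename is illustrative only):

# hypothetical bootstrap; the path is illustrative
init("synch.yaml")
# after this call Settings.get(...) is usable, logging is configured, and
# Sentry reporting is active only if the config provides a sentry.dsn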
Example #3
File: etl.py  Project: zhangweiwhim/synch
def etl_full(
    alias: str,
    schema: str,
    tables_pk: Dict,
    renew=False,
):
    """
    full etl
    """
    reader = get_reader(alias)
    source_db_database = Settings.get_source_db_database(alias, schema)
    schema = source_db_database.get("database")
    writer = get_writer()
    if not writer.check_database_exists(schema):
        if source_db_database.get("auto_create") is not False:
            writer.create_database(schema, Settings.cluster_name())
        else:
            logger.warning(
                f"Can't run ETL: database {schema} not found in ClickHouse and auto_create is false"
            )
            exit(-1)
    for table in source_db_database.get("tables"):
        if table.get("auto_full_etl") is False:
            continue
        table_name = table.get("table")
        pk = tables_pk.get(table_name)
        writer = get_writer(table.get("clickhouse_engine"))
        if not pk and not renew:
            logger.warning(f"No pk found in {schema}.{table_name}, skip")
            continue
        elif isinstance(pk, tuple):
            pk = f"({','.join(pk)})"
        if renew:
            drop_sql = f"drop table if exists {schema}.{table_name}"
            writer.execute(drop_sql)
            logger.info(f"drop table success:{schema}.{table_name}")
        if not writer.check_table_exists(schema, table_name):
            sign_column = table.get("sign_column")
            version_column = table.get("version_column")
            writer.execute(
                writer.get_table_create_sql(
                    reader,
                    schema,
                    table_name,
                    pk,
                    table.get("partition_by"),
                    table.get("engine_settings"),
                    sign_column=sign_column,
                    version_column=version_column,
                ))
            if Settings.is_cluster():
                for w in get_writer(choice=False):
                    w.execute(
                        w.get_distributed_table_create_sql(
                            schema, table_name,
                            Settings.get("clickhouse.distributed_suffix")))
            if reader.fix_column_type and not table.get("skip_decimal"):
                writer.fix_table_column_type(reader, schema, table_name)
            full_insert_sql = writer.get_full_insert_sql(
                reader, schema, table_name, sign_column)
            writer.execute(full_insert_sql)
            logger.info(f"full data etl for {schema}.{table_name} success")
        else:
            logger.debug(
                f"{schema}.{table_name} already exists, skipping; use --renew to drop it and force a fresh ETL"
            )
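A hypothetical invocation (alias, schema, and key names are illustrative; tables_pk maps each table name to its primary key, with a tuple for composite keys):

# hypothetical invocation; all values are illustrative
etl_full(
    alias="mysql_db",
    schema="test",
    tables_pk={"users": "id", "orders": ("id", "created_at")},
    renew=True,  # drop existing ClickHouse tables and reload from scratch
)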
Example #4
    def _binlog_reading(
        self,
        only_tables,
        only_schemas,
        log_file,
        log_pos,
        server_id,
        skip_dmls,
        skip_delete_tables,
        skip_update_tables,
    ) -> Generator:
        """
        Stream MySQL binlog events and yield them as normalized
        (schema, table, event, log_file, log_pos) tuples; table is None
        for converted ALTER queries.
        """
        stream = BinLogStreamReader(
            connection_settings=dict(
                host=self.host,
                port=self.port,
                user=self.user,
                passwd=self.password,
            ),
            resume_stream=True,
            blocking=True,
            server_id=server_id,
            only_tables=only_tables,
            only_schemas=only_schemas,
            only_events=self.only_events,
            log_file=log_file,
            log_pos=log_pos,
            fail_on_table_metadata_unavailable=True,
            slave_heartbeat=10,
        )
        for binlog_event in stream:
            if isinstance(binlog_event, QueryEvent):
                schema = binlog_event.schema.decode()
                query = binlog_event.query.lower()
                if "alter" not in query:
                    continue
                table, converted_sql = SqlConvert.to_clickhouse(
                    schema, query, Settings.cluster_name())
                if not converted_sql:
                    continue
                event = {
                    "table": table,
                    "schema": schema,
                    "action": "query",
                    "values": {
                        "query": convent_sql
                    },
                    "event_unixtime": int(time.time() * 10**6),
                    "action_seq": 0,
                }
                yield schema, None, event, stream.log_file, stream.log_pos
            else:
                schema = binlog_event.schema
                table = binlog_event.table
                skip_dml_table_name = f"{schema}.{table}"
                for row in binlog_event.rows:
                    if isinstance(binlog_event, WriteRowsEvent):
                        event = {
                            "table": table,
                            "schema": schema,
                            "action": "insert",
                            "values": row["values"],
                            "event_unixtime": int(time.time() * 10**6),
                            "action_seq": 2,
                        }

                    elif isinstance(binlog_event, UpdateRowsEvent):
                        if "update" in skip_dmls or skip_dml_table_name in skip_update_tables:
                            continue
                        delete_event = {
                            "table": table,
                            "schema": schema,
                            "action": "delete",
                            "values": row["before_values"],
                            "event_unixtime": int(time.time() * 10**6),
                            "action_seq": 1,
                        }
                        yield binlog_event.schema, binlog_event.table, delete_event, stream.log_file, stream.log_pos
                        event = {
                            "table": table,
                            "schema": schema,
                            "action": "insert",
                            "values": row["after_values"],
                            "event_unixtime": int(time.time() * 10**6),
                            "action_seq": 2,
                        }

                    elif isinstance(binlog_event, DeleteRowsEvent):
                        if "delete" in skip_dmls or skip_dml_table_name in skip_delete_tables:
                            continue
                        event = {
                            "table": table,
                            "schema": schema,
                            "action": "delete",
                            "values": row["values"],
                            "event_unixtime": int(time.time() * 10**6),
                            "action_seq": 1,
                        }
                    else:
                        # unexpected row event type: stop the generator
                        return
                    yield binlog_event.schema, binlog_event.table, event, stream.log_file, stream.log_pos
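A hypothetical consumer loop (reader stands in for the instance that defines _binlog_reading, and process/save_checkpoint are placeholder handlers; argument values are illustrative):

# hypothetical consumer; argument values are illustrative
for schema, table, event, log_file, log_pos in reader._binlog_reading(
    only_tables=["users"],
    only_schemas=["test"],
    log_file="mysql-bin.000001",
    log_pos=4,
    server_id=101,
    skip_dmls=[],
    skip_delete_tables=[],
    skip_update_tables=[],
):
    process(event)  # placeholder: e.g. enqueue the event for a ClickHouse writer
    save_checkpoint(log_file, log_pos)  # placeholder: persist the binlog position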