Example #1
    def load(self, writer: BufferedWriterWrapper) -> None:
        logger = logging.getLogger('snuba.bulk-loader')

        clickhouse_ro = ClickhousePool(client_settings={
            'readonly': True,
        })
        clickhouse_tables = clickhouse_ro.execute('show tables')
        if (self.__dest_table, ) not in clickhouse_tables:
            raise ValueError("Destination table %s does not exists" %
                             self.__dest_table)

        table_content = clickhouse_ro.execute("select count(*) from %s" %
                                              self.__dest_table)
        if table_content != [(0, )]:
            raise ValueError("Destination Table is not empty")

        descriptor = self.__source.get_descriptor()
        logger.info("Loading snapshot %s", descriptor.id)

        with self.__source.get_table_file(self.__source_table) as table:
            logger.info("Loading table %s from file", self.__source_table)
            row_count = 0
            with writer as buffer_writer:
                for row in table:
                    clickhouse_data = self.__row_processor(row)
                    buffer_writer.write(clickhouse_data)
                    row_count += 1
            logger.info("Load complete %d records loaded", row_count)
Example #2
def drop_partitions(
    clickhouse: ClickhousePool,
    database: str,
    table: str,
    parts: Sequence[Tuple[datetime, int]],
    dry_run: bool = True,
) -> None:
    query_template = """\
        ALTER TABLE %(database)s.%(table)s DROP PARTITION ('%(date_str)s', %(retention_days)s)
    """

    for part_date, retention_days in parts:
        args = {
            "database": database,
            "table": table,
            "date_str": part_date.strftime("%Y-%m-%d"),
            "retention_days": retention_days,
        }

        query = (query_template % args).strip()
        if dry_run:
            logger.info("Dry run: " + query)
        else:
            logger.info("Dropping partition: " + query)
            clickhouse.execute(query)
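
A minimal usage sketch for the drop_partitions variant above, assuming an already constructed ClickhousePool; the connection parameters, database, and table name are placeholders, not values taken from the project. With dry_run=True the generated statements are only logged.

# Hypothetical invocation; connection parameters and table name are illustrative.
from datetime import datetime

pool = ClickhousePool("localhost", 9000, "default", "", "default")
parts = [
    (datetime(2019, 9, 1), 90),  # (partition date, retention_days)
    (datetime(2019, 9, 8), 30),
]
# Logs the ALTER TABLE ... DROP PARTITION statements without executing them.
drop_partitions(pool, "default", "errors_local", parts, dry_run=True)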
Example #3
def test_fallback_logic() -> None:
    state.set_config("use_fallback_host_in_native_connection_pool", 1)

    network_failure_connection = mock.Mock()
    network_failure_connection.execute.side_effect = EOFError()

    verification_connection = mock.Mock()
    verification_connection.execute.return_value = []

    pool = ClickhousePool(CLUSTER_HOST, CLUSTER_PORT, "test", "test",
                          TEST_DB_NAME)

    # The execute method will try to reuse a single slot in the connection
    # pool but reestablish new connections with _create_conn if a connection
    # fails with a network-related error. It may be cleaner to move connection
    # negotiation/establishment into another class for separation of concerns.
    with mock.patch.object(pool,
                           "_create_conn",
                           lambda x, y=False: network_failure_connection):
        pool.pool = queue.LifoQueue(1)
        pool.pool.put(network_failure_connection, block=False)
        pool.fallback_pool = queue.LifoQueue(1)
        pool.fallback_pool.put(verification_connection, block=False)
        pool.execute("SELECT something")

    assert (network_failure_connection.execute.call_count == 3
            ), "Expected three (failed) attempts with main connection pool"
    assert (
        verification_connection.execute.call_count == 1
    ), "Expected one (successful) attempt with fallback connection pool"
Example #4
def test_add_node() -> None:
    host_name = os.environ.get("CLICKHOUSE_HOST", "localhost")
    port = int(os.environ.get("CLICKHOUSE_PORT", 9000))
    user = "******"
    password = ""
    database = os.environ.get("CLICKHOUSE_DATABASE", "default")

    client = ClickhousePool(
        host_name,
        port,
        user,
        password,
        database,
    )

    assert set(client.execute("SHOW TABLES")) == set()

    runner.Runner.add_node(
        node_type=cluster.ClickhouseNodeType.LOCAL,
        storage_sets=[StorageSetKey.OUTCOMES],
        host_name=host_name,
        port=port,
        user=user,
        password=password,
        database=database,
    )

    assert set(client.execute("SHOW TABLES")) == {
        ("outcomes_raw_local", ),
        ("outcomes_hourly_local", ),
        ("outcomes_mv_hourly_local", ),
    }
Example #5
def test_reconnect(FakeClient: Client) -> None:
    # If the connection raises a NetworkError on the first attempt, make sure we call it a second time.
    FakeClient.return_value.execute.side_effect = [
        errors.NetworkError,
        '{"data": "to my face"}',
    ]
    cp = ClickhousePool("0:0:0:0", 9000, "default", "", "default")
    cp.execute("SHOW TABLES")
    assert FakeClient.return_value.execute.mock_calls == [
        call(
            "SHOW TABLES",
            params=None,
            with_column_types=False,
            query_id=None,
            settings=None,
            types_check=False,
            columnar=False,
        ),
        call(
            "SHOW TABLES",
            params=None,
            with_column_types=False,
            query_id=None,
            settings=None,
            types_check=False,
            columnar=False,
        ),
    ]
Example #6
    def test_reconnect(self, FakeClient):
        # If the connection raises a NetworkError on the first attempt, make sure we call it a second time.
        FakeClient.return_value.execute.side_effect = [
            errors.NetworkError, '{"data": "to my face"}'
        ]
        cp = ClickhousePool()
        cp.execute("SHOW TABLES")
        assert FakeClient.return_value.execute.mock_calls == [
            call("SHOW TABLES"), call("SHOW TABLES")
        ]
Example #7
def test_reconnect(FakeClient) -> None:
    # If the connection raises a NetworkError on the first attempt, make sure we call it a second time.
    FakeClient.return_value.execute.side_effect = [
        errors.NetworkError,
        '{"data": "to my face"}',
    ]
    cp = ClickhousePool("0:0:0:0", 9000, "default", "", "default")
    cp.execute("SHOW TABLES")
    assert FakeClient.return_value.execute.mock_calls == [
        call("SHOW TABLES"),
        call("SHOW TABLES"),
    ]
Example #8
def get_active_partitions(clickhouse: ClickhousePool,
                          storage: WritableTableStorage, database: str,
                          table: str) -> Sequence[util.Part]:

    response = clickhouse.execute(
        """
        SELECT DISTINCT partition
        FROM system.parts
        WHERE database = %(database)s
        AND table = %(table)s
        AND active = 1
        """,
        {
            "database": database,
            "table": table
        },
    )

    schema = storage.get_schema()
    assert isinstance(schema, TableSchema)
    part_format = schema.get_part_format()
    assert part_format is not None
    return [
        util.decode_part_str(part, part_format) for part, in response.results
    ]
Example #9
def perform_select_query(
    columns: Sequence[str],
    table: str,
    where: Optional[Dict[str, str]],
    limit: Optional[str],
    connection: ClickhousePool,
) -> Sequence[Any]:
    """ Performs a SELECT query, with optional WHERE and LIMIT clauses

    Arguments:
    columns -- a list of columns to be SELECTed
    table -- the name of the table the query is run against
    where -- a dict of WHERE conditions, (str, str) key-value pairs
    limit -- LIMIT argument, passed in as str
    connection -- ClickHouse connection object for query execution
    """

    select_clause = "SELECT " + (", ".join(columns))
    from_clause = " FROM " + table
    where_clause = ""

    if where:
        where_elems = [(key + " = " + "'" + where[key] + "'") for key in where]
        where_clause = " WHERE " + (" AND ".join(where_elems))

    limit_clause = (" LIMIT " + limit) if limit else ""
    full_query = select_clause + from_clause + where_clause + limit_clause

    return connection.execute(full_query)
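
A sketch of how this helper might be invoked, assuming a reachable ClickHouse server; the connection parameters, table, and column names are placeholders. Note that the function builds the query by plain string concatenation, so the WHERE values are not escaped.

# Hypothetical call; identifiers below are illustrative only.
pool = ClickhousePool("localhost", 9000, "default", "", "default")
rows = perform_select_query(
    columns=["project_id", "group_id"],
    table="groupassignee_local",
    where={"project_id": "2"},  # rendered as: WHERE project_id = '2'
    limit="10",                 # rendered as: LIMIT 10
    connection=pool,
)
# Resulting SQL:
# SELECT project_id, group_id FROM groupassignee_local WHERE project_id = '2' LIMIT 10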
Example #10
    def add_node(
        self,
        node_type: ClickhouseNodeType,
        storage_sets: Sequence[StorageSetKey],
        host_name: str,
        port: int,
        user: str,
        password: str,
        database: str,
    ) -> None:
        client_settings = ClickhouseClientSettings.MIGRATE.value
        clickhouse = ClickhousePool(
            host_name,
            port,
            user,
            password,
            database,
            client_settings=client_settings.settings,
            send_receive_timeout=client_settings.timeout,
        )

        migrations: List[Migration] = []

        for group in get_active_migration_groups():
            group_loader = get_group_loader(group)

            for migration_id in group_loader.get_migrations():
                migration = group_loader.load_migration(migration_id)
                migrations.append(migration)

        for migration in migrations:
            if isinstance(migration, ClickhouseNodeMigration):
                operations = (
                    migration.forwards_local()
                    if node_type == ClickhouseNodeType.LOCAL
                    else migration.forwards_dist()
                )

                for sql_op in operations:
                    if isinstance(sql_op, SqlOperation):
                        if sql_op._storage_set in storage_sets:
                            sql = sql_op.format_sql()
                            print(f"Executing {sql}")
                            clickhouse.execute(sql)
            elif isinstance(migration, CodeMigration):
                for python_op in migration.forwards_global():
                    python_op.execute_new_node(storage_sets)
Example #11
def optimize_partitions(
    clickhouse: ClickhousePool,
    database: str,
    table: str,
    parts: Sequence[util.Part],
    ignore_cutoff: bool,
    clickhouse_host: Optional[str] = None,
) -> None:
    query_template = """\
        OPTIMIZE TABLE %(database)s.%(table)s
        PARTITION %(partition)s FINAL
    """

    # Adding 10 minutes to the current time before finding the midnight time
    # to ensure this keeps working even if the system clock of the host that
    # starts the pod is slightly ahead of the system clock of the host running
    # the job. This prevents us from getting the wrong midnight.
    last_midnight = (datetime.now() + timedelta(minutes=10)).replace(
        hour=0, minute=0, second=0, microsecond=0)
    if not ignore_cutoff:
        cutoff_time: Optional[datetime] = (last_midnight +
                                           settings.OPTIMIZE_JOB_CUTOFF_TIME)
        logger.info("Cutoff time: %s", str(cutoff_time))
    else:
        cutoff_time = None
        logger.info("Ignoring cutoff time")

    for part in parts:
        if cutoff_time is not None and datetime.now() > cutoff_time:
            raise JobTimeoutException(
                "Optimize job is running past the cutoff time. Abandoning.")

        args = {
            "database": database,
            "table": table,
            "partition": part.name,
        }

        query = (query_template % args).strip()
        logger.info(f"Optimizing partition: {part.name}")
        start = time.time()
        clickhouse.execute(query)
        metrics.timing(
            "optimized_part",
            time.time() - start,
            tags=_get_metrics_tags(table, clickhouse_host),
        )
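
A sketch of calling this cutoff-aware variant from a maintenance script, assuming placeholder connection settings; the parts would normally come from get_partitions_to_optimize (Example #25).

# Hypothetical usage; host, table, and storage are placeholders.
pool = ClickhousePool("localhost", 9000, "default", "", "default")
parts = get_partitions_to_optimize(pool, storage, "default", "errors_local")
optimize_partitions(
    pool,
    "default",
    "errors_local",
    parts,
    ignore_cutoff=False,          # raises JobTimeoutException past the cutoff time
    clickhouse_host="localhost",  # only used for metrics tags
)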
Example #12
def pytest_configure() -> None:
    """
    Set up the Sentry SDK to avoid errors hidden by configuration.
    Ensure the snuba_test database exists
    """
    assert (
        settings.TESTING
    ), "settings.TESTING is False, try `SNUBA_SETTINGS=test` or `make test`"

    setup_sentry()

    for cluster in settings.CLUSTERS:
        connection = ClickhousePool(
            cluster["host"], cluster["port"], "default", "", "default",
        )
        database_name = cluster["database"]
        connection.execute(f"DROP DATABASE IF EXISTS {database_name};")
        connection.execute(f"CREATE DATABASE {database_name};")
Example #13
File: connect.py Project: getsentry/snuba
def check_clickhouse(clickhouse: ClickhousePool) -> None:
    ver = clickhouse.execute("SELECT version()").results[0][0]
    # Newer Altinity builds on ARM append a suffix to the version string,
    # which breaks this check, so strip it before comparing.
    ver = ver.replace(".testingarm", "")
    ver = ver.replace(".altinitystable", "")
    if version.parse(ver) < version.parse(CLICKHOUSE_SERVER_MIN_VERSION):
        raise InvalidClickhouseVersion(
            f"Snuba requires Clickhouse version {CLICKHOUSE_SERVER_MIN_VERSION} ({clickhouse.host}:{clickhouse.port} - {ver})"
        )
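
A minimal sketch of running the version check above, assuming placeholder connection parameters; CLICKHOUSE_SERVER_MIN_VERSION and InvalidClickhouseVersion come from the surrounding module.

# Hypothetical usage; the host and credentials are illustrative.
pool = ClickhousePool("localhost", 9000, "default", "", "default")
try:
    check_clickhouse(pool)
except InvalidClickhouseVersion as err:
    # The server reported a version older than CLICKHOUSE_SERVER_MIN_VERSION.
    print(err)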
Example #14
def test_concurrency_limit() -> None:
    connection = mock.Mock()
    connection.execute.side_effect = TestError("some error")

    state.set_config("simultaneous_queries_sleep_seconds", 0.5)

    pool = ClickhousePool("host", 100, "test", "test", "test")
    pool.pool = queue.LifoQueue(1)
    pool.pool.put(connection, block=False)

    with pytest.raises(ClickhouseError):
        pool.execute("SELECT something")
    connection.execute.assert_called_once()

    connection.reset_mock(side_effect=True)
    connection.execute.side_effect = TestConcurrentError("some error")

    with pytest.raises(ClickhouseError):
        pool.execute("SELECT something")
    assert connection.execute.call_count == 2, "Expected two attempts"
Example #15
def pytest_configure() -> None:
    """
    Set up the Sentry SDK to avoid errors hidden by configuration.
    Ensure the snuba_test database exists
    """
    setup_sentry()

    # There is only one cluster in test, so fetch the host from there.
    cluster = settings.CLUSTERS[0]

    connection = ClickhousePool(
        cluster["host"],
        cluster["port"],
        "default",
        "",
        "default",
    )

    database_name = cluster["database"]
    connection.execute(f"DROP DATABASE IF EXISTS {database_name};")
    connection.execute(f"CREATE DATABASE {database_name};")
Example #16
File: base.py Project: forkkit/snuba
class BaseTest(object):
    def setup_method(self, test_method, dataset_name=None):
        assert settings.TESTING, "settings.TESTING is False, try `SNUBA_SETTINGS=test` or `make test`"

        self.database = 'default'
        self.dataset_name = dataset_name

        if self.dataset_name:
            self.dataset = get_dataset(self.dataset_name)
            self.clickhouse = ClickhousePool()

            for statement in self.dataset.get_dataset_schemas(
            ).get_drop_statements():
                self.clickhouse.execute(statement)

            for statement in self.dataset.get_dataset_schemas(
            ).get_create_statements():
                self.clickhouse.execute(statement)

            redis_client.flushdb()

    def teardown_method(self, test_method):
        if self.dataset_name:
            for statement in self.dataset.get_dataset_schemas(
            ).get_drop_statements():
                self.clickhouse.execute(statement)

            redis_client.flushdb()
Example #17
def optimize_partitions(
    clickhouse: ClickhousePool,
    database: str,
    table: str,
    parts: Sequence[util.Part],
) -> None:

    query_template = """\
        OPTIMIZE TABLE %(database)s.%(table)s
        PARTITION %(partition)s FINAL
    """

    for part in parts:
        args = {
            "database": database,
            "table": table,
            "partition": part.name,
        }

        query = (query_template % args).strip()
        logger.info(f"Optimizing partition: {part.name}")
        clickhouse.execute(query)
Example #18
def drop_partitions(
    clickhouse: ClickhousePool,
    database: str,
    table: str,
    parts: Sequence[util.Part],
    dry_run: bool = True,
) -> None:
    query_template = """\
        ALTER TABLE %(database)s.%(table)s DROP PARTITION %(partition)s
    """

    for part in parts:
        args = {
            "database": database,
            "table": table,
            "partition": part.name,
        }

        query = (query_template % args).strip()
        if dry_run:
            logger.info("Dry run: " + query)
        else:
            logger.info("Dropping partition: " + query)
            clickhouse.execute(query)
Example #19
def get_active_partitions(
    clickhouse: ClickhousePool, database: str, table: str
) -> Sequence[util.Part]:
    response = clickhouse.execute(
        """
        SELECT DISTINCT partition
        FROM system.parts
        WHERE database = %(database)s
        AND table = %(table)s
        AND active = 1
        """,
        {"database": database, "table": table},
    )

    return [util.decode_part_str(part) for part, in response]
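
The parts returned here feed directly into the maintenance helpers above; a sketch chaining this function with the drop_partitions variant from Example #18, with placeholder connection parameters and table name.

# Hypothetical chaining of the two helpers; identifiers are illustrative.
pool = ClickhousePool("localhost", 9000, "default", "", "default")
active = get_active_partitions(pool, "default", "errors_local")
# Preview the ALTER TABLE ... DROP PARTITION statements without running them.
drop_partitions(pool, "default", "errors_local", active, dry_run=True)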
Example #20
    def test_messages(self):
        processor = GroupAssigneeProcessor('sentry_groupasignee')

        metadata = KafkaMessageMetadata(
            offset=42,
            partition=0,
        )

        begin_msg = json.loads(self.BEGIN_MSG)
        ret = processor.process_message(begin_msg, metadata)
        assert ret is None

        commit_msg = json.loads(self.COMMIT_MSG)
        ret = processor.process_message(commit_msg, metadata)
        assert ret is None

        insert_msg = json.loads(self.INSERT_MSG)
        ret = processor.process_message(insert_msg, metadata)
        assert ret.data == [self.PROCESSED]
        self.write_processed_records(ret.data)
        cp = ClickhousePool()
        ret = cp.execute("SELECT * FROM test_groupassignee_local;")
        assert ret[0] == (
            42,  # offset
            0,  # deleted
            2,  # project_id
            1359,  # group_id
            datetime(2019, 9, 19, 0, 17, 55),
            1,  # user_id
            None,  # team_id
        )

        update_msg = json.loads(self.UPDATE_MSG_NO_KEY_CHANGE)
        ret = processor.process_message(update_msg, metadata)
        assert ret.data == [self.PROCESSED]

        # Tests an update with a key change, which becomes two inserts:
        # a deletion marker for the old row and the insertion of the new row.
        update_msg = json.loads(self.UPDATE_MSG_WITH_KEY_CHANGE)
        ret = processor.process_message(update_msg, metadata)
        assert ret.data == [self.DELETED, self.PROCESSED_UPDATE]

        delete_msg = json.loads(self.DELETE_MSG)
        ret = processor.process_message(delete_msg, metadata)
        assert ret.data == [self.DELETED]
Example #21
    def test_bulk_load(self):
        row = GroupAssigneeRow.from_bulk({
            'project_id': '2',
            'group_id': '1359',
            'date_added': '2019-09-19 00:17:55+00',
            'user_id': '1',
            'team_id': '',
        })
        self.write_processed_records(row.to_clickhouse())
        cp = ClickhousePool()
        ret = cp.execute("SELECT * FROM test_groupassignee_local;")
        assert ret[0] == (
            0,  # offset
            0,  # deleted
            2,  # project_id
            1359,  # group_id
            datetime(2019, 9, 19, 0, 17, 55),
            1,  # user_id
            None,  # team_id
        )
Example #22
    def test_messages(self):
        processor = GroupedMessageProcessor('sentry_groupedmessage')

        metadata = KafkaMessageMetadata(
            offset=42,
            partition=0,
        )

        begin_msg = json.loads(self.BEGIN_MSG)
        ret = processor.process_message(begin_msg, metadata)
        assert ret is None

        commit_msg = json.loads(self.COMMIT_MSG)
        ret = processor.process_message(commit_msg, metadata)
        assert ret is None

        insert_msg = json.loads(self.INSERT_MSG)
        ret = processor.process_message(insert_msg, metadata)
        assert ret[1] == self.PROCESSED
        self.write_processed_records(ret[1])
        cp = ClickhousePool()
        ret = cp.execute("SELECT * FROM test_groupedmessage_local;")
        assert ret[0] == (
            42,  # offset
            0,  # deleted
            2,  # project_id
            74,  # id
            0,  # status
            datetime(2019, 6, 19, 6, 46, 28),
            datetime(2019, 6, 19, 6, 45, 32),
            datetime(2019, 6, 19, 6, 45, 32),
            None,
        )

        update_msg = json.loads(self.UPDATE_MSG)
        ret = processor.process_message(update_msg, metadata)
        assert ret[1] == self.PROCESSED

        delete_msg = json.loads(self.DELETE_MSG)
        ret = processor.process_message(delete_msg, metadata)
        assert ret[1] == self.DELETED
Example #23
    def test_bulk_load(self):
        row = GroupedMessageRow.from_bulk({
            'project_id': '2',
            'id': '10',
            'status': '0',
            'last_seen': '2019-06-28 17:57:32+00',
            'first_seen': '2019-06-28 06:40:17+00',
            'active_at': '2019-06-28 06:40:17+00',
            'first_release_id': '26',
        })
        self.write_processed_records(row.to_clickhouse())
        cp = ClickhousePool()
        ret = cp.execute("SELECT * FROM test_groupedmessage_local;")
        assert ret[0] == (
            0,  # offset
            0,  # deleted
            2,  # project_id
            10,  # id
            0,  # status
            datetime(2019, 6, 28, 17, 57, 32),
            datetime(2019, 6, 28, 6, 40, 17),
            datetime(2019, 6, 28, 6, 40, 17),
            26,
        )
Example #24
def check_clickhouse(clickhouse: ClickhousePool) -> None:
    ver = clickhouse.execute("SELECT version()")[0][0]
    if version.parse(ver) < version.parse(CLICKHOUSE_SERVER_MIN_VERSION):
        raise InvalidClickhouseVersion(
            f"Snuba requires Clickhouse version {CLICKHOUSE_SERVER_MIN_VERSION}"
        )
Example #25
def get_partitions_to_optimize(
    clickhouse: ClickhousePool,
    storage: ReadableTableStorage,
    database: str,
    table: str,
    before: Optional[datetime] = None,
) -> Sequence[util.Part]:
    engine = clickhouse.execute(
        """
        SELECT engine
        FROM system.tables
        WHERE (database = %(database)s) AND (name = %(table)s)
        """,
        {
            "database": database,
            "table": table
        },
    )

    if not engine:
        logger.warning("Table %s.%s doesn't exist on %s:%s" %
                       (database, table, clickhouse.host, clickhouse.port))
        return []

    if engine[0][0].startswith("Replicated"):
        is_leader = clickhouse.execute(
            """
            SELECT is_leader
            FROM system.replicas
            WHERE (database = %(database)s) AND (table = %(table)s)
            """,
            {
                "database": database,
                "table": table
            },
        )

        # response: [(0,)] for non-leader or [(1,)] for leader
        if not (len(is_leader) == 1 and is_leader[0][0]):
            return []

    active_parts = clickhouse.execute(
        """
        SELECT
            partition,
            count() AS c
        FROM system.parts
        WHERE active
        AND database = %(database)s
        AND table = %(table)s
        GROUP BY partition
        HAVING c > 1
        ORDER BY c DESC, partition
        """,
        {
            "database": database,
            "table": table
        },
    )

    schema = storage.get_schema()
    assert isinstance(schema, TableSchema)
    part_format = schema.get_part_format()
    assert part_format is not None

    parts = [
        util.decode_part_str(part, part_format) for part, count in active_parts
    ]

    if before:
        parts = [
            p for p in parts
            if (p.date + timedelta(days=6 - p.date.weekday())) < before
        ]

    return parts
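
A sketch showing how the returned parts might be passed on to the simple optimize_partitions variant from Example #17; the storage object, connection parameters, and table name are placeholders. The before argument keeps only partitions whose week has already ended before that date.

# Hypothetical usage; storage is a ReadableTableStorage obtained elsewhere.
pool = ClickhousePool("localhost", 9000, "default", "", "default")
parts = get_partitions_to_optimize(
    pool, storage, "default", "errors_local", before=datetime(2019, 9, 1)
)
optimize_partitions(pool, "default", "errors_local", parts)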