Example #1
    def __init__(self, check, config):
        self._check = check
        self._db = None
        self._config = config
        self._log = get_check_logger()
        self._activity_last_query_start = None
        self._last_check_run = 0
        self._collection_loop_future = None
        self._cancel_event = threading.Event()
        self._tags = None
        self._tags_str = None
        self._service = "postgres"
        self._db_hostname = resolve_db_host(self._config.host)
        self._enabled = is_affirmative(self._config.statement_samples_config.get('enabled', False))
        self._run_sync = is_affirmative(self._config.statement_samples_config.get('run_sync', False))
        self._rate_limiter = ConstantRateLimiter(
            float(self._config.statement_samples_config.get('collections_per_second', 1))
        )
        self._explain_function = self._config.statement_samples_config.get(
            'explain_function', 'datadog.explain_statement'
        )

        # explained_statements_cache: limit how often we try to re-explain the same query
        self._explained_statements_cache = TTLCache(
            maxsize=int(self._config.statement_samples_config.get('explained_statements_cache_maxsize', 5000)),
            ttl=60 * 60 / int(self._config.statement_samples_config.get('explained_statements_per_hour_per_query', 60)),
        )

        # seen_samples_cache: limit the ingestion rate per (query_signature, plan_signature)
        self._seen_samples_cache = TTLCache(
            # assuming ~100 bytes per entry (query & plan signature, key hash, 4 pointers (ordered dict), expiry time)
            # total size: 10k * 100 = 1 MB
            maxsize=int(self._config.statement_samples_config.get('seen_samples_cache_maxsize', 10000)),
            ttl=60 * 60 / int(self._config.statement_samples_config.get('samples_per_hour_per_query', 15)),
        )
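
The ttl=60 * 60 / N pattern above is what enforces the per-hour limits: once a key is inserted, lookups hit until the entry expires, so each key is re-processed at most roughly N times per hour. A minimal sketch of the pattern, assuming the cachetools package that provides TTLCache (the helper below is hypothetical):

from cachetools import TTLCache

explains_per_hour_per_query = 60
cache = TTLCache(maxsize=5000, ttl=60 * 60 / explains_per_hour_per_query)

def should_explain(query_signature):
    # hypothetical helper: skip queries already explained within the TTL window
    if query_signature in cache:
        return False
    cache[query_signature] = True
    return True
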
Example #2
def test_constant_rate_limiter():
    rate_limit = 8
    test_duration_s = 0.5
    ratelimiter = ConstantRateLimiter(rate_limit)
    start = time.time()
    sleep_count = 0
    while time.time() - start < test_duration_s:
        ratelimiter.sleep()
        sleep_count += 1
    expected_count = rate_limit * test_duration_s
    assert expected_count - 1 <= sleep_count <= expected_count + 1
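
The ConstantRateLimiter under test is not shown on this page. Below is a minimal sketch consistent with the test (about rate_limit calls to sleep() per second) and with the rate_limit_s attribute checked in the examples further down; the real implementation may differ:

import time

class ConstantRateLimiter:
    def __init__(self, rate_limit_s):
        # rate_limit_s: maximum number of operations per second
        self.rate_limit_s = rate_limit_s
        self.period_s = 1.0 / rate_limit_s if rate_limit_s > 0 else 0
        self._last_event = 0

    def sleep(self):
        # sleep long enough to keep the caller's loop at the configured rate
        elapsed = time.time() - self._last_event
        if elapsed < self.period_s:
            time.sleep(self.period_s - elapsed)
        self._last_event = time.time()
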
Example #3
    def _collect_statement_samples(self):
        self._log.debug("collecting statement samples")
        self._rate_limiter.sleep()
        events_statements_table, rate_limit = self._get_sample_collection_strategy()
        if not events_statements_table:
            return
        if self._rate_limiter.rate_limit_s != rate_limit:
            self._rate_limiter = ConstantRateLimiter(rate_limit)
        start_time = time.time()

        tags = self._tags + [
            "events_statements_table:{}".format(events_statements_table)
        ]
        rows = self._get_new_events_statements(
            events_statements_table, self._events_statements_row_limit)
        rows = self._filter_valid_statement_rows(rows)
        events = self._collect_plans_for_statements(rows)
        submitted_count = 0
        for e in events:
            self._check.database_monitoring_query_sample(
                json.dumps(e, default=default_json_event_encoding))
            submitted_count += 1
        self._check.histogram("dd.mysql.collect_statement_samples.time",
                              (time.time() - start_time) * 1000,
                              tags=tags)
        self._check.count(
            "dd.mysql.collect_statement_samples.events_submitted.count",
            submitted_count,
            tags=tags)
        self._check.gauge(
            "dd.mysql.collect_statement_samples.seen_samples_cache.len",
            len(self._seen_samples_cache),
            tags=tags)
        self._check.gauge(
            "dd.mysql.collect_statement_samples.explained_statements_cache.len",
            len(self._explained_statements_cache),
            tags=tags,
        )
        self._check.gauge(
            "dd.mysql.collect_statement_samples.collection_strategy_cache.len",
            len(self._collection_strategy_cache),
            tags=tags,
        )
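
Events are serialized with json.dumps(e, default=default_json_event_encoding). That encoder is not shown on this page; a plausible sketch is a fallback hook for values the standard encoder rejects, such as Decimal and datetime objects returned by database drivers (the helper below is an assumption, not the actual implementation):

import json
from datetime import datetime
from decimal import Decimal

def default_json_event_encoding(o):
    # hypothetical fallback for json.dumps(..., default=...)
    if isinstance(o, Decimal):
        return float(o)
    if isinstance(o, datetime):
        return o.isoformat()
    raise TypeError("{!r} is not JSON serializable".format(o))

payload = json.dumps({"cost": Decimal("12.34")}, default=default_json_event_encoding)
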
Example #4
    def _collect_statement_samples(self):
        self._rate_limiter.sleep()
        events_statements_table, rate_limit = self._get_sample_collection_strategy()
        if not events_statements_table:
            return
        if self._rate_limiter.rate_limit_s != rate_limit:
            self._rate_limiter = ConstantRateLimiter(rate_limit)
        start_time = time.time()

        tags = self._tags + [
            "events_statements_table:{}".format(events_statements_table)
        ]
        rows = self._get_new_events_statements(
            events_statements_table, self._events_statements_row_limit)
        rows = self._filter_valid_statement_rows(rows)
        events = self._collect_plans_for_statements(rows)
        submitted_count, failed_count = self._statement_samples_client.submit_events(
            events)
        self._check.count("dd.mysql.statement_samples.error",
                          failed_count,
                          tags=self._tags + ["error:submit-events"])
        self._check.histogram("dd.mysql.collect_statement_samples.time",
                              (time.time() - start_time) * 1000,
                              tags=tags)
        self._check.count(
            "dd.mysql.collect_statement_samples.events_submitted.count",
            submitted_count,
            tags=tags)
        self._check.gauge(
            "dd.mysql.collect_statement_samples.seen_samples_cache.len",
            len(self._seen_samples_cache),
            tags=tags)
        self._check.gauge(
            "dd.mysql.collect_statement_samples.explained_statements_cache.len",
            len(self._explained_statements_cache),
            tags=tags,
        )
        self._check.gauge(
            "dd.mysql.collect_statement_samples.collection_strategy_cache.len",
            len(self._collection_strategy_cache),
            tags=tags,
        )
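
Compared with Example #3, this variant routes events through a statement samples client whose submit_events returns a (submitted_count, failed_count) pair. A hypothetical sketch of such a client (the transport is out of scope here):

import json

class StatementSamplesClient:
    def submit_events(self, events):
        # count successes and failures so the caller can emit both metrics
        submitted, failed = 0, 0
        for event in events:
            try:
                self._submit(json.dumps(event))
                submitted += 1
            except Exception:
                failed += 1
        return submitted, failed

    def _submit(self, payload):
        # hypothetical: send the serialized event to the backend
        pass
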
Example #5
    def __init__(self, check, config, connection_args):
        self._check = check
        self._version_processed = False
        self._connection_args = connection_args
        # checkpoint at zero so we pull the whole history table on the first run
        self._checkpoint = 0
        self._log = get_check_logger()
        self._last_check_run = 0
        self._db = None
        self._tags = None
        self._tags_str = None
        self._service = "mysql"
        self._collection_loop_future = None
        self._cancel_event = threading.Event()
        self._rate_limiter = ConstantRateLimiter(1)
        self._config = config
        self._db_hostname = resolve_db_host(self._config.host)
        self._enabled = is_affirmative(
            self._config.statement_samples_config.get('enabled', False))
        self._run_sync = is_affirmative(
            self._config.statement_samples_config.get('run_sync', False))
        self._collections_per_second = self._config.statement_samples_config.get(
            'collections_per_second', -1)
        self._events_statements_row_limit = self._config.statement_samples_config.get(
            'events_statements_row_limit', 5000)
        self._explain_procedure = self._config.statement_samples_config.get(
            'explain_procedure', 'explain_statement')
        self._fully_qualified_explain_procedure = self._config.statement_samples_config.get(
            'fully_qualified_explain_procedure', 'datadog.explain_statement')
        self._events_statements_temp_table = self._config.statement_samples_config.get(
            'events_statements_temp_table_name', 'datadog.temp_events')
        self._events_statements_enable_procedure = self._config.statement_samples_config.get(
            'events_statements_enable_procedure',
            'datadog.enable_events_statements_consumers')
        self._preferred_events_statements_tables = EVENTS_STATEMENTS_PREFERRED_TABLES
        self._has_window_functions = False
        events_statements_table = self._config.statement_samples_config.get(
            'events_statements_table', None)
        if events_statements_table:
            if events_statements_table in DEFAULT_EVENTS_STATEMENTS_COLLECTIONS_PER_SECOND:
                self._log.debug(
                    "Configured preferred events_statements_table: %s",
                    events_statements_table)
                self._preferred_events_statements_tables = [
                    events_statements_table
                ]
            else:
                self._log.warning(
                    "Invalid events_statements_table: %s. Must be one of %s. Falling back to trying all tables.",
                    events_statements_table,
                    ', '.join(DEFAULT_EVENTS_STATEMENTS_COLLECTIONS_PER_SECOND.
                              keys()),
                )
        self._explain_strategies = {
            'PROCEDURE': self._run_explain_procedure,
            'FQ_PROCEDURE': self._run_fully_qualified_explain_procedure,
            'STATEMENT': self._run_explain,
        }
        self._preferred_explain_strategies = [
            'PROCEDURE', 'FQ_PROCEDURE', 'STATEMENT'
        ]
        self._init_caches()
        self._statement_samples_client = _new_statement_samples_client()
Example #6
class MySQLStatementSamples(object):
    """
    Collects statement samples and execution plans.
    """

    executor = ThreadPoolExecutor()

    def __init__(self, check, config, connection_args):
        self._check = check
        self._version_processed = False
        self._connection_args = connection_args
        # checkpoint at zero so we pull the whole history table on the first run
        self._checkpoint = 0
        self._log = get_check_logger()
        self._last_check_run = 0
        self._db = None
        self._tags = None
        self._tags_str = None
        self._service = "mysql"
        self._collection_loop_future = None
        self._cancel_event = threading.Event()
        self._rate_limiter = ConstantRateLimiter(1)
        self._config = config
        self._db_hostname = resolve_db_host(self._config.host)
        self._enabled = is_affirmative(
            self._config.statement_samples_config.get('enabled', False))
        self._run_sync = is_affirmative(
            self._config.statement_samples_config.get('run_sync', False))
        self._collections_per_second = self._config.statement_samples_config.get(
            'collections_per_second', -1)
        self._events_statements_row_limit = self._config.statement_samples_config.get(
            'events_statements_row_limit', 5000)
        self._explain_procedure = self._config.statement_samples_config.get(
            'explain_procedure', 'explain_statement')
        self._fully_qualified_explain_procedure = self._config.statement_samples_config.get(
            'fully_qualified_explain_procedure', 'datadog.explain_statement')
        self._events_statements_temp_table = self._config.statement_samples_config.get(
            'events_statements_temp_table_name', 'datadog.temp_events')
        self._events_statements_enable_procedure = self._config.statement_samples_config.get(
            'events_statements_enable_procedure',
            'datadog.enable_events_statements_consumers')
        self._preferred_events_statements_tables = EVENTS_STATEMENTS_PREFERRED_TABLES
        self._has_window_functions = False
        events_statements_table = self._config.statement_samples_config.get(
            'events_statements_table', None)
        if events_statements_table:
            if events_statements_table in DEFAULT_EVENTS_STATEMENTS_COLLECTIONS_PER_SECOND:
                self._log.debug(
                    "Configured preferred events_statements_table: %s",
                    events_statements_table)
                self._preferred_events_statements_tables = [
                    events_statements_table
                ]
            else:
                self._log.warning(
                    "Invalid events_statements_table: %s. Must be one of %s. Falling back to trying all tables.",
                    events_statements_table,
                    ', '.join(DEFAULT_EVENTS_STATEMENTS_COLLECTIONS_PER_SECOND.
                              keys()),
                )
        self._explain_strategies = {
            'PROCEDURE': self._run_explain_procedure,
            'FQ_PROCEDURE': self._run_fully_qualified_explain_procedure,
            'STATEMENT': self._run_explain,
        }
        self._preferred_explain_strategies = [
            'PROCEDURE', 'FQ_PROCEDURE', 'STATEMENT'
        ]
        self._init_caches()
        self._statement_samples_client = _new_statement_samples_client()

    def _init_caches(self):
        self._collection_strategy_cache = TTLCache(
            maxsize=self._config.statement_samples_config.get(
                'collection_strategy_cache_maxsize', 1000),
            ttl=self._config.statement_samples_config.get(
                'collection_strategy_cache_ttl', 300),
        )

        # explained_statements_cache: limit how often we try to re-explain the same query
        self._explained_statements_cache = TTLCache(
            maxsize=self._config.statement_samples_config.get(
                'explained_statements_cache_maxsize', 5000),
            ttl=60 * 60 / self._config.statement_samples_config.get(
                'explained_statements_per_hour_per_query', 60),
        )

        # seen_samples_cache: limit the ingestion rate per (query_signature, plan_signature)
        self._seen_samples_cache = TTLCache(
            # assuming ~100 bytes per entry (query & plan signature, key hash, 4 pointers (ordered dict), expiry time)
            # total size: 10k * 100 = 1 MB
            maxsize=self._config.statement_samples_config.get(
                'seen_samples_cache_maxsize', 10000),
            ttl=60 * 60 / self._config.statement_samples_config.get(
                'samples_per_hour_per_query', 15),
        )

    def run_sampler(self, tags):
        """
        start the sampler thread if not already running & update tag metadata
        :param tags:
        :return:
        """
        if not self._enabled:
            self._log.debug("Statement sampler not enabled")
            return
        self._tags = tags
        self._tags_str = ','.join(tags)
        for t in self._tags:
            if t.startswith('service:'):
                self._service = t[len('service:'):]
        if not self._version_processed and self._check.version:
            self._has_window_functions = self._check.version.version_compatible(
                (8, 0, 0))
            if self._check.version.flavor == "MariaDB" or not self._check.version.version_compatible(
                (5, 7, 0)):
                self._global_status_table = "information_schema.global_status"
            else:
                self._global_status_table = "performance_schema.global_status"
            self._version_processed = True
        self._last_check_run = time.time()
        if self._run_sync or is_affirmative(
                os.environ.get('DBM_STATEMENT_SAMPLER_RUN_SYNC', "false")):
            self._log.debug("Running statement sampler synchronously")
            self._collect_statement_samples()
        elif self._collection_loop_future is None or not self._collection_loop_future.running():
            self._collection_loop_future = MySQLStatementSamples.executor.submit(
                self.collection_loop)
        else:
            self._log.debug(
                "Statement sampler collection loop already running")

    def cancel(self):
        """
        Cancels the collection loop thread if it's running.
        Returns immediately, leaving the thread to stop & clean up on its own time.
        """
        self._cancel_event.set()

    def _get_db_connection(self):
        """
        lazy reconnect db
        pymysql connections are not thread safe so we can't reuse the same connection from the main check
        :return:
        """
        if not self._db:
            self._db = pymysql.connect(**self._connection_args)
        return self._db

    def _close_db_conn(self):
        if self._db:
            try:
                self._db.close()
            except Exception:
                self._log.debug("Failed to close db connection", exc_info=1)
            finally:
                self._db = None

    def collection_loop(self):
        try:
            self._log.info("Starting statement sampler collection loop")
            while True:
                if self._cancel_event.is_set():
                    self._log.info("Collection loop cancelled")
                    self._check.count(
                        "dd.mysql.statement_samples.collection_loop_cancel",
                        1,
                        tags=self._tags)
                    break
                if time.time() - self._last_check_run > self._config.min_collection_interval * 2:
                    self._log.info(
                        "Stopping statement sampler collection loop due to check inactivity"
                    )
                    self._check.count(
                        "dd.mysql.statement_samples.collection_loop_inactive_stop",
                        1,
                        tags=self._tags)
                    break
                self._collect_statement_samples()
        except pymysql.err.DatabaseError as e:
            self._log.warning(
                "Statement sampler database error: %s",
                e,
                exc_info=self._log.getEffectiveLevel() == logging.DEBUG)
            self._check.count(
                "dd.mysql.statement_samples.error",
                1,
                tags=self._tags +
                ["error:collection-loop-database-error-{}".format(type(e))],
            )
        except Exception as e:
            self._log.exception("Statement sampler collection loop crash")
            self._check.count(
                "dd.mysql.statement_samples.error",
                1,
                tags=self._tags +
                ["error:collection-loop-crash-{}".format(type(e))],
            )
        finally:
            self._log.info("Shutting down statement sampler collection loop")
            self._close_db_conn()

    def _cursor_run(self, cursor, query, params=None, obfuscated_params=None):
        """
        Run and log the query. If provided, obfuscated params are logged in place of the regular params.
        """
        self._log.debug("Running query [%s] %s", query,
                        obfuscated_params if obfuscated_params else params)
        cursor.execute(query, params)

    def _get_new_events_statements(self, events_statements_table, row_limit):
        # Select the most recent events with a bias towards events which have higher wait times
        start = time.time()
        drop_temp_table_query = "DROP TEMPORARY TABLE IF EXISTS {}".format(
            self._events_statements_temp_table)
        params = (self._checkpoint, row_limit)
        with closing(self._get_db_connection().cursor(
                pymysql.cursors.DictCursor)) as cursor:
            # silence expected warnings to avoid spam
            cursor.execute('SET @@SESSION.sql_notes = 0')
            try:
                self._cursor_run(cursor, drop_temp_table_query)
                self._cursor_run(
                    cursor,
                    CREATE_TEMP_TABLE.format(
                        temp_table=self._events_statements_temp_table,
                        statements_table="performance_schema." +
                        events_statements_table,
                    ),
                    params,
                )
            except pymysql.err.DatabaseError as e:
                self._check.count(
                    "dd.mysql.statement_samples.error",
                    1,
                    tags=self._tags +
                    ["error:create-temp-table-{}".format(type(e))],
                )
                raise
            if self._has_window_functions:
                sub_select = SUB_SELECT_EVENTS_WINDOW
            else:
                self._cursor_run(cursor, "set @row_num = 0")
                self._cursor_run(cursor, "set @current_digest = ''")
                sub_select = SUB_SELECT_EVENTS_NUMBERED
            self._cursor_run(
                cursor,
                "set @startup_time_s = {}".format(
                    STARTUP_TIME_SUBQUERY.format(
                        global_status_table=self._global_status_table)),
            )
            self._cursor_run(
                cursor,
                EVENTS_STATEMENTS_QUERY.format(
                    statements_numbered=sub_select.format(
                        statements_table=self._events_statements_temp_table)),
                params,
            )
            rows = cursor.fetchall()
            self._cursor_run(cursor, drop_temp_table_query)
            tags = self._tags + [
                "events_statements_table:{}".format(events_statements_table)
            ]
            self._check.histogram("dd.mysql.get_new_events_statements.time",
                                  (time.time() - start) * 1000,
                                  tags=tags)
            self._check.histogram("dd.mysql.get_new_events_statements.rows",
                                  len(rows),
                                  tags=tags)
            self._log.debug("Read %s rows from %s", len(rows),
                            events_statements_table)
            return rows

    def _filter_valid_statement_rows(self, rows):
        num_sent = 0
        num_truncated = 0

        for row in rows:
            if not row or not all(row):
                self._log.debug(
                    'Row was unexpectedly truncated or the events_statements table is not enabled'
                )
                continue
            sql_text = row['sql_text']
            if not sql_text:
                continue
            # The SQL_TEXT column will store 1024 chars by default. Plans cannot be captured on truncated
            # queries, so the `performance_schema_max_sql_text_length` variable must be raised.
            if sql_text.endswith('...'):
                num_truncated += 1
                continue
            yield row
            # only save the checkpoint for rows that we have successfully processed
            # else rows that we ignore can push the checkpoint forward causing us to miss some on the next run
            if row['timer_start'] > self._checkpoint:
                self._checkpoint = row['timer_start']
            num_sent += 1

        if num_truncated > 0:
            self._log.warning(
                'Unable to collect %d/%d statement samples due to truncated SQL text. Consider raising '
                '`performance_schema_max_sql_text_length` to capture these queries.',
                num_truncated,
                num_truncated + num_sent,
            )
            self._check.count("dd.mysql.statement_samples.error",
                              1,
                              tags=self._tags + ["error:truncated-sql-text"])

    def _collect_plan_for_statement(self, row):
        # Plans have several important signatures to tag events with:
        # - `plan_signature` - hash computed from the normalized JSON plan to group identical plan trees
        # - `resource_hash` - hash computed off the raw sql text to match apm resources
        # - `query_signature` - hash computed from the digest text to match query metrics

        try:
            obfuscated_statement = datadog_agent.obfuscate_sql(row['sql_text'])
            obfuscated_digest_text = datadog_agent.obfuscate_sql(
                row['digest_text'])
        except Exception:
            # do not log the raw sql_text to avoid leaking sensitive data into logs. digest_text is safe as parameters
            # are obfuscated by the database
            self._log.debug("Failed to obfuscate statement: %s",
                            row['digest_text'])
            self._check.count("dd.mysql.statement_samples.error",
                              1,
                              tags=self._tags + ["error:sql-obfuscate"])
            return None

        apm_resource_hash = compute_sql_signature(obfuscated_statement)
        query_signature = compute_sql_signature(obfuscated_digest_text)

        query_cache_key = (row['current_schema'], query_signature)
        if query_cache_key in self._explained_statements_cache:
            return None
        self._explained_statements_cache[query_cache_key] = True

        plan = None
        with closing(self._get_db_connection().cursor()) as cursor:
            try:
                plan = self._explain_statement(cursor, row['sql_text'],
                                               row['current_schema'],
                                               obfuscated_statement)
            except Exception as e:
                self._check.count("dd.mysql.statement_samples.error",
                                  1,
                                  tags=self._tags +
                                  ["error:explain-{}".format(type(e))])
                self._log.exception("Failed to explain statement: %s",
                                    obfuscated_statement)

        normalized_plan, obfuscated_plan, plan_signature, plan_cost = None, None, None, None
        if plan:
            normalized_plan = datadog_agent.obfuscate_sql_exec_plan(
                plan, normalize=True)
            obfuscated_plan = datadog_agent.obfuscate_sql_exec_plan(plan)
            plan_signature = compute_exec_plan_signature(normalized_plan)
            plan_cost = self._parse_execution_plan_cost(plan)

        query_plan_cache_key = (query_cache_key, plan_signature)
        if query_plan_cache_key not in self._seen_samples_cache:
            self._seen_samples_cache[query_plan_cache_key] = True
            return {
                "timestamp": row["timer_end_time_s"] * 1000,
                "host": self._db_hostname,
                "service": self._service,
                "ddsource": "mysql",
                "ddtags": self._tags_str,
                "duration": row['timer_wait_ns'],
                "network": {
                    "client": {
                        "ip": row.get('processlist_host', None),
                    }
                },
                "db": {
                    "instance": row['current_schema'],
                    "plan": {
                        "definition": obfuscated_plan,
                        "cost": plan_cost,
                        "signature": plan_signature
                    },
                    "query_signature": query_signature,
                    "resource_hash": apm_resource_hash,
                    "statement": obfuscated_statement,
                },
                'mysql': {
                    k: v
                    for k, v in row.items()
                    if k not in EVENTS_STATEMENTS_SAMPLE_EXCLUDE_KEYS
                },
            }

    def _collect_plans_for_statements(self, rows):
        for row in rows:
            try:
                event = self._collect_plan_for_statement(row)
                if event:
                    yield event
            except Exception:
                self._log.debug("Failed to collect plan for statement",
                                exc_info=1)

    def _get_enabled_performance_schema_consumers(self):
        """
        Returns the list of available performance schema consumers
        I.e. (events_statements_current, events_statements_history)
        :return:
        """
        with closing(self._get_db_connection().cursor()) as cursor:
            self._cursor_run(cursor, ENABLED_STATEMENTS_CONSUMERS_QUERY)
            return {r[0] for r in cursor.fetchall()}

    def _enable_events_statements_consumers(self):
        """
        Enable events statements consumers
        :return:
        """
        try:
            with closing(self._get_db_connection().cursor()) as cursor:
                self._cursor_run(
                    cursor, 'CALL {}()'.format(
                        self._events_statements_enable_procedure))
        except pymysql.err.DatabaseError as e:
            self._log.debug(
                "failed to enable events_statements consumers using procedure=%s: %s",
                self._events_statements_enable_procedure,
                e,
            )

    def _get_sample_collection_strategy(self):
        """
        Decides on the plan collection strategy:
        - which events_statement_history-* table are we using
        - how long should the rate and time limits be
        :return: (table, rate_limit)
        """
        cached_strategy = self._collection_strategy_cache.get(
            "plan_collection_strategy")
        if cached_strategy:
            self._log.debug("Using cached plan_collection_strategy: %s",
                            cached_strategy)
            return cached_strategy

        enabled_consumers = self._get_enabled_performance_schema_consumers()
        if len(enabled_consumers) < 3:
            self._enable_events_statements_consumers()
            enabled_consumers = self._get_enabled_performance_schema_consumers()

        if not enabled_consumers:
            self._log.warning(
                "Cannot collect statement samples as there are no enabled performance_schema.events_statements_* "
                "consumers. Enable performance_schema and at least one events_statements consumer in order to collect "
                "statement samples.")
            self._check.count(
                "dd.mysql.statement_samples.error",
                1,
                tags=self._tags +
                ["error:no-enabled-events-statements-consumers"],
            )
            return None, None
        self._log.debug(
            "Found enabled performance_schema statements consumers: %s",
            enabled_consumers)

        events_statements_table = None
        for table in self._preferred_events_statements_tables:
            if table not in enabled_consumers:
                continue
            rows = self._get_new_events_statements(table, 1)
            if not rows:
                self._log.debug(
                    "No statements found in %s table. checking next one.",
                    table)
                continue
            events_statements_table = table
            break
        if not events_statements_table:
            self._log.warning(
                "Cannot collect statement samples as all enabled events_statements_consumers %s are empty.",
                enabled_consumers,
            )
            return None, None

        rate_limit = self._collections_per_second
        if rate_limit < 0:
            rate_limit = DEFAULT_EVENTS_STATEMENTS_COLLECTIONS_PER_SECOND[
                events_statements_table]

        # cache only successful strategies
        # should be short enough that we'll reflect updates relatively quickly
        # i.e., an aurora replica becomes a master (or vice versa).
        strategy = (events_statements_table, rate_limit)
        self._log.debug(
            "Chose plan collection strategy: events_statements_table=%s, collections_per_second=%s",
            events_statements_table,
            rate_limit,
        )
        self._collection_strategy_cache["plan_collection_strategy"] = strategy
        return strategy

    def _collect_statement_samples(self):
        self._rate_limiter.sleep()
        events_statements_table, rate_limit = self._get_sample_collection_strategy()
        if not events_statements_table:
            return
        if self._rate_limiter.rate_limit_s != rate_limit:
            self._rate_limiter = ConstantRateLimiter(rate_limit)
        start_time = time.time()

        tags = self._tags + [
            "events_statements_table:{}".format(events_statements_table)
        ]
        rows = self._get_new_events_statements(
            events_statements_table, self._events_statements_row_limit)
        rows = self._filter_valid_statement_rows(rows)
        events = self._collect_plans_for_statements(rows)
        submitted_count, failed_count = self._statement_samples_client.submit_events(
            events)
        self._check.count("dd.mysql.statement_samples.error",
                          failed_count,
                          tags=self._tags + ["error:submit-events"])
        self._check.histogram("dd.mysql.collect_statement_samples.time",
                              (time.time() - start_time) * 1000,
                              tags=tags)
        self._check.count(
            "dd.mysql.collect_statement_samples.events_submitted.count",
            submitted_count,
            tags=tags)
        self._check.gauge(
            "dd.mysql.collect_statement_samples.seen_samples_cache.len",
            len(self._seen_samples_cache),
            tags=tags)
        self._check.gauge(
            "dd.mysql.collect_statement_samples.explained_statements_cache.len",
            len(self._explained_statements_cache),
            tags=tags,
        )
        self._check.gauge(
            "dd.mysql.collect_statement_samples.collection_strategy_cache.len",
            len(self._collection_strategy_cache),
            tags=tags,
        )

    def _explain_statement(self, cursor, statement, schema,
                           obfuscated_statement):
        """
        Tries the available methods used to explain a statement for the given schema. If a non-retryable
        error occurs (such as a permissions error), then statements executed under the schema will be
        disallowed in future attempts.
        """
        start_time = time.time()
        strategy_cache_key = 'explain_strategy:%s' % schema
        explain_strategy_error = 'ERROR'
        tags = self._tags + ["schema:{}".format(schema)]

        self._log.debug('explaining statement. schema=%s, statement="%s"',
                        schema, statement)

        if not self._can_explain(obfuscated_statement):
            self._log.debug('Skipping statement which cannot be explained: %s',
                            obfuscated_statement)
            return None

        if self._collection_strategy_cache.get(
                strategy_cache_key) == explain_strategy_error:
            self._log.debug(
                'Skipping statement due to cached collection failure: %s',
                obfuscated_statement)
            return None

        try:
            # If there was a default schema when this query was run, then switch to it before trying to collect
            # the execution plan. This is necessary when the statement uses non-fully qualified tables
            # e.g. `select * from mytable` instead of `select * from myschema.mytable`
            if schema:
                self._cursor_run(cursor, 'USE `{}`'.format(schema))
        except pymysql.err.DatabaseError as e:
            if len(e.args) != 2:
                raise
            if e.args[0] in PYMYSQL_NON_RETRYABLE_ERRORS:
                self._collection_strategy_cache[
                    strategy_cache_key] = explain_strategy_error
            self._check.count("dd.mysql.statement_samples.error",
                              1,
                              tags=tags +
                              ["error:explain-use-schema-{}".format(type(e))])
            self._log.debug(
                'Failed to collect execution plan because schema could not be accessed. error=%s, schema=%s, '
                'statement="%s"',
                e.args,
                schema,
                obfuscated_statement,
            )
            return None

        # Use a cached strategy for the schema, if any, or try each strategy to collect plans
        strategies = list(self._preferred_explain_strategies)
        cached = self._collection_strategy_cache.get(strategy_cache_key)
        if cached:
            strategies.remove(cached)
            strategies.insert(0, cached)

        for strategy in strategies:
            try:
                if not schema and strategy == "PROCEDURE":
                    self._log.debug(
                        'skipping PROCEDURE strategy as there is no default schema for this statement="%s"',
                        obfuscated_statement,
                    )
                    continue
                plan = self._explain_strategies[strategy](cursor, statement,
                                                          obfuscated_statement)
                if plan:
                    self._collection_strategy_cache[
                        strategy_cache_key] = strategy
                    self._log.debug(
                        'Successfully collected execution plan. strategy=%s, schema=%s, statement="%s"',
                        strategy,
                        schema,
                        obfuscated_statement,
                    )
                    self._check.histogram(
                        "dd.mysql.run_explain.time",
                        (time.time() - start_time) * 1000,
                        tags=self._tags + ["strategy:{}".format(strategy)],
                    )
                    return plan
            except pymysql.err.DatabaseError as e:
                if len(e.args) != 2:
                    raise
                # we don't cache failed plan collection failures for specific queries because some queries in a schema
                # can fail while others succeed. The failed collection will be cached for the specific query
                # so we won't try to explain it again for the cache duration there.
                self._check.count(
                    "dd.mysql.statement_samples.error",
                    1,
                    tags=tags +
                    ["error:explain-attempt-{}-{}".format(strategy, type(e))],
                )
                self._log.debug(
                    'Failed to collect execution plan. error=%s, strategy=%s, schema=%s, statement="%s"',
                    e.args,
                    strategy,
                    schema,
                    obfuscated_statement,
                )
                continue
        return None

    def _run_explain(self, cursor, statement, obfuscated_statement):
        """
        Run the explain using the EXPLAIN statement
        """
        self._log.debug("running query [EXPLAIN FORMAT=json %s]",
                        obfuscated_statement)
        cursor.execute('EXPLAIN FORMAT=json {}'.format(statement))
        return cursor.fetchone()[0]

    def _run_explain_procedure(self, cursor, statement, obfuscated_statement):
        """
        Run the explain by calling the stored procedure if available.
        """
        self._cursor_run(cursor, 'CALL {}(%s)'.format(self._explain_procedure),
                         statement, obfuscated_statement)
        return cursor.fetchone()[0]

    def _run_fully_qualified_explain_procedure(self, cursor, statement,
                                               obfuscated_statement):
        """
        Run the explain by calling the fully qualified stored procedure if available.
        """
        self._cursor_run(
            cursor,
            'CALL {}(%s)'.format(self._fully_qualified_explain_procedure),
            statement, obfuscated_statement)
        return cursor.fetchone()[0]

    @staticmethod
    def _can_explain(obfuscated_statement):
        return obfuscated_statement.split(
            ' ', 1)[0].lower() in VALID_EXPLAIN_STATEMENTS

    @staticmethod
    def _parse_execution_plan_cost(execution_plan):
        """
        Parses the total cost from the execution plan, if set. If not set, returns cost of 0.
        """
        cost = json.loads(execution_plan).get('query_block', {}).get(
            'cost_info', {}).get('query_cost', 0.0)
        return float(cost or 0.0)
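
_parse_execution_plan_cost reads query_block.cost_info.query_cost from the JSON document produced by EXPLAIN FORMAT=json. A worked example of that lookup chain with a hypothetical plan:

import json

plan = '{"query_block": {"cost_info": {"query_cost": "12.34"}}}'
cost = json.loads(plan).get('query_block', {}).get('cost_info', {}).get('query_cost', 0.0)
assert float(cost or 0.0) == 12.34  # plans without cost_info fall back to 0.0
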
Example #7
class PostgresStatementSamples(object):
    """
    Collects statement samples and execution plans.
    """

    executor = ThreadPoolExecutor()

    def __init__(self, check, config):
        self._check = check
        self._db = None
        self._config = config
        self._log = get_check_logger()
        self._activity_last_query_start = None
        self._last_check_run = 0
        self._collection_loop_future = None
        self._cancel_event = threading.Event()
        self._tags = None
        self._tags_str = None
        self._service = "postgres"
        self._db_hostname = resolve_db_host(self._config.host)
        self._enabled = is_affirmative(self._config.statement_samples_config.get('enabled', False))
        self._run_sync = is_affirmative(self._config.statement_samples_config.get('run_sync', False))
        self._rate_limiter = ConstantRateLimiter(
            float(self._config.statement_samples_config.get('collections_per_second', 1))
        )
        self._explain_function = self._config.statement_samples_config.get(
            'explain_function', 'datadog.explain_statement'
        )

        # explained_statements_cache: limit how often we try to re-explain the same query
        self._explained_statements_cache = TTLCache(
            maxsize=int(self._config.statement_samples_config.get('explained_statements_cache_maxsize', 5000)),
            ttl=60 * 60 / int(self._config.statement_samples_config.get('explained_statements_per_hour_per_query', 60)),
        )

        # seen_samples_cache: limit the ingestion rate per (query_signature, plan_signature)
        self._seen_samples_cache = TTLCache(
            # assuming ~100 bytes per entry (query & plan signature, key hash, 4 pointers (ordered dict), expiry time)
            # total size: 10k * 100 = 1 MB
            maxsize=int(self._config.statement_samples_config.get('seen_samples_cache_maxsize', 10000)),
            ttl=60 * 60 / int(self._config.statement_samples_config.get('samples_per_hour_per_query', 15)),
        )

    def cancel(self):
        self._cancel_event.set()

    def run_sampler(self, tags):
        """
        start the sampler thread if not already running
        :param tags:
        :return:
        """
        if not self._enabled:
            self._log.debug("Statement sampler not enabled")
            return
        self._tags = tags
        self._tags_str = ','.join(self._tags)
        for t in self._tags:
            if t.startswith('service:'):
                self._service = t[len('service:') :]
        self._last_check_run = time.time()
        if self._run_sync or is_affirmative(os.environ.get('DBM_STATEMENT_SAMPLER_RUN_SYNC', "false")):
            self._log.debug("Running statement sampler synchronously")
            self._collect_statement_samples()
        elif self._collection_loop_future is None or not self._collection_loop_future.running():
            self._collection_loop_future = PostgresStatementSamples.executor.submit(self._collection_loop)
        else:
            self._log.debug("Statement sampler collection loop already running")

    def _get_new_pg_stat_activity(self):
        start_time = time.time()
        query = PG_STAT_ACTIVITY_QUERY.format(pg_stat_activity_view=self._config.pg_stat_activity_view)
        with self._get_db().cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
            params = (self._config.dbname,)
            if self._activity_last_query_start:
                query = query + " AND query_start > %s"
                params = params + (self._activity_last_query_start,)
            self._log.debug("Running query [%s] %s", query, params)
            cursor.execute(query, params)
            rows = cursor.fetchall()
        self._check.histogram(
            "dd.postgres.get_new_pg_stat_activity.time", (time.time() - start_time) * 1000, tags=self._tags
        )
        self._check.histogram("dd.postgres.get_new_pg_stat_activity.rows", len(rows), tags=self._tags)
        self._log.debug("Loaded %s rows from %s", len(rows), self._config.pg_stat_activity_view)
        return rows

    def _filter_valid_statement_rows(self, rows):
        insufficient_privilege_count = 0
        total_count = 0
        for row in rows:
            total_count += 1
            if not row['datname']:
                continue
            query = row['query']
            if not query:
                continue
            if query == '<insufficient privilege>':
                insufficient_privilege_count += 1
                continue
            if self._activity_last_query_start is None or row['query_start'] > self._activity_last_query_start:
                self._activity_last_query_start = row['query_start']
            yield row
        if insufficient_privilege_count > 0:
            self._log.warning(
                "Insufficient privilege for %s/%s queries when collecting from %s.",
                insufficient_privilege_count,
                total_count,
                self._config.pg_stat_activity_view,
            )
            self._check.count(
                "dd.postgres.statement_samples.error",
                insufficient_privilege_count,
                tags=self._tags + ["error:insufficient-privilege"],
            )

    def _get_db(self):
        # While psycopg2 is thread-safe (so in theory we could share the parent check's connection), the parent
        # doesn't use autocommit and instead calls commit() and rollback() explicitly, which can cause strange
        # clashes if we use the same connection from another thread. Since the statement sampler runs
        # continuously, it's best to have our own connection here with autocommit enabled.
        if not self._db or self._db.closed:
            self._db = self._check._new_connection()
            self._db.set_session(autocommit=True)
        if self._db.status != psycopg2.extensions.STATUS_READY:
            # Some transaction went wrong and the connection is in an unhealthy state. Let's fix that
            self._db.rollback()
        return self._db

    def _collection_loop(self):
        try:
            self._log.info("Starting statement sampler collection loop")
            while True:
                if self._cancel_event.is_set():
                    self._log.info("Collection loop cancelled")
                    self._check.count("dd.postgres.statement_samples.collection_loop_cancel", 1, tags=self._tags)
                    break
                if time.time() - self._last_check_run > self._config.min_collection_interval * 2:
                    self._log.info("Sampler collection loop stopping due to check inactivity")
                    self._check.count("dd.postgres.statement_samples.collection_loop_inactive_stop", 1, tags=self._tags)
                    break
                self._collect_statement_samples()
        except psycopg2.DatabaseError as e:
            self._log.warning(
                "Statement sampler database error: %s", e, exc_info=self._log.getEffectiveLevel() == logging.DEBUG
            )
            self._check.count(
                "dd.postgres.statement_samples.error",
                1,
                tags=self._tags + ["error:database-{}".format(type(e))],
            )
        except Exception as e:
            self._log.exception("Statement sampler collection loop crash")
            self._check.count(
                "dd.postgres.statement_samples.error",
                1,
                tags=self._tags + ["error:collection-loop-crash-{}".format(type(e))],
            )
        finally:
            self._log.info("Shutting down statement sampler collection loop")
            self._close_db_conn()

    def _close_db_conn(self):
        if self._db and not self._db.closed:
            try:
                self._db.close()
            except Exception:
                self._log.exception("failed to close DB connection")
        self._db = None

    def _collect_statement_samples(self):
        self._rate_limiter.sleep()
        start_time = time.time()
        rows = self._get_new_pg_stat_activity()
        rows = self._filter_valid_statement_rows(rows)
        events = self._explain_pg_stat_activity(rows)
        submitted_count = 0
        for e in events:
            self._check.database_monitoring_query_sample(json.dumps(e, default=default_json_event_encoding))
            submitted_count += 1
        elapsed_ms = (time.time() - start_time) * 1000
        self._check.histogram("dd.postgres.collect_statement_samples.time", elapsed_ms, tags=self._tags)
        self._check.count(
            "dd.postgres.collect_statement_samples.events_submitted.count", submitted_count, tags=self._tags
        )
        self._check.gauge(
            "dd.postgres.collect_statement_samples.seen_samples_cache.len",
            len(self._seen_samples_cache),
            tags=self._tags,
        )
        self._check.gauge(
            "dd.postgres.collect_statement_samples.explained_statements_cache.len",
            len(self._explained_statements_cache),
            tags=self._tags,
        )

    def _can_explain_statement(self, obfuscated_statement):
        if obfuscated_statement.startswith('SELECT {}'.format(self._explain_function)):
            return False
        if obfuscated_statement.startswith('autovacuum:'):
            return False
        if obfuscated_statement.split(' ', 1)[0].lower() not in VALID_EXPLAIN_STATEMENTS:
            return False
        return True

    def _run_explain(self, statement, obfuscated_statement):
        if not self._can_explain_statement(obfuscated_statement):
            return None
        with self._get_db().cursor() as cursor:
            try:
                start_time = time.time()
                self._log.debug("Running query: %s(%s)", self._explain_function, obfuscated_statement)
                cursor.execute(
                    """SELECT {explain_function}($stmt${statement}$stmt$)""".format(
                        explain_function=self._explain_function, statement=statement
                    )
                )
                result = cursor.fetchone()
                self._check.histogram(
                    "dd.postgres.run_explain.time", (time.time() - start_time) * 1000, tags=self._tags
                )
            except psycopg2.errors.UndefinedFunction:
                self._log.warning(
                    "Failed to collect execution plan due to undefined explain_function=%s",
                    self._explain_function,
                )
                self._check.count(
                    "dd.postgres.statement_samples.error", 1, tags=self._tags + ["error:explain-undefined-function"]
                )
                return None
            except Exception as e:
                self._log.debug("Failed to collect execution plan. query='%s'", obfuscated_statement, exc_info=1)
                self._check.count(
                    "dd.postgres.statement_samples.error", 1, tags=self._tags + ["error:explain-{}".format(type(e))]
                )
                return None
        if not result or len(result) < 1 or len(result[0]) < 1:
            return None
        return result[0][0]

    def _collect_plan_for_statement(self, row):
        try:
            obfuscated_statement = datadog_agent.obfuscate_sql(row['query'])
        except Exception as e:
            self._log.debug("Failed to obfuscate statement: %s", e)
            self._check.count("dd.postgres.statement_samples.error", 1, tags=self._tags + ["error:sql-obfuscate"])
            return None

        # limit the rate of explains done to the database
        query_signature = compute_sql_signature(obfuscated_statement)
        if query_signature in self._explained_statements_cache:
            return None
        self._explained_statements_cache[query_signature] = True

        # Plans have several important signatures to tag events with. Note that for postgres, the
        # query_signature and resource_hash will be the same value.
        # - `plan_signature` - hash computed from the normalized JSON plan to group identical plan trees
        # - `resource_hash` - hash computed off the raw sql text to match apm resources
        # - `query_signature` - hash computed from the raw sql text to match query metrics
        plan_dict = self._run_explain(row['query'], obfuscated_statement)
        plan, normalized_plan, obfuscated_plan, plan_signature, plan_cost = None, None, None, None, None
        if plan_dict:
            plan = json.dumps(plan_dict)
            # if we're using the orjson implementation then json.dumps returns bytes
            plan = plan.decode('utf-8') if isinstance(plan, bytes) else plan
            normalized_plan = datadog_agent.obfuscate_sql_exec_plan(plan, normalize=True)
            obfuscated_plan = datadog_agent.obfuscate_sql_exec_plan(plan)
            plan_signature = compute_exec_plan_signature(normalized_plan)
            plan_cost = plan_dict.get('Plan', {}).get('Total Cost', 0.0) or 0.0

        statement_plan_sig = (query_signature, plan_signature)
        if statement_plan_sig not in self._seen_samples_cache:
            self._seen_samples_cache[statement_plan_sig] = True
            event = {
                "host": self._db_hostname,
                "service": self._service,
                "ddsource": "postgres",
                "ddtags": self._tags_str,
                "network": {
                    "client": {
                        "ip": row.get('client_addr', None),
                        "port": row.get('client_port', None),
                        "hostname": row.get('client_hostname', None),
                    }
                },
                "db": {
                    "instance": row.get('datname', None),
                    "plan": {"definition": obfuscated_plan, "cost": plan_cost, "signature": plan_signature},
                    "query_signature": query_signature,
                    "resource_hash": query_signature,
                    "application": row.get('application_name', None),
                    "user": row['usename'],
                    "statement": obfuscated_statement,
                },
                'postgres': {k: v for k, v in row.items() if k not in pg_stat_activity_sample_exclude_keys},
            }
            event['timestamp'] = time.time() * 1000
            if row['state'] in {'idle', 'idle in transaction'}:
                if row['state_change'] and row['query_start']:
                    event['duration'] = (row['state_change'] - row['query_start']).total_seconds() * 1e9
                    # If the transaction is idle then we have a more specific "end time" than the current time at
                    # which we're collecting this event. According to the postgres docs, all of the timestamps in
                    # pg_stat_activity are `timestamp with time zone` so the timezone should always be present. However,
                    # if there is something wrong and it's missing then we can't use `state_change` for the timestamp
                    # of the event else we risk the timestamp being significantly off and the event getting dropped
                    # during ingestion.
                    if row['state_change'].tzinfo:
                        event['timestamp'] = get_timestamp(row['state_change']) * 1000
            return event

    def _explain_pg_stat_activity(self, rows):
        for row in rows:
            try:
                event = self._collect_plan_for_statement(row)
                if event:
                    yield event
            except Exception:
                self._log.exception("Crashed trying to collect execution plan for statement")
                self._check.count(
                    "dd.postgres.statement_samples.error",
                    1,
                    tags=self._tags + ["error:collect-plan-for-statement-crash"],
                )
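
For idle sessions the event duration is derived from pg_stat_activity timestamps rather than measured directly: state_change - query_start, converted to nanoseconds. A worked sketch of that arithmetic with made-up timestamps:

from datetime import datetime, timedelta, timezone

query_start = datetime(2021, 1, 1, 12, 0, 0, tzinfo=timezone.utc)
state_change = query_start + timedelta(milliseconds=250)

# duration is reported in nanoseconds, as in _collect_plan_for_statement above
duration_ns = (state_change - query_start).total_seconds() * 1e9
assert duration_ns == 250_000_000.0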