def test_constant_rate_limiter():
    rate_limit = 8
    test_duration_s = 0.5
    ratelimiter = ConstantRateLimiter(rate_limit)
    start = time.time()
    sleep_count = 0
    while time.time() - start < test_duration_s:
        ratelimiter.sleep()
        sleep_count += 1
    # at 8 iterations/second over 0.5s we expect ~4 sleeps; allow +/-1 for timing jitter
    max_expected_count = rate_limit * test_duration_s
    assert max_expected_count - 1 <= sleep_count <= max_expected_count + 1
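
# For reference, a minimal sketch of the rate limiter contract the test above and both samplers below
# rely on. This is an illustrative, hypothetical re-implementation, not the ConstantRateLimiter that
# ships in datadog_checks.base: sleep() blocks just long enough to keep the caller at roughly
# `rate_limit_s` iterations per second, and the configured rate is readable via the `rate_limit_s`
# attribute (which _collect_statement_samples uses to detect when the rate should change).
import time


class ConstantRateLimiterSketch(object):
    def __init__(self, rate_limit_s):
        # floor the rate at 1 iteration/second so a zero or negative rate can't divide by zero
        self.rate_limit_s = max(rate_limit_s, 1)
        self._period_s = 1.0 / self.rate_limit_s
        self._last_event = 0

    def sleep(self):
        # sleep for whatever remains of the current period, then mark the start of the next one
        elapsed_s = time.time() - self._last_event
        if elapsed_s < self._period_s:
            time.sleep(self._period_s - elapsed_s)
        self._last_event = time.time()
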
class MySQLStatementSamples(object):
    """
    Collects statement samples and execution plans.
    """

    executor = ThreadPoolExecutor()

    def __init__(self, check, config, connection_args):
        self._check = check
        self._version_processed = False
        self._connection_args = connection_args
        # checkpoint at zero so we pull the whole history table on the first run
        self._checkpoint = 0
        self._log = get_check_logger()
        self._last_check_run = 0
        self._db = None
        self._tags = None
        self._tags_str = None
        self._service = "mysql"
        self._collection_loop_future = None
        self._cancel_event = threading.Event()
        self._rate_limiter = ConstantRateLimiter(1)
        self._config = config
        self._db_hostname = resolve_db_host(self._config.host)
        self._enabled = is_affirmative(self._config.statement_samples_config.get('enabled', False))
        self._run_sync = is_affirmative(self._config.statement_samples_config.get('run_sync', False))
        self._collections_per_second = self._config.statement_samples_config.get('collections_per_second', -1)
        self._events_statements_row_limit = self._config.statement_samples_config.get(
            'events_statements_row_limit', 5000
        )
        self._explain_procedure = self._config.statement_samples_config.get('explain_procedure', 'explain_statement')
        self._fully_qualified_explain_procedure = self._config.statement_samples_config.get(
            'fully_qualified_explain_procedure', 'datadog.explain_statement'
        )
        self._events_statements_temp_table = self._config.statement_samples_config.get(
            'events_statements_temp_table_name', 'datadog.temp_events'
        )
        self._events_statements_enable_procedure = self._config.statement_samples_config.get(
            'events_statements_enable_procedure', 'datadog.enable_events_statements_consumers'
        )
        self._preferred_events_statements_tables = EVENTS_STATEMENTS_PREFERRED_TABLES
        self._has_window_functions = False
        events_statements_table = self._config.statement_samples_config.get('events_statements_table', None)
        if events_statements_table:
            if events_statements_table in DEFAULT_EVENTS_STATEMENTS_COLLECTIONS_PER_SECOND:
                self._log.debug("Configured preferred events_statements_table: %s", events_statements_table)
                self._preferred_events_statements_tables = [events_statements_table]
            else:
                self._log.warning(
                    "Invalid events_statements_table: %s. Must be one of %s. Falling back to trying all tables.",
                    events_statements_table,
                    ', '.join(DEFAULT_EVENTS_STATEMENTS_COLLECTIONS_PER_SECOND.keys()),
                )
        self._explain_strategies = {
            'PROCEDURE': self._run_explain_procedure,
            'FQ_PROCEDURE': self._run_fully_qualified_explain_procedure,
            'STATEMENT': self._run_explain,
        }
        self._preferred_explain_strategies = ['PROCEDURE', 'FQ_PROCEDURE', 'STATEMENT']
        self._init_caches()
        self._statement_samples_client = _new_statement_samples_client()

    def _init_caches(self):
        self._collection_strategy_cache = TTLCache(
            maxsize=self._config.statement_samples_config.get('collection_strategy_cache_maxsize', 1000),
            ttl=self._config.statement_samples_config.get('collection_strategy_cache_ttl', 300),
        )

        # explained_statements_cache: limit how often we try to re-explain the same query
        self._explained_statements_cache = TTLCache(
            maxsize=self._config.statement_samples_config.get('explained_statements_cache_maxsize', 5000),
            ttl=60 * 60 / self._config.statement_samples_config.get('explained_statements_per_hour_per_query', 60),
        )

        # seen_samples_cache: limit the ingestion rate per (query_signature, plan_signature)
        self._seen_samples_cache = TTLCache(
            # assuming ~100 bytes per entry (query & plan signature, key hash, 4 pointers (ordered dict), expiry time)
            # total size: 10k * 100 = 1 Mb
            maxsize=self._config.statement_samples_config.get('seen_samples_cache_maxsize', 10000),
            ttl=60 * 60 / self._config.statement_samples_config.get('samples_per_hour_per_query', 15),
        )

    def run_sampler(self, tags):
        """
        Start the sampler thread if not already running & update tag metadata.
        :param tags:
        :return:
        """
        if not self._enabled:
            self._log.debug("Statement sampler not enabled")
            return
        self._tags = tags
        self._tags_str = ','.join(tags)
        for t in self._tags:
            if t.startswith('service:'):
                self._service = t[len('service:'):]
        if not self._version_processed and self._check.version:
            self._has_window_functions = self._check.version.version_compatible((8, 0, 0))
            if self._check.version.flavor == "MariaDB" or not self._check.version.version_compatible((5, 7, 0)):
                self._global_status_table = "information_schema.global_status"
            else:
                self._global_status_table = "performance_schema.global_status"
            self._version_processed = True
        self._last_check_run = time.time()
        if self._run_sync or is_affirmative(os.environ.get('DBM_STATEMENT_SAMPLER_RUN_SYNC', "false")):
            self._log.debug("Running statement sampler synchronously")
            self._collect_statement_samples()
        elif self._collection_loop_future is None or not self._collection_loop_future.running():
            self._collection_loop_future = MySQLStatementSamples.executor.submit(self.collection_loop)
        else:
            self._log.debug("Statement sampler collection loop already running")

    def cancel(self):
        """
        Cancels the collection loop thread if it's running.
        Returns immediately, leaving the thread to stop & clean up on its own time.
        """
        self._cancel_event.set()

    def _get_db_connection(self):
        """
        Lazily (re)connect to the database.
        pymysql connections are not thread safe so we can't reuse the same connection from the main check.
        :return:
        """
        if not self._db:
            self._db = pymysql.connect(**self._connection_args)
        return self._db

    def _close_db_conn(self):
        if self._db:
            try:
                self._db.close()
            except Exception:
                self._log.debug("Failed to close db connection", exc_info=1)
            finally:
                self._db = None

    def collection_loop(self):
        try:
            self._log.info("Starting statement sampler collection loop")
            while True:
                if self._cancel_event.is_set():
                    self._log.info("Collection loop cancelled")
                    self._check.count("dd.mysql.statement_samples.collection_loop_cancel", 1, tags=self._tags)
                    break
                if time.time() - self._last_check_run > self._config.min_collection_interval * 2:
                    self._log.info("Stopping statement sampler collection loop due to check inactivity")
                    self._check.count(
                        "dd.mysql.statement_samples.collection_loop_inactive_stop", 1, tags=self._tags
                    )
                    break
                self._collect_statement_samples()
        except pymysql.err.DatabaseError as e:
            self._log.warning(
                "Statement sampler database error: %s", e, exc_info=self._log.getEffectiveLevel() == logging.DEBUG
            )
            self._check.count(
                "dd.mysql.statement_samples.error",
                1,
                tags=self._tags + ["error:collection-loop-database-error-{}".format(type(e))],
            )
        except Exception as e:
            self._log.exception("Statement sampler collection loop crash")
            self._check.count(
                "dd.mysql.statement_samples.error",
                1,
                tags=self._tags + ["error:collection-loop-crash-{}".format(type(e))],
            )
        finally:
            self._log.info("Shutting down statement sampler collection loop")
            self._close_db_conn()

    def _cursor_run(self, cursor, query, params=None, obfuscated_params=None):
        """
        Run and log the query. If provided, obfuscated params are logged in place of the regular params.
        """
        self._log.debug("Running query [%s] %s", query, obfuscated_params if obfuscated_params else params)
        cursor.execute(query, params)

    def _get_new_events_statements(self, events_statements_table, row_limit):
        # Select the most recent events with a bias towards events which have higher wait times
        start = time.time()
        drop_temp_table_query = "DROP TEMPORARY TABLE IF EXISTS {}".format(self._events_statements_temp_table)
        params = (self._checkpoint, row_limit)
        with closing(self._get_db_connection().cursor(pymysql.cursors.DictCursor)) as cursor:
            # silence expected warnings to avoid spam
            cursor.execute('SET @@SESSION.sql_notes = 0')
            try:
                self._cursor_run(cursor, drop_temp_table_query)
                self._cursor_run(
                    cursor,
                    CREATE_TEMP_TABLE.format(
                        temp_table=self._events_statements_temp_table,
                        statements_table="performance_schema." + events_statements_table,
                    ),
                    params,
                )
            except pymysql.err.DatabaseError as e:
                self._check.count(
                    "dd.mysql.statement_samples.error",
                    1,
                    tags=self._tags + ["error:create-temp-table-{}".format(type(e))],
                )
                raise
            if self._has_window_functions:
                sub_select = SUB_SELECT_EVENTS_WINDOW
            else:
                self._cursor_run(cursor, "set @row_num = 0")
                self._cursor_run(cursor, "set @current_digest = ''")
                sub_select = SUB_SELECT_EVENTS_NUMBERED
            self._cursor_run(
                cursor,
                "set @startup_time_s = {}".format(
                    STARTUP_TIME_SUBQUERY.format(global_status_table=self._global_status_table)
                ),
            )
            self._cursor_run(
                cursor,
                EVENTS_STATEMENTS_QUERY.format(
                    statements_numbered=sub_select.format(statements_table=self._events_statements_temp_table)
                ),
                params,
            )
            rows = cursor.fetchall()
            self._cursor_run(cursor, drop_temp_table_query)
            tags = self._tags + ["events_statements_table:{}".format(events_statements_table)]
            self._check.histogram("dd.mysql.get_new_events_statements.time", (time.time() - start) * 1000, tags=tags)
            self._check.histogram("dd.mysql.get_new_events_statements.rows", len(rows), tags=tags)
            self._log.debug("Read %s rows from %s", len(rows), events_statements_table)
            return rows

    def _filter_valid_statement_rows(self, rows):
        num_sent = 0
        num_truncated = 0
        for row in rows:
            if not row or not all(row):
                self._log.debug('Row was unexpectedly truncated or the events_statements table is not enabled')
                continue
            sql_text = row['sql_text']
            if not sql_text:
                continue
            # The SQL_TEXT column will store 1024 chars by default. Plans cannot be captured on truncated
            # queries, so the `performance_schema_max_sql_text_length` variable must be raised.
            if sql_text[-3:] == '...':
                num_truncated += 1
                continue
            yield row
            # only save the checkpoint for rows that we have successfully processed
            # else rows that we ignore can push the checkpoint forward causing us to miss some on the next run
            if row['timer_start'] > self._checkpoint:
                self._checkpoint = row['timer_start']
            num_sent += 1
        if num_truncated > 0:
            self._log.warning(
                'Unable to collect %d/%d statement samples due to truncated SQL text. Consider raising '
                '`performance_schema_max_sql_text_length` to capture these queries.',
                num_truncated,
                num_truncated + num_sent,
            )
            self._check.count("dd.mysql.statement_samples.error", 1, tags=self._tags + ["error:truncated-sql-text"])

    def _collect_plan_for_statement(self, row):
        # Plans have several important signatures to tag events with:
        # - `plan_signature` - hash computed from the normalized JSON plan to group identical plan trees
        # - `resource_hash` - hash computed off the raw sql text to match apm resources
        # - `query_signature` - hash computed from the digest text to match query metrics
        try:
            obfuscated_statement = datadog_agent.obfuscate_sql(row['sql_text'])
            obfuscated_digest_text = datadog_agent.obfuscate_sql(row['digest_text'])
        except Exception:
            # do not log the raw sql_text to avoid leaking sensitive data into logs. digest_text is safe as
            # parameters are obfuscated by the database
            self._log.debug("Failed to obfuscate statement: %s", row['digest_text'])
            self._check.count("dd.mysql.statement_samples.error", 1, tags=self._tags + ["error:sql-obfuscate"])
            return None

        apm_resource_hash = compute_sql_signature(obfuscated_statement)
        query_signature = compute_sql_signature(obfuscated_digest_text)
        query_cache_key = (row['current_schema'], query_signature)
        if query_cache_key in self._explained_statements_cache:
            return None
        self._explained_statements_cache[query_cache_key] = True

        plan = None
        with closing(self._get_db_connection().cursor()) as cursor:
            try:
                plan = self._explain_statement(cursor, row['sql_text'], row['current_schema'], obfuscated_statement)
            except Exception as e:
                self._check.count(
                    "dd.mysql.statement_samples.error", 1, tags=self._tags + ["error:explain-{}".format(type(e))]
                )
                self._log.exception("Failed to explain statement: %s", obfuscated_statement)

        normalized_plan, obfuscated_plan, plan_signature, plan_cost = None, None, None, None
        if plan:
            normalized_plan = datadog_agent.obfuscate_sql_exec_plan(plan, normalize=True)
            obfuscated_plan = datadog_agent.obfuscate_sql_exec_plan(plan)
            plan_signature = compute_exec_plan_signature(normalized_plan)
            plan_cost = self._parse_execution_plan_cost(plan)

        query_plan_cache_key = (query_cache_key, plan_signature)
        if query_plan_cache_key not in self._seen_samples_cache:
            self._seen_samples_cache[query_plan_cache_key] = True
            return {
                "timestamp": row["timer_end_time_s"] * 1000,
                "host": self._db_hostname,
                "service": self._service,
                "ddsource": "mysql",
                "ddtags": self._tags_str,
                "duration": row['timer_wait_ns'],
                "network": {
                    "client": {
                        "ip": row.get('processlist_host', None),
                    }
                },
                "db": {
                    "instance": row['current_schema'],
                    "plan": {"definition": obfuscated_plan, "cost": plan_cost, "signature": plan_signature},
                    "query_signature": query_signature,
                    "resource_hash": apm_resource_hash,
                    "statement": obfuscated_statement,
                },
                'mysql': {k: v for k, v in row.items() if k not in EVENTS_STATEMENTS_SAMPLE_EXCLUDE_KEYS},
            }

    def _collect_plans_for_statements(self, rows):
        for row in rows:
            try:
                event = self._collect_plan_for_statement(row)
                if event:
                    yield event
            except Exception:
                self._log.debug("Failed to collect plan for statement", exc_info=1)

    def _get_enabled_performance_schema_consumers(self):
        """
        Returns the list of enabled performance schema consumers,
        i.e. (events_statements_current, events_statements_history)
        :return:
        """
        with closing(self._get_db_connection().cursor()) as cursor:
            self._cursor_run(cursor, ENABLED_STATEMENTS_CONSUMERS_QUERY)
            return set([r[0] for r in cursor.fetchall()])

    def _enable_events_statements_consumers(self):
        """
        Enable events statements consumers
        :return:
        """
        try:
            with closing(self._get_db_connection().cursor()) as cursor:
                self._cursor_run(cursor, 'CALL {}()'.format(self._events_statements_enable_procedure))
        except pymysql.err.DatabaseError as e:
            self._log.debug(
                "failed to enable events_statements consumers using procedure=%s: %s",
                self._events_statements_enable_procedure,
                e,
            )

    def _get_sample_collection_strategy(self):
        """
        Decides on the plan collection strategy:
        - which events_statements_* table are we using
        - how long should the rate and time limits be
        :return: (table, rate_limit)
        """
        cached_strategy = self._collection_strategy_cache.get("plan_collection_strategy")
        if cached_strategy:
            self._log.debug("Using cached plan_collection_strategy: %s", cached_strategy)
            return cached_strategy

        enabled_consumers = self._get_enabled_performance_schema_consumers()
        if len(enabled_consumers) < 3:
            self._enable_events_statements_consumers()
            enabled_consumers = self._get_enabled_performance_schema_consumers()

        if not enabled_consumers:
            self._log.warning(
                "Cannot collect statement samples as there are no enabled performance_schema.events_statements_* "
                "consumers. Enable performance_schema and at least one events_statements consumer in order to "
                "collect statement samples."
            )
            self._check.count(
                "dd.mysql.statement_samples.error",
                1,
                tags=self._tags + ["error:no-enabled-events-statements-consumers"],
            )
            return None, None
        self._log.debug("Found enabled performance_schema statements consumers: %s", enabled_consumers)

        events_statements_table = None
        for table in self._preferred_events_statements_tables:
            if table not in enabled_consumers:
                continue
            rows = self._get_new_events_statements(table, 1)
            if not rows:
                self._log.debug("No statements found in %s table. Checking next one.", table)
                continue
            events_statements_table = table
            break
        if not events_statements_table:
            self._log.warning(
                "Cannot collect statement samples as all enabled events_statements_consumers %s are empty.",
                enabled_consumers,
            )
            return None, None

        rate_limit = self._collections_per_second
        if rate_limit < 0:
            rate_limit = DEFAULT_EVENTS_STATEMENTS_COLLECTIONS_PER_SECOND[events_statements_table]

        # cache only successful strategies
        # should be short enough that we'll reflect updates relatively quickly
        # i.e., an aurora replica becomes a master (or vice versa)
        strategy = (events_statements_table, rate_limit)
        self._log.debug(
            "Chose plan collection strategy: events_statements_table=%s, collections_per_second=%s",
            events_statements_table,
            rate_limit,
        )
        self._collection_strategy_cache["plan_collection_strategy"] = strategy
        return strategy

    def _collect_statement_samples(self):
        self._rate_limiter.sleep()
        events_statements_table, rate_limit = self._get_sample_collection_strategy()
        if not events_statements_table:
            return
        if self._rate_limiter.rate_limit_s != rate_limit:
            self._rate_limiter = ConstantRateLimiter(rate_limit)
        start_time = time.time()
        tags = self._tags + ["events_statements_table:{}".format(events_statements_table)]
        rows = self._get_new_events_statements(events_statements_table, self._events_statements_row_limit)
        rows = self._filter_valid_statement_rows(rows)
        events = self._collect_plans_for_statements(rows)
        submitted_count, failed_count = self._statement_samples_client.submit_events(events)
        self._check.count(
            "dd.mysql.statement_samples.error", failed_count, tags=self._tags + ["error:submit-events"]
        )
        self._check.histogram("dd.mysql.collect_statement_samples.time", (time.time() - start_time) * 1000, tags=tags)
        self._check.count("dd.mysql.collect_statement_samples.events_submitted.count", submitted_count, tags=tags)
        self._check.gauge(
            "dd.mysql.collect_statement_samples.seen_samples_cache.len", len(self._seen_samples_cache), tags=tags
        )
        self._check.gauge(
            "dd.mysql.collect_statement_samples.explained_statements_cache.len",
            len(self._explained_statements_cache),
            tags=tags,
        )
        self._check.gauge(
            "dd.mysql.collect_statement_samples.collection_strategy_cache.len",
            len(self._collection_strategy_cache),
            tags=tags,
        )

    def _explain_statement(self, cursor, statement, schema, obfuscated_statement):
        """
        Tries the available methods used to explain a statement for the given schema. If a non-retryable error
        occurs (such as a permissions error), then statements executed under the schema will be disallowed in
        future attempts.
        """
        start_time = time.time()
        strategy_cache_key = 'explain_strategy:%s' % schema
        explain_strategy_error = 'ERROR'
        tags = self._tags + ["schema:{}".format(schema)]

        self._log.debug('explaining statement. schema=%s, statement="%s"', schema, statement)

        if not self._can_explain(obfuscated_statement):
            self._log.debug('Skipping statement which cannot be explained: %s', obfuscated_statement)
            return None

        if self._collection_strategy_cache.get(strategy_cache_key) == explain_strategy_error:
            self._log.debug('Skipping statement due to cached collection failure: %s', obfuscated_statement)
            return None

        try:
            # If there was a default schema when this query was run, then switch to it before trying to collect
            # the execution plan. This is necessary when the statement uses non-fully qualified tables
            # e.g. `select * from mytable` instead of `select * from myschema.mytable`
            if schema:
                self._cursor_run(cursor, 'USE `{}`'.format(schema))
        except pymysql.err.DatabaseError as e:
            if len(e.args) != 2:
                raise
            if e.args[0] in PYMYSQL_NON_RETRYABLE_ERRORS:
                self._collection_strategy_cache[strategy_cache_key] = explain_strategy_error
            self._check.count(
                "dd.mysql.statement_samples.error", 1, tags=tags + ["error:explain-use-schema-{}".format(type(e))]
            )
            self._log.debug(
                'Failed to collect execution plan because schema could not be accessed. error=%s, schema=%s, '
                'statement="%s"',
                e.args,
                schema,
                obfuscated_statement,
            )
            return None

        # Use a cached strategy for the schema, if any, or try each strategy to collect plans
        strategies = list(self._preferred_explain_strategies)
        cached = self._collection_strategy_cache.get(strategy_cache_key)
        if cached:
            strategies.remove(cached)
            strategies.insert(0, cached)

        for strategy in strategies:
            try:
                if not schema and strategy == "PROCEDURE":
                    self._log.debug(
                        'skipping PROCEDURE strategy as there is no default schema for this statement="%s"',
                        obfuscated_statement,
                    )
                    continue
                plan = self._explain_strategies[strategy](cursor, statement, obfuscated_statement)
                if plan:
                    self._collection_strategy_cache[strategy_cache_key] = strategy
                    self._log.debug(
                        'Successfully collected execution plan. strategy=%s, schema=%s, statement="%s"',
                        strategy,
                        schema,
                        obfuscated_statement,
                    )
                    self._check.histogram(
                        "dd.mysql.run_explain.time",
                        (time.time() - start_time) * 1000,
                        tags=self._tags + ["strategy:{}".format(strategy)],
                    )
                    return plan
            except pymysql.err.DatabaseError as e:
                if len(e.args) != 2:
                    raise
                # we don't cache plan collection failures for specific queries at the schema level because some
                # queries in a schema can fail while others succeed. The failed collection will be cached for the
                # specific query so we won't try to explain it again for the cache duration there.
                self._check.count(
                    "dd.mysql.statement_samples.error",
                    1,
                    tags=tags + ["error:explain-attempt-{}-{}".format(strategy, type(e))],
                )
                self._log.debug(
                    'Failed to collect execution plan. error=%s, strategy=%s, schema=%s, statement="%s"',
                    e.args,
                    strategy,
                    schema,
                    obfuscated_statement,
                )
                continue
        return None

    def _run_explain(self, cursor, statement, obfuscated_statement):
        """
        Run the explain using the EXPLAIN statement
        """
        self._log.debug("running query [EXPLAIN FORMAT=json %s]", obfuscated_statement)
        cursor.execute('EXPLAIN FORMAT=json {}'.format(statement))
        return cursor.fetchone()[0]

    def _run_explain_procedure(self, cursor, statement, obfuscated_statement):
        """
        Run the explain by calling the stored procedure if available.
        """
        self._cursor_run(cursor, 'CALL {}(%s)'.format(self._explain_procedure), statement, obfuscated_statement)
        return cursor.fetchone()[0]

    def _run_fully_qualified_explain_procedure(self, cursor, statement, obfuscated_statement):
        """
        Run the explain by calling the fully qualified stored procedure if available.
        """
        self._cursor_run(
            cursor, 'CALL {}(%s)'.format(self._fully_qualified_explain_procedure), statement, obfuscated_statement
        )
        return cursor.fetchone()[0]

    @staticmethod
    def _can_explain(obfuscated_statement):
        return obfuscated_statement.split(' ', 1)[0].lower() in VALID_EXPLAIN_STATEMENTS

    @staticmethod
    def _parse_execution_plan_cost(execution_plan):
        """
        Parses the total cost from the execution plan, if set. If not set, returns cost of 0.
        """
        cost = json.loads(execution_plan).get('query_block', {}).get('cost_info', {}).get('query_cost', 0.0)
        return float(cost or 0.0)
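
# The signature helpers used above (compute_sql_signature, compute_exec_plan_signature) come from the
# shared datadog_checks.base SQL utilities. As a rough, hypothetical sketch of the idea only -- not the
# library's actual algorithm -- both reduce their input to a short, stable hash so that identical
# queries and identical plan trees collapse to the same key in the TTL caches and in the backend.
import hashlib
import json


def _sketch_compute_sql_signature(obfuscated_sql):
    # stable short hash of the obfuscated statement text
    return hashlib.md5(obfuscated_sql.encode('utf-8')).hexdigest()[:16]


def _sketch_compute_exec_plan_signature(normalized_json_plan):
    # re-serialize with sorted keys so semantically identical plans hash identically
    if not normalized_json_plan:
        return None
    canonical = json.dumps(json.loads(normalized_json_plan), sort_keys=True)
    return hashlib.md5(canonical.encode('utf-8')).hexdigest()[:16]
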
class PostgresStatementSamples(object):
    """
    Collects statement samples and execution plans.
    """

    executor = ThreadPoolExecutor()

    def __init__(self, check, config):
        self._check = check
        self._db = None
        self._config = config
        self._log = get_check_logger()
        self._activity_last_query_start = None
        self._last_check_run = 0
        self._collection_loop_future = None
        self._cancel_event = threading.Event()
        self._tags = None
        self._tags_str = None
        self._service = "postgres"
        self._db_hostname = resolve_db_host(self._config.host)
        self._enabled = is_affirmative(self._config.statement_samples_config.get('enabled', False))
        self._run_sync = is_affirmative(self._config.statement_samples_config.get('run_sync', False))
        self._rate_limiter = ConstantRateLimiter(
            float(self._config.statement_samples_config.get('collections_per_second', 1))
        )
        self._explain_function = self._config.statement_samples_config.get(
            'explain_function', 'datadog.explain_statement'
        )

        # explained_statements_cache: limit how often we try to re-explain the same query
        self._explained_statements_cache = TTLCache(
            maxsize=int(self._config.statement_samples_config.get('explained_statements_cache_maxsize', 5000)),
            ttl=60 * 60 / int(self._config.statement_samples_config.get('explained_statements_per_hour_per_query', 60)),
        )

        # seen_samples_cache: limit the ingestion rate per (query_signature, plan_signature)
        self._seen_samples_cache = TTLCache(
            # assuming ~100 bytes per entry (query & plan signature, key hash, 4 pointers (ordered dict), expiry time)
            # total size: 10k * 100 = 1 Mb
            maxsize=int(self._config.statement_samples_config.get('seen_samples_cache_maxsize', 10000)),
            ttl=60 * 60 / int(self._config.statement_samples_config.get('samples_per_hour_per_query', 15)),
        )

    def cancel(self):
        self._cancel_event.set()

    def run_sampler(self, tags):
        """
        Start the sampler thread if not already running.
        :param tags:
        :return:
        """
        if not self._enabled:
            self._log.debug("Statement sampler not enabled")
            return
        self._tags = tags
        self._tags_str = ','.join(self._tags)
        for t in self._tags:
            if t.startswith('service:'):
                self._service = t[len('service:'):]
        self._last_check_run = time.time()
        if self._run_sync or is_affirmative(os.environ.get('DBM_STATEMENT_SAMPLER_RUN_SYNC', "false")):
            self._log.debug("Running statement sampler synchronously")
            self._collect_statement_samples()
        elif self._collection_loop_future is None or not self._collection_loop_future.running():
            self._collection_loop_future = PostgresStatementSamples.executor.submit(self._collection_loop)
        else:
            self._log.debug("Statement sampler collection loop already running")

    def _get_new_pg_stat_activity(self):
        start_time = time.time()
        query = PG_STAT_ACTIVITY_QUERY.format(pg_stat_activity_view=self._config.pg_stat_activity_view)
        with self._get_db().cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
            params = (self._config.dbname,)
            if self._activity_last_query_start:
                query = query + " AND query_start > %s"
                params = params + (self._activity_last_query_start,)
            self._log.debug("Running query [%s] %s", query, params)
            cursor.execute(query, params)
            rows = cursor.fetchall()
        self._check.histogram(
            "dd.postgres.get_new_pg_stat_activity.time", (time.time() - start_time) * 1000, tags=self._tags
        )
        self._check.histogram("dd.postgres.get_new_pg_stat_activity.rows", len(rows), tags=self._tags)
        self._log.debug("Loaded %s rows from %s", len(rows), self._config.pg_stat_activity_view)
        return rows

    def _filter_valid_statement_rows(self, rows):
        insufficient_privilege_count = 0
        total_count = 0
        for row in rows:
            total_count += 1
            if not row['datname']:
                continue
            query = row['query']
            if not query:
                continue
            if query == '<insufficient privilege>':
                insufficient_privilege_count += 1
                continue
            if self._activity_last_query_start is None or row['query_start'] > self._activity_last_query_start:
                self._activity_last_query_start = row['query_start']
            yield row
        if insufficient_privilege_count > 0:
            self._log.warning(
                "Insufficient privilege for %s/%s queries when collecting from %s.",
                insufficient_privilege_count,
                total_count,
                self._config.pg_stat_activity_view,
            )
            self._check.count(
                "dd.postgres.statement_samples.error",
                insufficient_privilege_count,
                tags=self._tags + ["error:insufficient-privilege"],
            )

    def _get_db(self):
        # while psycopg2 is threadsafe (meaning in theory we should be able to use the same connection as the parent
        # check), the parent doesn't use autocommit and instead calls commit() and rollback() explicitly, meaning
        # it can cause strange clashing issues if we're trying to use the same connection from another thread here.
        # since the statement sampler runs continuously it's best we have our own connection here with autocommit
        # enabled
        if not self._db or self._db.closed:
            self._db = self._check._new_connection()
            self._db.set_session(autocommit=True)
        if self._db.status != psycopg2.extensions.STATUS_READY:
            # Some transaction went wrong and the connection is in an unhealthy state. Let's fix that
            self._db.rollback()
        return self._db

    def _collection_loop(self):
        try:
            self._log.info("Starting statement sampler collection loop")
            while True:
                if self._cancel_event.is_set():
                    self._log.info("Collection loop cancelled")
                    self._check.count("dd.postgres.statement_samples.collection_loop_cancel", 1, tags=self._tags)
                    break
                if time.time() - self._last_check_run > self._config.min_collection_interval * 2:
                    self._log.info("Sampler collection loop stopping due to check inactivity")
                    self._check.count(
                        "dd.postgres.statement_samples.collection_loop_inactive_stop", 1, tags=self._tags
                    )
                    break
                self._collect_statement_samples()
        except psycopg2.errors.DatabaseError as e:
            self._log.warning(
                "Statement sampler database error: %s", e, exc_info=self._log.getEffectiveLevel() == logging.DEBUG
            )
            self._check.count(
                "dd.postgres.statement_samples.error",
                1,
                tags=self._tags + ["error:database-{}".format(type(e))],
            )
        except Exception as e:
            self._log.exception("Statement sampler collection loop crash")
            self._check.count(
                "dd.postgres.statement_samples.error",
                1,
                tags=self._tags + ["error:collection-loop-crash-{}".format(type(e))],
            )
        finally:
            self._log.info("Shutting down statement sampler collection loop")
            self._close_db_conn()

    def _close_db_conn(self):
        if self._db and not self._db.closed:
            try:
                self._db.close()
            except Exception:
                self._log.exception("failed to close DB connection")
        self._db = None

    def _collect_statement_samples(self):
        self._rate_limiter.sleep()
        start_time = time.time()
        rows = self._get_new_pg_stat_activity()
        rows = self._filter_valid_statement_rows(rows)
        events = self._explain_pg_stat_activity(rows)
        submitted_count = 0
        for e in events:
            self._check.database_monitoring_query_sample(json.dumps(e, default=default_json_event_encoding))
            submitted_count += 1
        elapsed_ms = (time.time() - start_time) * 1000
        self._check.histogram("dd.postgres.collect_statement_samples.time", elapsed_ms, tags=self._tags)
        self._check.count(
            "dd.postgres.collect_statement_samples.events_submitted.count", submitted_count, tags=self._tags
        )
        self._check.gauge(
            "dd.postgres.collect_statement_samples.seen_samples_cache.len",
            len(self._seen_samples_cache),
            tags=self._tags,
        )
        self._check.gauge(
            "dd.postgres.collect_statement_samples.explained_statements_cache.len",
            len(self._explained_statements_cache),
            tags=self._tags,
        )

    def _can_explain_statement(self, obfuscated_statement):
        if obfuscated_statement.startswith('SELECT {}'.format(self._explain_function)):
            return False
        if obfuscated_statement.startswith('autovacuum:'):
            return False
        if obfuscated_statement.split(' ', 1)[0].lower() not in VALID_EXPLAIN_STATEMENTS:
            return False
        return True

    def _run_explain(self, statement, obfuscated_statement):
        if not self._can_explain_statement(obfuscated_statement):
            return None
        with self._get_db().cursor() as cursor:
            try:
                start_time = time.time()
                self._log.debug("Running query: %s(%s)", self._explain_function, obfuscated_statement)
                cursor.execute(
                    """SELECT {explain_function}($stmt${statement}$stmt$)""".format(
                        explain_function=self._explain_function, statement=statement
                    )
                )
                result = cursor.fetchone()
                self._check.histogram(
                    "dd.postgres.run_explain.time", (time.time() - start_time) * 1000, tags=self._tags
                )
            except psycopg2.errors.UndefinedFunction:
                self._log.warning(
                    "Failed to collect execution plan due to undefined explain_function=%s",
                    self._explain_function,
                )
                self._check.count(
                    "dd.postgres.statement_samples.error", 1, tags=self._tags + ["error:explain-undefined-function"]
                )
                return None
            except Exception as e:
                self._log.debug("Failed to collect execution plan. query='%s'", obfuscated_statement, exc_info=1)
                self._check.count(
                    "dd.postgres.statement_samples.error", 1, tags=self._tags + ["error:explain-{}".format(type(e))]
                )
                return None
        if not result or len(result) < 1 or len(result[0]) < 1:
            return None
        return result[0][0]

    def _collect_plan_for_statement(self, row):
        try:
            obfuscated_statement = datadog_agent.obfuscate_sql(row['query'])
        except Exception as e:
            self._log.debug("Failed to obfuscate statement: %s", e)
            self._check.count("dd.postgres.statement_samples.error", 1, tags=self._tags + ["error:sql-obfuscate"])
            return None

        # limit the rate of explains done to the database
        query_signature = compute_sql_signature(obfuscated_statement)
        if query_signature in self._explained_statements_cache:
            return None
        self._explained_statements_cache[query_signature] = True

        # Plans have several important signatures to tag events with. Note that for postgres, the
        # query_signature and resource_hash will be the same value.
        # - `plan_signature` - hash computed from the normalized JSON plan to group identical plan trees
        # - `resource_hash` - hash computed off the raw sql text to match apm resources
        # - `query_signature` - hash computed from the raw sql text to match query metrics
        plan_dict = self._run_explain(row['query'], obfuscated_statement)
        plan, normalized_plan, obfuscated_plan, plan_signature, plan_cost = None, None, None, None, None
        if plan_dict:
            plan = json.dumps(plan_dict)
            # if we're using the orjson implementation then json.dumps returns bytes
            plan = plan.decode('utf-8') if isinstance(plan, bytes) else plan
            normalized_plan = datadog_agent.obfuscate_sql_exec_plan(plan, normalize=True)
            obfuscated_plan = datadog_agent.obfuscate_sql_exec_plan(plan)
            plan_signature = compute_exec_plan_signature(normalized_plan)
            plan_cost = plan_dict.get('Plan', {}).get('Total Cost', 0.0) or 0.0

        statement_plan_sig = (query_signature, plan_signature)
        if statement_plan_sig not in self._seen_samples_cache:
            self._seen_samples_cache[statement_plan_sig] = True
            event = {
                "host": self._db_hostname,
                "service": self._service,
                "ddsource": "postgres",
                "ddtags": self._tags_str,
                "network": {
                    "client": {
                        "ip": row.get('client_addr', None),
                        "port": row.get('client_port', None),
                        "hostname": row.get('client_hostname', None),
                    }
                },
                "db": {
                    "instance": row.get('datname', None),
                    "plan": {"definition": obfuscated_plan, "cost": plan_cost, "signature": plan_signature},
                    "query_signature": query_signature,
                    "resource_hash": query_signature,
                    "application": row.get('application_name', None),
                    "user": row['usename'],
                    "statement": obfuscated_statement,
                },
                'postgres': {k: v for k, v in row.items() if k not in pg_stat_activity_sample_exclude_keys},
            }
            event['timestamp'] = time.time() * 1000
            if row['state'] in {'idle', 'idle in transaction'}:
                if row['state_change'] and row['query_start']:
                    event['duration'] = (row['state_change'] - row['query_start']).total_seconds() * 1e9
                    # If the transaction is idle then we have a more specific "end time" than the current time at
                    # which we're collecting this event. According to the postgres docs, all of the timestamps in
                    # pg_stat_activity are `timestamp with time zone` so the timezone should always be present.
                    # However, if there is something wrong and it's missing then we can't use `state_change` for the
                    # timestamp of the event else we risk the timestamp being significantly off and the event
                    # getting dropped during ingestion.
                    if row['state_change'].tzinfo:
                        event['timestamp'] = get_timestamp(row['state_change']) * 1000
            return event

    def _explain_pg_stat_activity(self, rows):
        for row in rows:
            try:
                event = self._collect_plan_for_statement(row)
                if event:
                    yield event
            except Exception:
                self._log.exception("Crashed trying to collect execution plan for statement")
                self._check.count(
                    "dd.postgres.statement_samples.error",
                    1,
                    tags=self._tags + ["error:collect-plan-for-statement-crash"],
                )
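
# Usage sketch (hypothetical, simplified): both samplers are driven the same way by their parent check.
# Each check run calls run_sampler(tags), which either collects synchronously (when run_sync or the
# DBM_STATEMENT_SAMPLER_RUN_SYNC env var is set) or submits the background collection loop to the shared
# ThreadPoolExecutor; the loop stops itself once run_sampler hasn't been called for 2x
# min_collection_interval, and cancel() asks it to stop immediately. The check class and method names
# below are illustrative only, not the real integration code.
class _ExampleCheckIntegration(object):
    def __init__(self, check, config, connection_args):
        self._statement_samples = MySQLStatementSamples(check, config, connection_args)

    def check(self, instance_tags):
        # ... regular metric collection happens here ...
        # refresh tags and keep the background sampler alive for another interval
        self._statement_samples.run_sampler(instance_tags)

    def cancel(self):
        # called on shutdown / check unscheduling; returns immediately, and the
        # collection loop notices the cancel event on its next iteration
        self._statement_samples.cancel()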