def get_reports_mock():
    """
    Yield a fixed pair of mock reports, one with a context and one without.

    :rtype: list[LinterEntry]
    """
    ctx = OrderedDict([('foo', 42), ('test', 'bar')])
    yield LinterEntry(linter_type='foo_linter',
                      table_name='table_001',
                      message='Something is fishy here',
                      context=ctx)
    yield LinterEntry(linter_type='bar_linter',
                      table_name='table_042',
                      message='An index is missing')
def check_queries_using_filesort(database, queries):
    """
    Report queries whose EXPLAIN output contains "Using filesort".

    With filesort, MySQL must do an extra pass to find out how to retrieve
    the rows in sorted order: it walks all rows matching the WHERE clause,
    stores the sort key and a row pointer for each, sorts the keys and then
    fetches the rows in order.  As Percona puts it: filesort is badly named -
    anytime a sort cannot be performed from an index, it's a filesort; it has
    nothing to do with files and is quicksort at heart.

    :type database indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    matches = filter_explain_extra(database, queries, check='Using filesort')
    for query, table, ctx in matches:
        yield LinterEntry(linter_type='queries_using_filesort',
                          table_name=table,
                          message='"{}" query used filesort'.format(
                              shorten_query(query)),
                          context=ctx)
def check_missing_primary_index(database):
    """
    Report tables that define neither a primary nor a unique index.

    :type database indexdigest.database.Database
    :rtype: list[LinterEntry]
    """
    for table in database.get_tables():
        # only a primary or unique index counts as a row identifier
        # @see https://bugs.mysql.com/bug.php?id=76252
        # @see https://github.com/Wikia/app/pull/9863
        has_row_identifier = any(
            index.is_primary or index.is_unique
            for index in database.get_table_indices(table)
        )
        if has_row_identifier:
            continue

        ctx = OrderedDict()
        ctx['schema'] = database.get_table_schema(table)

        yield LinterEntry(
            linter_type='missing_primary_index',
            table_name=table,
            message='"{}" table does not have any primary or unique index'.
            format(table),
            context=ctx)
def check_full_table_scan(database, queries):
    """
    Report queries that EXPLAIN resolves with a full table scan.

    A full table scan reads the entire contents of a table rather than just
    selected portions via an index.  It is typical for small lookup tables,
    or for data-warehousing cases where large tables are aggregated and
    analyzed wholesale.  How often these scans happen, and how big the tables
    are relative to available memory, matters for query optimization and
    buffer-pool management.

    :type database indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    for query, table_used, _, row in explain_queries(database, queries):
        # EXPLAIN puts ALL in the "type" column when MySQL resolves
        # the query with a full table scan
        if row['type'] == 'ALL':
            ctx = OrderedDict()
            ctx['query'] = query
            # MariaDB 10.5 returns this value as a string, hence the cast
            ctx['explain_rows'] = int(row['rows'])

            yield LinterEntry(linter_type='queries_using_full_table_scan',
                              table_name=table_used,
                              message='"{}" query triggered full table scan'.
                              format(shorten_query(query)),
                              context=ctx)
def check_generic_primary_key(database):
    """
    Report tables whose primary key's first column bears the generic name.

    :type database indexdigest.database.Database
    :rtype: list[LinterEntry]
    """
    for table_name in database.get_tables():
        # there can be only one primary key - grab it if present
        primary_key = next(
            (index for index in database.get_table_indices(table_name)
             if index.is_primary),
            None)

        # no primary key at all - a different check takes care of that
        if primary_key is None:
            continue

        if primary_key.columns[0] == GENERIC_PRIMARY_KEY:
            yield LinterEntry(
                linter_type='generic_primary_key',
                table_name=table_name,
                message='"{}" has a primary key called id, '
                'use a more meaningful name'.format(table_name),
                context={"schema": database.get_table_schema(table_name)})
def check_not_used_indices(database, queries):
    """
    Report indices on queried tables that none of the provided queries used.

    :type database indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    logger = logging.getLogger(__name__)
    used_indices = defaultdict(list)

    # EXPLAIN each query and record which index it picked per table
    for query, table_used, index_used, _ in explain_queries(database, queries):
        if index_used is None:
            continue
        logger.info("Query <%s> uses %s index on `%s` table",
                    query, index_used, table_used)
        used_indices[table_used].append(index_used)

    # for every table touched above, flag indices never picked by EXPLAIN
    for table_name, indices_seen in used_indices.items():
        for index in database.get_table_indices(table_name):
            if index.name in indices_seen:
                continue
            yield LinterEntry(
                linter_type='not_used_indices',
                table_name=table_name,
                message='"{}" index was not used by provided queries'.
                format(index.name),
                context={"not_used_index": str(index)})
def check_not_used_tables(database, queries):
    """
    Report tables that none of the provided queries touched, with size stats.

    :type database indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    logger = logging.getLogger(__name__)

    # database meta-data first, then the tables the queries actually use
    tables = database.get_tables()
    used_tables = get_used_tables_from_queries(queries)

    logger.info("These tables were used by provided queries: %s", used_tables)

    # report every table the queries never touched
    for table in tables:
        if table in used_tables:
            continue

        metadata = database.get_table_metadata(table)

        ctx = OrderedDict()
        ctx['schema'] = database.get_table_schema(table)
        # data + index footprint, expressed in MiB
        ctx['table_size_mb'] = \
            1. * (metadata['data_size'] + metadata['index_size']) / 1024 / 1024
        ctx['rows_estimated'] = database.get_table_rows_estimate(table)

        yield LinterEntry(
            linter_type='not_used_tables',
            table_name=table,
            message='"{}" table was not used by provided queries'.format(
                table),
            context=ctx)
def check_high_offset_selects(_, queries):
    """
    Report SELECT queries paginating with an offset above the threshold.

    :type _ indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    for query in queries:
        # non-SELECT statements (e.g. INSERTs) are skipped - see #140
        if not is_select_query(query):
            continue

        limit_offset = get_query_limit_and_offset(query)
        if limit_offset is None:
            continue

        limit, offset = limit_offset
        if offset < OFFSET_THRESHOLD:
            continue

        table_name = get_query_tables(query)[0]
        ctx = OrderedDict([
            ('query', query),
            ('limit', limit),
            ('offset', offset),
        ])

        yield LinterEntry(linter_type='high_offset_selects',
                          table_name=table_name,
                          message='"{}" query uses too high offset impacting the performance'.
                          format(shorten_query(query)),
                          context=ctx)
def check_not_used_tables(database, queries):
    """
    Report tables that none of the provided queries touched.

    :type database indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    logger = logging.getLogger(__name__)

    # database meta-data first, then the tables the queries actually use
    all_tables = database.get_tables()
    used_tables = get_used_tables_from_queries(database, queries)

    logger.info("These tables were used by provided queries: %s", used_tables)

    # report every table the queries never touched
    for table in all_tables:
        if table in used_tables:
            continue
        yield LinterEntry(
            linter_type='not_used_tables',
            table_name=table,
            message='"{}" table was not used by provided queries'.format(
                table))
def check_queries_not_using_indices(database, queries):
    """
    Report queries for which EXPLAIN picked no index at all.

    :type database indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    # EXPLAIN "Extra" values that mean no real table access took place;
    # such queries are not considered as missing an index -- see #44
    const_table_extras = (
        'Impossible WHERE noticed after reading const tables',
        'no matching row in const table',
        'No tables used',
    )

    for query, table_used, index_used, explain_row in \
            explain_queries(database, queries):
        if explain_row['Extra'] in const_table_extras:
            continue
        if index_used is not None:
            continue

        ctx = OrderedDict()
        ctx['query'] = query
        # https://dev.mysql.com/doc/refman/5.7/en/explain-output.html#explain-extra-information
        ctx['explain_extra'] = explain_row['Extra']
        ctx['explain_rows'] = explain_row['rows']
        # "filtered" column may be absent from EXPLAIN output
        ctx['explain_filtered'] = explain_row.get('filtered')
        ctx['explain_possible_keys'] = explain_row['possible_keys']

        yield LinterEntry(
            linter_type='queries_not_using_index',
            table_name=table_used,
            message='"{}" query did not make use of any index'.format(
                shorten_query(query)),
            context=ctx)
def get_reports_mock(linter_types):
    """
    Build one mock report per requested linter type.

    :type linter_types list[str]
    :rtype: list[LinterEntry]
    """
    reports = []
    for linter_type in linter_types:
        reports.append(LinterEntry(linter_type=linter_type,
                                   table_name='foo',
                                   message='message'))
    return reports
def get_reports_mock(tables):
    """
    Build one mock report per requested table.

    :type tables list[str]
    :rtype: list[LinterEntry]
    """
    reports = []
    for table in tables:
        reports.append(LinterEntry(linter_type='foo',
                                   table_name=table,
                                   message='message'))
    return reports
def check_empty_database(database):
    """
    Report databases that contain no tables at all.

    :type database indexdigest.database.Database
    :rtype: list[LinterEntry]
    """
    for db_name in get_empty_databases(database):
        msg = '"{}" database has no tables'.format(db_name)
        yield LinterEntry(linter_type='empty_database',
                          table_name=db_name,
                          message=msg)
def check_not_used_columns(database, queries):
    """
    Report table columns that none of the provided SELECT queries referenced.

    :type database indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    :raises Exception
    """
    logger = logging.getLogger(__name__)

    # only SELECT queries from the log are analyzed
    select_queries = [query for query in queries if is_select_query(query)]
    used_tables = get_used_tables_from_queries(select_queries)
    used_columns = defaultdict(list)

    logger.info("Will check these tables: %s", used_tables)

    # collect the columns each query touches, grouped per table
    for query in select_queries:
        tables = get_query_tables(query)
        if not tables:
            logger.error(
                'Unable to extract tables and columns used from the query: %s',
                query)
            continue
        # FIXME: assume we're querying just a single table for now
        used_columns[tables[0]] += get_query_columns(query)

    # compare each table's schema against the columns actually used
    for table in used_tables:
        logger.info("Checking %s table", table)

        table_columns = database.get_table_columns(table)
        if not table_columns:
            continue

        seen = set(used_columns[table])
        for column in table_columns:
            if column.name in seen:
                continue
            yield LinterEntry(
                linter_type='not_used_columns',
                table_name=table,
                message='"{}" column was not used by provided queries'.format(
                    column),
                context={
                    'column_name': column.name,
                    'column_type': column.type
                })
def check_test_tables(database):
    """
    Report tables whose names look like leftovers from testing.

    :type database indexdigest.database.Database
    :rtype: list[LinterEntry]
    """
    matching = list(filter(is_test_table, database.get_tables()))

    for table in matching:
        yield LinterEntry(
            linter_type='test_tables',
            table_name=table,
            message='"{}" seems to be a test table'.format(table),
            context={'schema': database.get_table_schema(table)})
def check_single_column(database):
    """
    Report tables that consist of exactly one column.

    :type database indexdigest.database.Database
    :rtype: list[LinterEntry]
    """
    candidates = []
    for table in database.get_tables():
        if len(database.get_table_columns(table)) == 1:
            candidates.append(table)

    for table in candidates:
        yield LinterEntry(
            linter_type='single_column',
            table_name=table,
            message='"{}" has just a single column'.format(table),
            context={'schema': database.get_table_schema(table)})
def check_data_too_old(database, env=None):
    """
    Report tables whose oldest rows exceed the configured retention window.

    The threshold (in days) is read from the
    INDEX_DIGEST_DATA_TOO_OLD_THRESHOLD_DAYS environment entry and defaults
    to roughly three months.

    :type database indexdigest.database.Database
    :type env dict
    :rtype: list[LinterEntry]
    """
    now = int(time())  # epoch seconds; keeps all the math integer-based

    # threshold is expressed in days; default is ~3 months
    env = env if env else dict()
    threshold_days = int(
        env.get('INDEX_DIGEST_DATA_TOO_OLD_THRESHOLD_DAYS', 3 * 30))

    for table_name, column in get_time_columns(database):
        timestamps = get_boundary_times(database, table_name, column)

        if timestamps is None or timestamps.get('min') is None:
            continue

        age_seconds = now - timestamps.get('min')
        if age_seconds <= threshold_days * 86400:
            continue

        age_days = int(age_seconds / 86400)
        metadata = database.get_table_metadata(table_name)

        ctx = OrderedDict()
        ctx['diff_days'] = age_days
        ctx['data_since'] = str(datetime.fromtimestamp(timestamps.get('min')))
        ctx['data_until'] = str(datetime.fromtimestamp(timestamps.get('max')))
        ctx['date_column_name'] = str(column)
        ctx['schema'] = database.get_table_schema(table_name)
        ctx['rows'] = database.get_table_rows_estimate(table_name)
        # data + index footprint, expressed in MiB
        ctx['table_size_mb'] = \
            1. * (metadata['data_size'] + metadata['index_size']) / 1024 / 1024

        yield LinterEntry(linter_type='data_too_old',
                          table_name=table_name,
                          message='"{}" has rows added {} days ago, '
                          'consider changing retention policy'.format(
                              table_name, age_days),
                          context=ctx)
def check_having_clause(_, queries):
    """
    Report queries that use a HAVING clause.

    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    for query in queries:
        if not query_has_having_clause(query):
            continue

        table_name = get_query_tables(query)[0]
        yield LinterEntry(linter_type='having_clause',
                          table_name=table_name,
                          message='"{}" query uses HAVING clause'.format(
                              shorten_query(query)),
                          context={"query": query})
def check_select_star(_, queries):
    """
    Report queries that select all columns with a wildcard.

    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    for query in queries:
        if not is_wildcard_query(query):
            continue

        table_name = get_query_tables(query)[0]
        yield LinterEntry(linter_type='select_star',
                          table_name=table_name,
                          message='"{}" query uses SELECT *'.format(
                              shorten_query(query)),
                          context={"query": query})
def check_insert_ignore_queries(database, queries):
    """
    Report INSERT IGNORE queries, which silently swallow insertion errors.

    :type database indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    matching = [query for query in queries if is_insert_ignore_query(query)]

    for query in matching:
        table_used = get_query_tables(query)[0]

        ctx = OrderedDict([
            ('query', query),
            ('schema', database.get_table_schema(table_used)),
        ])

        yield LinterEntry(linter_type='insert_ignore',
                          table_name=table_used,
                          message='"{}" query uses a risky INSERT IGNORE'.
                          format(shorten_query(query)),
                          context=ctx)
def check_selects_with_like(database, queries):
    """
    Report index-less queries using LIKE with a left-most wildcard.

    Such patterns (e.g. LIKE '%foo') cannot use an index on the column.

    :type database indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    for query, table_used, index_used, explain_row in \
            explain_queries(database, queries):
        # only queries that both skipped an index AND use a left-most
        # wildcard are reported (checked in this order, as originally)
        if index_used is not None:
            continue
        if not query_uses_leftmost_like(query):
            continue

        ctx = OrderedDict()
        ctx['query'] = query
        # https://dev.mysql.com/doc/refman/5.7/en/explain-output.html#explain-extra-information
        ctx['explain_extra'] = explain_row['Extra']
        ctx['explain_rows'] = explain_row['rows']

        yield LinterEntry(linter_type='selects_with_like',
                          table_name=table_used,
                          message='"{}" query uses LIKE with left-most wildcard'.
                          format(shorten_query(query)),
                          context=ctx)
def check_queries_using_temporary(database, queries):
    """
    Report queries whose EXPLAIN output contains "Using temporary".

    MySQL needs to create a temporary table to hold the result, typically
    when the query lists columns differently in its GROUP BY and ORDER BY
    clauses.

    :type database indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    matches = filter_explain_extra(database, queries, check='Using temporary')
    for query, table, ctx in matches:
        yield LinterEntry(linter_type='queries_using_temporary',
                          table_name=table,
                          message='"{}" query used temporary'.format(
                              shorten_query(query)),
                          context=ctx)
def check_redundant_indices(database):
    """
    Report indices that are fully covered by another index on the same table.

    :type database indexdigest.database.Database
    :rtype: list[LinterEntry]
    """
    logger = logging.getLogger(__name__)

    for table in database.get_tables():
        logger.info("Checking %s table", table)

        indices = database.get_table_indices(table)
        meta = database.get_table_metadata(table)
        schema = database.get_table_schema(table)

        already_reported = set()

        for redundant_index, suggested_index in get_redundant_indices(indices):
            # never suggest an index we have already flagged as redundant - #48
            if suggested_index in already_reported:
                continue

            ctx = OrderedDict()
            ctx['redundant'] = redundant_index
            ctx['covered_by'] = suggested_index
            ctx['schema'] = schema
            ctx['table_data_size_mb'] = 1. * meta['data_size'] / 1024 / 1024
            ctx['table_index_size_mb'] = 1. * meta['index_size'] / 1024 / 1024

            # remember it so a pair of mutually-redundant indices is not
            # reported in a loop - #48
            already_reported.add(redundant_index)

            yield LinterEntry(
                linter_type='redundant_indices',
                table_name=table,
                message=
                '"{}" index can be removed as redundant (covered by "{}")'.
                format(redundant_index.name, suggested_index.name),
                context=ctx)
def check_use_innodb(database):
    """
    Report tables stored with an engine other than InnoDB.

    :type database indexdigest.database.Database
    :rtype: list[LinterEntry]
    """
    # information_schema column names are uppercase in MySQL 8.0
    rows = database.query_dict_rows(
        "SELECT TABLE_NAME, ENGINE FROM information_schema.tables "
        "WHERE ENGINE <> 'InnoDB' and TABLE_SCHEMA = '{}'".format(
            database.db_name))

    for row in rows:
        ctx = OrderedDict()
        ctx['schema'] = database.get_table_schema(row['TABLE_NAME'])
        ctx['engine'] = row['ENGINE']

        yield LinterEntry(
            linter_type='use_innodb',
            table_name=row['TABLE_NAME'],
            message='"{TABLE_NAME}" uses {ENGINE} storage engine'.format(
                **row),
            context=ctx)
def check_latin_columns(database):
    """
    Report text columns that use a latin (non-UTF) character set.

    :type database indexdigest.database.Database
    :rtype: list[LinterEntry]
    """
    for table in database.get_tables():
        for column in database.get_table_columns(table):
            if is_text_column_latin(column):
                ctx = OrderedDict()
                ctx['column'] = column.name
                ctx['column_character_set'] = column.character_set
                ctx['column_collation'] = column.collation
                ctx['schema'] = database.get_table_schema(table)

                yield LinterEntry(
                    linter_type='non_utf_columns',
                    table_name=table,
                    message='"{}" text column has "{}" character set defined'.
                    format(column.name, column.character_set),
                    context=ctx)
def check_low_cardinality_index(database):
    """
    Report low-cardinality indices where even the rarest value is common.

    For each low-cardinality index candidate, count how often the least
    frequent indexed value appears.  If even that value covers a large share
    of the table's rows, the index is unlikely to be selective enough and
    its need should be reviewed.

    The least frequent value should be used in up to 20% of rows:
    https://www.percona.com/blog/2007/08/28/do-you-always-need-index-on-where-column/

    :type database indexdigest.database.Database
    :rtype: list[LinterEntry]
    """
    for table_name, rows_count, index in get_low_cardinality_indices(database):
        # find the least frequent value of the indexed column
        # NOTE: identifiers come from information_schema (not user input),
        # hence the string formatting of the query is acceptable here
        row = database.query_dict_row(
            'SELECT {column} AS value, COUNT(*) AS cnt FROM `{table}` '
            'GROUP BY 1 ORDER BY 2 ASC LIMIT 1'.format(
                column=index['COLUMN_NAME'], table=index['TABLE_NAME']))

        # share (in percent) of rows holding the rarest value
        value_usage = 100. * row['cnt'] / rows_count

        # the least frequent value is quite rare - it makes sense
        # to keep an index here
        if value_usage < INDEX_VALUE_PERCENTAGE_THRESHOLD:
            continue

        context = OrderedDict()
        context['column_name'] = index['COLUMN_NAME']
        context['index_name'] = index['INDEX_NAME']
        context['index_cardinality'] = int(index['CARDINALITY'])
        context['schema'] = database.get_table_schema(table_name)
        context['value_usage'] = value_usage

        yield LinterEntry(
            linter_type='low_cardinality_index',
            table_name=table_name,
            message='"{}" index on "{}" column has low cardinality, '
            'check if it is needed'.format(index['INDEX_NAME'],
                                           index['COLUMN_NAME']),
            context=context)