Пример #1
0
    def get_reports_mock():
        """
        Yield two hard-coded LinterEntry reports, used as test fixtures.

        :rtype: list[LinterEntry]
        """
        # the first report carries an ordered context payload
        ctx = OrderedDict([('foo', 42), ('test', 'bar')])

        yield LinterEntry(linter_type='foo_linter',
                          table_name='table_001',
                          message='Something is fishy here',
                          context=ctx)

        # the second report has no context at all
        yield LinterEntry(linter_type='bar_linter',
                          table_name='table_042',
                          message='An index is missing')
def check_queries_using_filesort(database, queries):
    """
    Report queries whose EXPLAIN "Extra" column contains "Using filesort".

    "Using filesort" means MySQL must do an extra pass to retrieve the rows in
    sorted order: it walks all rows matching the WHERE clause, stores the sort
    key plus a row pointer, sorts the keys and then fetches rows in order.

    As Percona puts it: filesort is badly named - any sort that cannot be
    served from an index is a "filesort"; it has nothing to do with files and
    is quicksort at heart.

    :type database  indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    matching = filter_explain_extra(database, queries, check='Using filesort')

    for query, table, ctx in matching:
        report = '"{}" query used filesort'.format(shorten_query(query))
        yield LinterEntry(linter_type='queries_using_filesort',
                          table_name=table,
                          message=report,
                          context=ctx)
def check_missing_primary_index(database):
    """
    Report tables that have neither a primary nor a unique index.

    :type database  indexdigest.database.Database
    :rtype: list[LinterEntry]
    """
    for table in database.get_tables():
        # does the table have at least one primary or unique index?
        # @see https://bugs.mysql.com/bug.php?id=76252
        # @see https://github.com/Wikia/app/pull/9863
        has_key = any(
            index.is_primary or index.is_unique
            for index in database.get_table_indices(table)
        )

        # nothing to report for this table
        if has_key:
            continue

        context = OrderedDict()
        context['schema'] = database.get_table_schema(table)

        yield LinterEntry(
            linter_type='missing_primary_index',
            table_name=table,
            message='"{}" table does not have any primary or unique index'.format(table),
            context=context)
def check_full_table_scan(database, queries):
    """
    Report queries that trigger a full table scan.

    A full table scan reads the entire contents of a table rather than
    selected portions via an index. It is typical for small lookup tables or
    data-warehousing aggregations over large tables; how often it happens, and
    how table sizes compare to available memory, matters for query
    optimization and buffer pool management.

    :type database  indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    for query, table_used, _, row in explain_queries(database, queries):
        # EXPLAIN reports ALL in the "type" column when MySQL resolves
        # the query with a full table scan
        if row['type'] == 'ALL':
            context = OrderedDict()
            context['query'] = query
            # MariaDB 10.5 returns this value as a string - normalize to int
            context['explain_rows'] = int(row['rows'])

            yield LinterEntry(linter_type='queries_using_full_table_scan',
                              table_name=table_used,
                              message='"{}" query triggered full table scan'.format(
                                  shorten_query(query)),
                              context=context)
Пример #5
0
def check_generic_primary_key(database):
    """
    Report tables whose primary key starts with the generic column name.

    :type database  indexdigest.database.Database
    :rtype: list[LinterEntry]
    """
    for table in database.get_tables():
        primary_keys = [
            idx for idx in database.get_table_indices(table) if idx.is_primary
        ]

        # tables without a primary key are covered by a separate check
        if not primary_keys:
            continue

        # MySQL allows just one primary key per table - take the first entry
        primary = primary_keys[0]

        if primary.columns[0] == GENERIC_PRIMARY_KEY:
            yield LinterEntry(
                linter_type='generic_primary_key',
                table_name=table,
                message='"{}" has a primary key called id, '
                'use a more meaningful name'.format(table),
                context={"schema": database.get_table_schema(table)})
Пример #6
0
def check_not_used_indices(database, queries):
    """
    Report indices that none of the provided queries made use of.

    :type database  indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    logger = logging.getLogger(__name__)

    # collect, per table, the names of indices that EXPLAIN says were used
    used_indices = defaultdict(list)

    for query, table_used, index_used, _ in explain_queries(database, queries):
        if index_used is None:
            continue
        logger.info("Query <%s> uses %s index on `%s` table", query,
                    index_used, table_used)
        used_indices[table_used].append(index_used)

    # walk every index of every table touched by the queries above
    for table_name, indices_seen in used_indices.items():
        for index in database.get_table_indices(table_name):
            if index.name in indices_seen:
                continue

            yield LinterEntry(
                linter_type='not_used_indices',
                table_name=table_name,
                message='"{}" index was not used by provided queries'.format(index.name),
                context={"not_used_index": str(index)})
def check_not_used_tables(database, queries):
    """
    Report tables that none of the provided queries touched.

    :type database  indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    logger = logging.getLogger(__name__)

    # all tables known to the database
    all_tables = database.get_tables()

    # only SELECT queries from the log contribute to the "used" set
    used_tables = get_used_tables_from_queries(queries)
    logger.info("These tables were used by provided queries: %s", used_tables)

    # report every table that no query referenced
    for table in all_tables:
        if table in used_tables:
            continue

        metadata = database.get_table_metadata(table)

        context = OrderedDict()
        context['schema'] = database.get_table_schema(table)
        # combined data + index size, in megabytes
        context['table_size_mb'] = \
            1. * (metadata['data_size'] + metadata['index_size']) / 1024 / 1024
        context['rows_estimated'] = database.get_table_rows_estimate(table)

        yield LinterEntry(
            linter_type='not_used_tables',
            table_name=table,
            message='"{}" table was not used by provided queries'.format(table),
            context=context)
def check_high_offset_selects(_, queries):
    """
    Report SELECT queries that use an OFFSET above the threshold.

    :type _ indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    for query in queries:
        # skip anything that is not a SELECT, e.g. INSERTs (#140)
        if not is_select_query(query):
            continue

        limit_and_offset = get_query_limit_and_offset(query)
        if limit_and_offset is None:
            continue

        limit, offset = limit_and_offset
        if offset < OFFSET_THRESHOLD:
            continue

        context = OrderedDict()
        context['query'] = query
        context['limit'] = limit
        context['offset'] = offset

        yield LinterEntry(linter_type='high_offset_selects',
                          table_name=get_query_tables(query)[0],
                          message='"{}" query uses too high offset impacting the performance'.
                          format(shorten_query(query)),
                          context=context)
def check_not_used_tables(database, queries):
    """
    Report tables that none of the provided queries touched.

    :type database  indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    logger = logging.getLogger(__name__)

    # all tables known to the database
    all_tables = database.get_tables()

    # only SELECT queries from the log contribute to the "used" set
    used_tables = get_used_tables_from_queries(database, queries)
    logger.info("These tables were used by provided queries: %s", used_tables)

    # report every table that no query referenced
    for table in all_tables:
        if table in used_tables:
            continue

        yield LinterEntry(
            linter_type='not_used_tables',
            table_name=table,
            message='"{}" table was not used by provided queries'.format(table))
def check_queries_not_using_indices(database, queries):
    """
    Report queries for which EXPLAIN shows no index being used.

    :type database  indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    # EXPLAIN "Extra" values meaning that no rows were actually read;
    # such queries are not considered as "not using an index" - see #44
    harmless_extras = (
        'Impossible WHERE noticed after reading const tables',
        'no matching row in const table',
        'No tables used',
    )

    for query, table_used, index_used, explain_row in explain_queries(database, queries):
        if explain_row['Extra'] in harmless_extras:
            continue

        if index_used is not None:
            continue

        context = OrderedDict()
        context['query'] = query

        # https://dev.mysql.com/doc/refman/5.7/en/explain-output.html#explain-extra-information
        context['explain_extra'] = explain_row['Extra']
        context['explain_rows'] = explain_row['rows']
        context['explain_filtered'] = explain_row.get('filtered')  # can be not set
        context['explain_possible_keys'] = explain_row['possible_keys']

        yield LinterEntry(
            linter_type='queries_not_using_index',
            table_name=table_used,
            message='"{}" query did not make use of any index'.format(
                shorten_query(query)),
            context=context)
Пример #11
0
 def get_reports_mock(linter_types):
     """
     Build one mocked report per given linter type.

     :type linter_types list[str]
     :rtype: list[LinterEntry]
     """
     reports = []
     for linter_type in linter_types:
         reports.append(
             LinterEntry(linter_type=linter_type, table_name='foo', message='message'))
     return reports
Пример #12
0
 def get_reports_mock(tables):
     """
     Build one mocked "foo" report per given table.

     :type tables list[str]
     :rtype: list[LinterEntry]
     """
     reports = []
     for table in tables:
         reports.append(
             LinterEntry(linter_type='foo', table_name=table, message='message'))
     return reports
def check_empty_database(database):
    """
    Report databases that contain no tables at all.

    :type database  indexdigest.database.Database
    :rtype: list[LinterEntry]
    """
    for db_name in get_empty_databases(database):
        report = '"{}" database has no tables'.format(db_name)
        yield LinterEntry(linter_type='empty_database',
                          table_name=db_name,
                          message=report)
def check_not_used_columns(database, queries):
    """
    Report columns that none of the provided SELECT queries referenced.

    :type database  indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    :raises Exception
    """
    logger = logging.getLogger(__name__)

    # only SELECT queries from the log are analyzed
    select_queries = [query for query in queries if is_select_query(query)]

    used_tables = get_used_tables_from_queries(select_queries)
    used_columns = defaultdict(list)

    logger.info("Will check these tables: %s", used_tables)

    # collect the columns each query refers to, grouped per table
    for query in select_queries:
        tables = get_query_tables(query)

        if not tables:
            logger.error(
                'Unable to extract tables and columns used from the query: %s',
                query)
            continue

        # FIXME: assume we're querying just a single table for now
        used_columns[tables[0]] += get_query_columns(query)

    # compare each table's schema against the columns actually used
    for table in used_tables:
        logger.info("Checking %s table", table)
        table_columns = database.get_table_columns(table)

        if not table_columns:
            continue

        seen = set(used_columns[table])

        for column in table_columns:
            if column.name in seen:
                continue

            yield LinterEntry(
                linter_type='not_used_columns',
                table_name=table,
                message='"{}" column was not used by provided queries'.format(column),
                context={
                    'column_name': column.name,
                    'column_type': column.type
                })
Пример #15
0
def check_test_tables(database):
    """
    Report tables whose names suggest they are test leftovers.

    :type database  indexdigest.database.Database
    :rtype: list[LinterEntry]
    """
    suspects = [table for table in database.get_tables() if is_test_table(table)]

    for table in suspects:
        yield LinterEntry(
            linter_type='test_tables',
            table_name=table,
            message='"{}" seems to be a test table'.format(table),
            context={'schema': database.get_table_schema(table)})
Пример #16
0
def check_single_column(database):
    """
    Report tables that consist of a single column only.

    :type database  indexdigest.database.Database
    :rtype: list[LinterEntry]
    """
    candidates = []
    for table in database.get_tables():
        columns = database.get_table_columns(table)
        if len(columns) == 1:
            candidates.append(table)

    for table in candidates:
        yield LinterEntry(
            linter_type='single_column',
            table_name=table,
            message='"{}" has just a single column'.format(table),
            context={'schema': database.get_table_schema(table)})
def check_data_too_old(database, env=None):
    """
    Report tables that hold rows older than the configured retention threshold.

    :type database  indexdigest.database.Database
    :type env dict
    :rtype: list[LinterEntry]
    """
    now = int(time())

    # threshold in days, overridable via the environment (defaults to ~3 months)
    env = env or dict()
    diff_threshold = int(
        env.get('INDEX_DIGEST_DATA_TOO_OLD_THRESHOLD_DAYS', 3 * 30))

    for table_name, column in get_time_columns(database):
        timestamps = get_boundary_times(database, table_name, column)

        # skip tables with no usable time boundary
        if timestamps is None or timestamps.get('min') is None:
            continue

        age = now - timestamps.get('min')

        # within retention - nothing to report
        if age <= diff_threshold * 86400:
            continue

        age_days = int(age / 86400)
        metadata = database.get_table_metadata(table_name)

        context = OrderedDict()
        context['diff_days'] = age_days
        context['data_since'] = str(datetime.fromtimestamp(timestamps.get('min')))
        context['data_until'] = str(datetime.fromtimestamp(timestamps.get('max')))
        context['date_column_name'] = str(column)
        context['schema'] = database.get_table_schema(table_name)
        context['rows'] = database.get_table_rows_estimate(table_name)
        # combined data + index size, in megabytes
        context['table_size_mb'] = \
            1. * (metadata['data_size'] + metadata['index_size']) / 1024 / 1024

        yield LinterEntry(linter_type='data_too_old',
                          table_name=table_name,
                          message='"{}" has rows added {} days ago, '
                          'consider changing retention policy'.format(
                              table_name, age_days),
                          context=context)
Пример #18
0
def check_having_clause(_, queries):
    """
    Report queries that use a HAVING clause.

    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    for query in queries:
        if not query_has_having_clause(query):
            continue

        yield LinterEntry(linter_type='having_clause',
                          table_name=get_query_tables(query)[0],
                          message='"{}" query uses HAVING clause'.format(
                              shorten_query(query)),
                          context={"query": query})
def check_select_star(_, queries):
    """
    Report queries that use a SELECT * wildcard.

    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    for query in queries:
        if not is_wildcard_query(query):
            continue

        yield LinterEntry(linter_type='select_star',
                          table_name=get_query_tables(query)[0],
                          message='"{}" query uses SELECT *'.format(
                              shorten_query(query)),
                          context={"query": query})
Пример #20
0
def check_insert_ignore_queries(database, queries):
    """
    Report risky INSERT IGNORE queries.

    :type database  indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    for query in filter(is_insert_ignore_query, queries):
        table = get_query_tables(query)[0]

        context = OrderedDict([
            ('query', query),
            ('schema', database.get_table_schema(table)),
        ])

        yield LinterEntry(linter_type='insert_ignore', table_name=table,
                          message='"{}" query uses a risky INSERT IGNORE'.format(
                              shorten_query(query)),
                          context=context)
def check_selects_with_like(database, queries):
    """
    Report un-indexed queries that use LIKE with a left-most wildcard.

    :type database  indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    for query, table_used, index_used, explain_row in explain_queries(database, queries):
        # only check the LIKE pattern when no index was used
        if index_used is not None:
            continue
        if not query_uses_leftmost_like(query):
            continue

        context = OrderedDict()
        context['query'] = query

        # https://dev.mysql.com/doc/refman/5.7/en/explain-output.html#explain-extra-information
        context['explain_extra'] = explain_row['Extra']
        context['explain_rows'] = explain_row['rows']

        yield LinterEntry(linter_type='selects_with_like', table_name=table_used,
                          message='"{}" query uses LIKE with left-most wildcard'.format(
                              shorten_query(query)),
                          context=context)
def check_queries_using_temporary(database, queries):
    """
    Report queries whose EXPLAIN "Extra" column contains "Using temporary".

    "Using temporary" means MySQL needs a temporary table to hold the result;
    this typically happens when GROUP BY and ORDER BY clauses list columns
    differently.

    :type database  indexdigest.database.Database
    :type queries list[str]
    :rtype: list[LinterEntry]
    """
    matching = filter_explain_extra(database, queries, check='Using temporary')

    for query, table, ctx in matching:
        report = '"{}" query used temporary'.format(shorten_query(query))
        yield LinterEntry(linter_type='queries_using_temporary',
                          table_name=table,
                          message=report,
                          context=ctx)
def check_redundant_indices(database):
    """
    Report indices that are fully covered by another index on the same table.

    :type database  indexdigest.database.Database
    :rtype: list[LinterEntry]
    """
    logger = logging.getLogger(__name__)

    for table in database.get_tables():
        logger.info("Checking %s table", table)

        indices = database.get_table_indices(table)
        meta = database.get_table_metadata(table)
        schema = database.get_table_schema(table)

        # indices already flagged as redundant - used to break report cycles (#48)
        reported = set()

        for redundant_index, suggested_index in get_redundant_indices(indices):
            # never suggest keeping an index we already reported as redundant - #48
            if suggested_index in reported:
                continue

            reported.add(redundant_index)

            context = OrderedDict()
            context['redundant'] = redundant_index
            context['covered_by'] = suggested_index
            context['schema'] = schema
            context['table_data_size_mb'] = 1. * meta['data_size'] / 1024 / 1024
            context['table_index_size_mb'] = 1. * meta['index_size'] / 1024 / 1024

            yield LinterEntry(
                linter_type='redundant_indices',
                table_name=table,
                message='"{}" index can be removed as redundant (covered by "{}")'.format(
                    redundant_index.name, suggested_index.name),
                context=context)
Пример #24
0
def check_use_innodb(database):
    """
    Report tables that do not use the InnoDB storage engine.

    :type database  indexdigest.database.Database
    :rtype: list[LinterEntry]
    """
    # note: in MySQL 8.0 information_schema tables columns are uppercase
    rows = database.query_dict_rows(
        "SELECT TABLE_NAME, ENGINE FROM information_schema.tables "
        "WHERE ENGINE <> 'InnoDB' and TABLE_SCHEMA = '{}'".format(
            database.db_name))

    for row in rows:
        context = OrderedDict()
        context['schema'] = database.get_table_schema(row['TABLE_NAME'])
        context['engine'] = row['ENGINE']

        yield LinterEntry(
            linter_type='use_innodb',
            table_name=row['TABLE_NAME'],
            message='"{TABLE_NAME}" uses {ENGINE} storage engine'.format(**row),
            context=context)
Пример #25
0
def check_latin_columns(database):
    """
    Report text columns that use a latin character set instead of UTF.

    :type database  indexdigest.database.Database
    :rtype: list[LinterEntry]
    """
    for table in database.get_tables():
        latin_columns = (
            column for column in database.get_table_columns(table)
            if is_text_column_latin(column)
        )

        for column in latin_columns:
            context = OrderedDict()
            context['column'] = column.name
            context['column_character_set'] = column.character_set
            context['column_collation'] = column.collation
            context['schema'] = database.get_table_schema(table)

            yield LinterEntry(
                linter_type='non_utf_columns',
                table_name=table,
                message='"{}" text column has "{}" character set defined'.format(
                    column.name, column.character_set),
                context=context)
Пример #26
0
def check_low_cardinality_index(database):
    """
    Report indices with low cardinality, i.e. indices on columns where even
    the least frequent value is still very common in the table.

    :type database  indexdigest.database.Database
    :rtype: list[LinterEntry]
    """
    logger = logging.getLogger(__name__)

    for table_name, rows_count, index in get_low_cardinality_indices(database):
        # the least frequent value should be used in up to 20% of rows
        # https://www.percona.com/blog/2007/08/28/do-you-always-need-index-on-where-column/
        row = database.query_dict_row(
            'SELECT {column} AS value, COUNT(*) AS cnt FROM `{table}` '
            'GROUP BY 1 ORDER BY 2 ASC LIMIT 1'.format(
                column=index['COLUMN_NAME'], table=index['TABLE_NAME']))

        # how often (in percent) the least frequent value is used
        value_usage = 100. * row['cnt'] / rows_count

        # the least frequent value is quite rare - it makes sense to have an index here
        if value_usage < INDEX_VALUE_PERCENTAGE_THRESHOLD:
            continue

        # fixed: leftover debug print() replaced with the module logger used
        # by the other checks - a library should not write to stdout
        logger.info("%s table: %s index - least frequent value used in %.2f%% of rows",
                    table_name, index['INDEX_NAME'], value_usage)

        context = OrderedDict()
        context['column_name'] = index['COLUMN_NAME']
        context['index_name'] = index['INDEX_NAME']
        context['index_cardinality'] = int(index['CARDINALITY'])
        context['schema'] = database.get_table_schema(table_name)
        context['value_usage'] = value_usage

        yield LinterEntry(
            linter_type='low_cardinality_index',
            table_name=table_name,
            message='"{}" index on "{}" column has low cardinality, '
            'check if it is needed'.format(index['INDEX_NAME'],
                                           index['COLUMN_NAME']),
            context=context)